commit f7c90c50f3

    Merged /lucene/dev/trunk:r1430124-1432061

    git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1432065 13f79535-47bb-0310-9956-ffa450edef68
@@ -281,18 +281,9 @@
       <copy todir="${fakeRelease}/lucene">
         <fileset dir="lucene/dist"/>
       </copy>
-      <copy todir="${fakeRelease}/lucene/changes">
-        <fileset dir="lucene/build/docs/changes"/>
-      </copy>
-      <get src="http://people.apache.org/keys/group/lucene.asc"
-           dest="${fakeRelease}/lucene/KEYS"/>
       <copy todir="${fakeRelease}/solr">
         <fileset dir="solr/package"/>
       </copy>
-      <copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
-      <copy todir="${fakeRelease}/solr/changes">
-        <fileset dir="solr/build/docs/changes"/>
-      </copy>
       <makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
       <exec executable="${python32.exe}" failonerror="true">
         <arg value="-u"/>
@@ -106,7 +106,15 @@ Changes in backwards compatibility policy
 
 * LUCENE-4659: Massive cleanup to CategoryPath API. Additionally, CategoryPath is
   now immutable, so you don't need to clone() it. (Shai Erera)
+
+* LUCENE-4670: StoredFieldsWriter and TermVectorsWriter have new finish* callbacks
+  which are called after a doc/field/term has been completely added.
+  (Adrien Grand, Robert Muir)
+
+* LUCENE-4620: IntEncoder/Decoder were changed to do bulk encoding/decoding. As a
+  result, few other classes such as Aggregator and CategoryListIterator were
+  changed to handle bulk category ordinals. (Shai Erera)
 
 New Features
 
 * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of
@@ -324,6 +332,8 @@ Bug Fixes
 
 * LUCENE-4662: Add missing elided articles and prepositions to FrenchAnalyzer's
   DEFAULT_ARTICLES list passed to ElisionFilter. (David Leunen via Steve Rowe)
 
+* LUCENE-4671: Fix CharsRef.subSequence method. (Tim Smith via Robert Muir)
+
 Changes in Runtime Behavior
 
@@ -34,6 +34,7 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.IdentityHashMap;
 import java.util.List;
@@ -66,6 +67,8 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
 import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
+import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
+import org.apache.lucene.analysis.miscellaneous.LengthFilter;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
 import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
@@ -103,67 +106,145 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   static List<Constructor<? extends TokenFilter>> tokenfilters;
   static List<Constructor<? extends CharFilter>> charfilters;
 
-  // TODO: fix those and remove
-  private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+  private static interface Predicate<T> {
+    boolean apply(T o);
+  }
+
+  private static final Predicate<Object[]> ALWAYS = new Predicate<Object[]>() {
+    public boolean apply(Object[] args) {
+      return true;
+    };
+  };
+
+  private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
   static {
-    // TODO: can we promote some of these to be only
-    // offsets offenders?
-    Collections.<Class<?>>addAll(brokenComponents,
-      // doesn't actual reset itself!
-      CachingTokenFilter.class,
-      // doesn't consume whole stream!
-      LimitTokenCountFilter.class,
-      // Not broken: we forcefully add this, so we shouldn't
-      // also randomly pick it:
-      ValidatingTokenFilter.class,
-      // NOTE: these by themselves won't cause any 'basic assertions' to fail.
-      // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any
-      // tokenfilter that combines words (e.g. shingles) comes after them,
-      // this will create bogus offsets because their 'offsets go backwards',
-      // causing shingle or whatever to make a single token with a
-      // startOffset thats > its endOffset
-      // (see LUCENE-3738 for a list of other offenders here)
-      // broken!
-      NGramTokenizer.class,
-      // broken!
-      NGramTokenFilter.class,
-      // broken!
-      EdgeNGramTokenizer.class,
-      // broken!
-      EdgeNGramTokenFilter.class,
-      // broken!
-      WordDelimiterFilter.class,
-      // broken!
-      TrimFilter.class
-    );
+    try {
+      brokenConstructors.put(
+          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class),
+          ALWAYS);
+      brokenConstructors.put(
+          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 3;
+              return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
+            }
+          });
+      for (Class<?> c : Arrays.<Class<?>>asList(
+          // TODO: can we promote some of these to be only
+          // offsets offenders?
+          // doesn't actual reset itself!
+          CachingTokenFilter.class,
+          // Not broken: we forcefully add this, so we shouldn't
+          // also randomly pick it:
+          ValidatingTokenFilter.class,
+          // NOTE: these by themselves won't cause any 'basic assertions' to fail.
+          // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any
+          // tokenfilter that combines words (e.g. shingles) comes after them,
+          // this will create bogus offsets because their 'offsets go backwards',
+          // causing shingle or whatever to make a single token with a
+          // startOffset thats > its endOffset
+          // (see LUCENE-3738 for a list of other offenders here)
+          // broken!
+          NGramTokenizer.class,
+          // broken!
+          NGramTokenFilter.class,
+          // broken!
+          EdgeNGramTokenizer.class,
+          // broken!
+          EdgeNGramTokenFilter.class,
+          // broken!
+          WordDelimiterFilter.class)) {
+        for (Constructor<?> ctor : c.getConstructors()) {
+          brokenConstructors.put(ctor, ALWAYS);
+        }
+      }
+    } catch (Exception e) {
+      throw new Error(e);
+    }
   }
 
   // TODO: also fix these and remove (maybe):
-  // Classes that don't produce consistent graph offsets:
-  private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+  // Classes/options that don't produce consistent graph offsets:
+  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
   static {
-    Collections.<Class<?>>addAll(brokenOffsetsComponents,
-      ReversePathHierarchyTokenizer.class,
-      PathHierarchyTokenizer.class,
-      HyphenationCompoundWordTokenFilter.class,
-      DictionaryCompoundWordTokenFilter.class,
-      // TODO: corrumpts graphs (offset consistency check):
-      PositionFilter.class,
-      // TODO: it seems to mess up offsets!?
-      WikipediaTokenizer.class,
-      // TODO: doesn't handle graph inputs
-      ThaiWordFilter.class,
-      // TODO: doesn't handle graph inputs
-      CJKBigramFilter.class,
-      // TODO: doesn't handle graph inputs (or even look at positionIncrement)
-      HyphenatedWordsFilter.class,
-      // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
-      TypeTokenFilter.class,
-      // TODO: doesn't handle graph inputs
-      CommonGramsQueryFilter.class
-    );
+    try {
+      brokenOffsetsConstructors.put(
+          TrimFilter.class.getConstructor(TokenStream.class, boolean.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 2;
+              return (Boolean) args[1]; // args are broken if updateOffsets is true
+            }
+          });
+      brokenOffsetsConstructors.put(
+          TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 4;
+              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
+              return !(Boolean) args[0];
+            }
+          });
+      brokenOffsetsConstructors.put(
+          TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 3;
+              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
+              return !(Boolean) args[0];
+            }
+          });
+      brokenOffsetsConstructors.put(
+          LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 4;
+              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
+              return !(Boolean) args[0];
+            }
+          });
+      brokenOffsetsConstructors.put(
+          KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class),
+          new Predicate<Object[]>() {
+            @Override
+            public boolean apply(Object[] args) {
+              assert args.length == 3;
+              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
+              return !(Boolean) args[0];
+            }
+          });
+      for (Class<?> c : Arrays.<Class<?>>asList(
+          ReversePathHierarchyTokenizer.class,
+          PathHierarchyTokenizer.class,
+          HyphenationCompoundWordTokenFilter.class,
+          DictionaryCompoundWordTokenFilter.class,
+          // TODO: corrumpts graphs (offset consistency check):
+          PositionFilter.class,
+          // TODO: it seems to mess up offsets!?
+          WikipediaTokenizer.class,
+          // TODO: doesn't handle graph inputs
+          ThaiWordFilter.class,
+          // TODO: doesn't handle graph inputs
+          CJKBigramFilter.class,
+          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
+          HyphenatedWordsFilter.class,
+          // TODO: doesn't handle graph inputs
+          CommonGramsQueryFilter.class)) {
+        for (Constructor<?> ctor : c.getConstructors()) {
+          brokenOffsetsConstructors.put(ctor, ALWAYS);
+        }
+      }
+    } catch (Exception e) {
+      throw new Error(e);
+    }
   }
 
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
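The two registries above replace flat sets of banned classes with per-constructor predicates, so a component is skipped only for the argument combinations that are actually broken. A minimal sketch of how a new entry would be registered, using a made-up MyFilter class that is not part of this commit:

    // Hypothetical registration: MyFilter(TokenStream, boolean) is only broken
    // when its boolean flag is false; its other constructors stay testable.
    brokenConstructors.put(
        MyFilter.class.getConstructor(TokenStream.class, boolean.class),
        new Predicate<Object[]>() {
          @Override
          public boolean apply(Object[] args) {
            return !((Boolean) args[1]); // broken only when the flag is false
          }
        });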
@@ -176,7 +257,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // don't waste time with abstract classes or deprecated known-buggy ones
       Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
       || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
-      || brokenComponents.contains(c)
       || c.isAnnotationPresent(Deprecated.class)
       || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
     ) {
@@ -185,7 +265,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
 
     for (final Constructor<?> ctor : c.getConstructors()) {
       // don't test synthetic or deprecated ctors, they likely have known bugs:
-      if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
+      if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
         continue;
       }
       if (Tokenizer.class.isAssignableFrom(c)) {
@@ -679,7 +759,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     }
     return null; // no success
   }
 
+  private boolean broken(Constructor<?> ctor, Object[] args) {
+    final Predicate<Object[]> pred = brokenConstructors.get(ctor);
+    return pred != null && pred.apply(args);
+  }
+
+  private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
+    final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
+    return pred != null && pred.apply(args);
+  }
+
   // create a new random tokenizer from classpath
   private TokenizerSpec newTokenizer(Random random, Reader reader) {
     TokenizerSpec spec = new TokenizerSpec();
@@ -688,11 +778,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       final StringBuilder descr = new StringBuilder();
       final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
       final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
+      if (broken(ctor, args)) {
+        continue;
+      }
       spec.tokenizer = createComponent(ctor, args, descr);
       if (spec.tokenizer != null) {
-        if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
-          spec.offsetsAreCorrect = false;
-        }
+        spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
         spec.toString = descr.toString();
       } else {
         assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
@@ -710,6 +801,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       while (true) {
         final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
         final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
+        if (broken(ctor, args)) {
+          continue;
+        }
         reader = createComponent(ctor, args, descr);
         if (reader != null) {
           spec.reader = reader;
@@ -746,11 +840,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         }
 
         final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+        if (broken(ctor, args)) {
+          continue;
+        }
         final TokenFilter flt = createComponent(ctor, args, descr);
         if (flt != null) {
-          if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
-            spec.offsetsAreCorrect = false;
-          }
+          spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
           spec.stream = flt;
           break;
         }
@@ -132,7 +132,7 @@ public class TokenInfoDictionaryBuilder {
     System.out.println("  encode...");
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
-    Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true);
+    Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, true);
     IntsRef scratch = new IntsRef();
     long ord = -1; // first ord will be 0
     String lastValue = null;
@@ -458,7 +458,20 @@
   <!-- ================================================================== -->
   <target name="dist-src" depends="package-tgz-src"/>
 
-  <target name="dist-all" depends="dist, dist-src"/>
+  <target name="dist-all" depends="dist, dist-src, -dist-changes, -dist-keys"/>
+
+  <!-- copy changes/ to the release folder -->
+  <target name="-dist-changes">
+    <copy todir="${dist.dir}/changes">
+      <fileset dir="${build.dir}/docs/changes"/>
+    </copy>
+  </target>
+
+  <!-- copy KEYS to the release folder -->
+  <target name="-dist-keys">
+    <get src="http://people.apache.org/keys/group/lucene.asc"
+         dest="${dist.dir}/KEYS"/>
+  </target>
 
   <target name="copy-to-stage">
     <copy-to-stage-macro artifacts.dir="${dist.dir}"/>
@@ -24,23 +24,25 @@
 
   <import file="../module-build.xml"/>
 
-  <path id="base.classpath">
-    <pathelement location="${common.dir}/build/core/classes/java"/>
+  <path id="classpath">
+    <path refid="base.classpath"/>
+    <pathelement path="${lucene-core.jar}"/>
     <pathelement path="${queries.jar}"/>
     <pathelement path="${project.classpath}"/>
+    <pathelement location="${build.dir}/classes/java" />
   </path>
 
   <path id="test.classpath">
     <pathelement path="${analyzers-common.jar}"/>
-    <pathelement location="${common.dir}/build/test-framework/classes/java"/>
-    <pathelement location="${common.dir}/build/codecs/classes/java"/>
-    <path refid="classpath"/>
-    <path refid="junit-path"/>
-    <pathelement location="${build.dir}/classes/java"/>
+    <pathelement location="${test-framework.jar}"/>
+    <pathelement location="${codecs.jar}"/>
+    <path refid="test.base.classpath"/>
   </path>
 
   <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
 
+  <target name="jar-core" depends="common.jar-core" />
+
   <target name="javadocs" depends="javadocs-queries,compile-core">
     <invoke-module-javadoc>
       <links>
@@ -113,7 +113,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
       this.field = field;
       this.doPackFST = doPackFST;
       this.acceptableOverheadRatio = acceptableOverheadRatio;
-      builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio);
+      builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio, true);
     }
 
     private class PostingsWriter extends PostingsConsumer {
@@ -419,7 +419,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
       final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
                                                                    0, 0, true, false, Integer.MAX_VALUE,
-                                                                   outputs, null, false);
+                                                                   outputs, null, false, true);
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
@@ -962,7 +962,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
                                    0, 0, true,
                                    true, Integer.MAX_VALUE,
                                    noOutputs,
-                                   new FindBlocks(), false);
+                                   new FindBlocks(), false, true);
 
       postingsWriter.setField(fieldInfo);
     }
@@ -55,7 +55,10 @@ public abstract class StoredFieldsWriter implements Closeable {
    *  called even if the document has no stored fields, in
    *  this case <code>numStoredFields</code> will be zero. */
   public abstract void startDocument(int numStoredFields) throws IOException;
+
+  /** Called when a document and all its fields have been added. */
+  public void finishDocument() throws IOException {}
 
   /** Writes a single stored field. */
   public abstract void writeField(FieldInfo info, StorableField field) throws IOException;
 
@@ -116,6 +119,8 @@ public abstract class StoredFieldsWriter implements Closeable {
       for (StorableField field : doc) {
         writeField(fieldInfos.fieldInfo(field.name()), field);
       }
+
+      finishDocument();
     }
 
     @Override
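The new finishDocument() hook gives StoredFieldsWriter a symmetric per-document lifecycle; the default merge path above now calls it itself. The contract a caller is expected to follow, sketched with illustrative names (doc and fieldInfos stand in for whatever the indexing chain supplies):

    writer.startDocument(numStoredFields);
    for (StorableField field : doc) {
      writer.writeField(fieldInfos.fieldInfo(field.name()), field);
    }
    writer.finishDocument(); // must be called even for documents with zero fields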
@@ -71,18 +71,27 @@ public abstract class TermVectorsWriter implements Closeable {
    *  has no vector fields, in this case <code>numVectorFields</code>
    *  will be zero. */
   public abstract void startDocument(int numVectorFields) throws IOException;
+
+  /** Called after a doc and all its fields have been added. */
+  public void finishDocument() throws IOException {};
 
   /** Called before writing the terms of the field.
    *  {@link #startTerm(BytesRef, int)} will be called <code>numTerms</code> times. */
   public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException;
+
+  /** Called after a field and all its terms have been added. */
+  public void finishField() throws IOException {};
 
   /** Adds a term and its term frequency <code>freq</code>.
    *  If this field has positions and/or offsets enabled, then
    *  {@link #addPosition(int, int, int, BytesRef)} will be called
    *  <code>freq</code> times respectively.
    */
   public abstract void startTerm(BytesRef term, int freq) throws IOException;
+
+  /** Called after a term and all its positions have been added. */
+  public void finishTerm() throws IOException {}
 
   /** Adds a term position and offsets */
   public abstract void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException;
 
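Taken together, the three new callbacks make the vector-writing protocol fully bracketed. A sketch of the call order for one document, where FieldData and TermData are made-up holders standing in for the real per-field and per-term state:

    writer.startDocument(fields.length);
    for (FieldData f : fields) {
      writer.startField(f.info, f.terms.length, f.hasPositions, f.hasOffsets, f.hasPayloads);
      for (TermData t : f.terms) {
        writer.startTerm(t.bytes, t.freq);
        for (int i = 0; i < t.freq; i++) {
          writer.addPosition(t.positions[i], t.startOffsets[i], t.endOffsets[i], t.payloads[i]);
        }
        writer.finishTerm();   // new callback
      }
      writer.finishField();    // new callback
    }
    writer.finishDocument();   // new callback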
@@ -97,7 +106,7 @@ public abstract class TermVectorsWriter implements Closeable {
    *  check that this is the case to detect the JRE bug described
    *  in LUCENE-1282. */
   public abstract void finish(FieldInfos fis, int numDocs) throws IOException;
 
   /**
    * Called by IndexWriter when writing new segments.
    * <p>
|
||||||
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
|
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
|
||||||
if (vectors == null) {
|
if (vectors == null) {
|
||||||
startDocument(0);
|
startDocument(0);
|
||||||
|
finishDocument();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -275,10 +285,13 @@ public abstract class TermVectorsWriter implements Closeable {
             addPosition(pos, startOffset, endOffset, payload);
           }
         }
+        finishTerm();
       }
       assert termCount == numTerms;
+      finishField();
     }
     assert fieldCount == numFields;
+    finishDocument();
   }
 
   /** Return the BytesRef Comparator used to sort terms
@@ -395,8 +395,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
    * Copy compressed data.
    */
   void copyCompressedData(DataOutput out) throws IOException {
-    final int chunkSize = chunkSize();
-    decompressor.copyCompressedData(fieldsStream, chunkSize, out);
+    final long chunkEnd = docBase + chunkDocs == numDocs
+        ? fieldsStream.length()
+        : indexReader.getStartPointer(docBase + chunkDocs);
+    out.copyBytes(fieldsStream, chunkEnd - fieldsStream.getFilePointer());
   }
 
 }
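Instead of re-parsing the compressed stream through the decompressor, the reader now derives the chunk's byte extent from the chunk index and copies it verbatim. A worked example with made-up numbers: if the current file pointer is 1000 and the next chunk starts at 1750, exactly 750 raw bytes are copied; for the final chunk the file length bounds the copy instead:

    // Sketch of the bounds computation (values illustrative):
    long start = fieldsStream.getFilePointer();                       // e.g. 1000
    long chunkEnd = indexReader.getStartPointer(docBase + chunkDocs); // e.g. 1750
    out.copyBytes(fieldsStream, chunkEnd - start);                    // copies 750 bytes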
@@ -136,19 +136,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     }
   }
 
-  private void endWithPreviousDocument() throws IOException {
-    if (numBufferedDocs > 0) {
-      endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
-    }
-  }
-
   @Override
   public void startDocument(int numStoredFields) throws IOException {
-    endWithPreviousDocument();
-    if (triggerFlush()) {
-      flush();
-    }
-
     if (numBufferedDocs == this.numStoredFields.length) {
       final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4);
       this.numStoredFields = Arrays.copyOf(this.numStoredFields, newLength);
@@ -158,6 +147,14 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     ++numBufferedDocs;
   }
 
+  @Override
+  public void finishDocument() throws IOException {
+    endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
+    if (triggerFlush()) {
+      flush();
+    }
+  }
+
   private static void saveInts(int[] values, int length, DataOutput out) throws IOException {
     assert length > 0;
     if (length == 1) {
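Moving the end-offset bookkeeping and the flush check out of startDocument and into the new finishDocument() means a full chunk can be flushed as soon as its last document completes, rather than waiting for the next document to begin. The per-document sequence now looks like this (field names illustrative):

    writer.startDocument(2);           // grows the per-doc arrays if needed
    writer.writeField(info1, field1);
    writer.writeField(info2, field2);
    writer.finishDocument();           // records the end offset; flushes if the chunk is full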
@@ -295,9 +292,10 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
 
   @Override
   public void finish(FieldInfos fis, int numDocs) throws IOException {
-    endWithPreviousDocument();
     if (numBufferedDocs > 0) {
       flush();
+    } else {
+      assert bufferedDocs.length == 0;
     }
     if (docBase != numDocs) {
       throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
@@ -351,17 +349,13 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       }
 
       if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
-          && (numBufferedDocs == 0 || triggerFlush()) // starting a new chunk
+          && numBufferedDocs == 0 // starting a new chunk
           && startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
          && startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
          && nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
         assert docID == it.docBase;
 
         // no need to decompress, just copy data
-        endWithPreviousDocument();
-        if (triggerFlush()) {
-          flush();
-        }
         indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer());
         writeHeader(this.docBase, it.chunkDocs, it.numStoredFields, it.lengths);
         it.copyCompressedData(fieldsStream);
@@ -380,6 +374,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
         final int diff = docID - it.docBase;
         startDocument(it.numStoredFields[diff]);
         bufferedDocs.writeBytes(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
+        finishDocument();
         ++docCount;
         mergeState.checkAbort.work(300);
       }
@@ -140,14 +140,6 @@ public abstract class CompressionMode {
       bytes.length = length;
     }
 
-    @Override
-    public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException {
-      final int copied = LZ4.copyCompressedData(in, originalLength, out);
-      if (copied != originalLength) {
-        throw new CorruptIndexException("Currupted compressed stream: expected " + originalLength + " bytes, but got at least" + copied);
-      }
-    }
-
     @Override
     public Decompressor clone() {
       return this;
@@ -224,13 +216,6 @@
       bytes.length = length;
     }
 
-    @Override
-    public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException {
-      final int compressedLength = in.readVInt();
-      out.writeVInt(compressedLength);
-      out.copyBytes(in, compressedLength);
-    }
-
     @Override
     public Decompressor clone() {
       return new DeflateDecompressor();
@@ -24,7 +24,10 @@ import org.apache.lucene.store.DataOutput;
 /**
  * A data compressor.
  */
-abstract class Compressor {
+public abstract class Compressor {
+
+  /** Sole constructor, typically called from sub-classes. */
+  protected Compressor() {}
 
   /**
    * Compress bytes into <code>out</code>. It it the responsibility of the
@@ -20,13 +20,15 @@ package org.apache.lucene.codecs.compressing;
 import java.io.IOException;
 
 import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
 
 /**
- * An decompressor.
+ * A decompressor.
  */
-abstract class Decompressor implements Cloneable {
+public abstract class Decompressor implements Cloneable {
+
+  /** Sole constructor, typically called from sub-classes. */
+  protected Decompressor() {}
 
   /**
    * Decompress bytes that were stored between offsets <code>offset</code> and
@@ -44,10 +46,6 @@ abstract class Decompressor implements Cloneable {
    */
   public abstract void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException;
 
-  /** Copy a compressed stream whose original length is
-   *  <code>originalLength</code> from <code>in</code> to <code>out</code>. */
-  public abstract void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException;
-
   @Override
   public abstract Decompressor clone();
 
@@ -506,51 +506,4 @@ class LZ4 {
     encodeLastLiterals(src, anchor, srcEnd - anchor, out);
   }
 
-  /** Copy bytes from <code>in</code> to <code>out</code> where
-   *  <code>in</code> is a LZ4-encoded stream. This method copies enough bytes
-   *  so that <code>out</code> can be used later on to restore the first
-   *  <code>length</code> bytes of the stream. This method always reads at
-   *  least one byte from <code>in</code> so make sure not to call this method
-   *  if <code>in</code> reached the end of the stream, even if
-   *  <code>length=0</code>. */
-  public static int copyCompressedData(DataInput in, int length, DataOutput out) throws IOException {
-    int n = 0;
-    do {
-      // literals
-      final byte token = in.readByte();
-      out.writeByte(token);
-      int literalLen = (token & 0xFF) >>> 4;
-      if (literalLen == 0x0F) {
-        byte len;
-        while ((len = in.readByte()) == (byte) 0xFF) {
-          literalLen += 0xFF;
-          out.writeByte(len);
-        }
-        literalLen += len & 0xFF;
-        out.writeByte(len);
-      }
-      out.copyBytes(in, literalLen);
-      n += literalLen;
-      if (n >= length) {
-        break;
-      }
-
-      // matchs
-      out.copyBytes(in, 2); // match dec
-      int matchLen = token & 0x0F;
-      if (matchLen == 0x0F) {
-        byte len;
-        while ((len = in.readByte()) == (byte) 0xFF) {
-          matchLen += 0xFF;
-          out.writeByte(len);
-        }
-        matchLen += len & 0xFF;
-        out.writeByte(len);
-      }
-      matchLen += MIN_MATCH;
-      n += matchLen;
-    } while (n < length);
-    return n;
-  }
-
 }
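The deleted copier walked LZ4's block framing directly: each sequence starts with a token byte whose high nibble is the literal length and low nibble the match length, and a nibble of 0x0F is extended by extra bytes, where every 0xFF byte adds 255 and the first other byte terminates the run. For example, a nibble of 0x0F followed by the bytes 0xFF and 0x03 decodes to 15 + 255 + 3 = 273:

    // Reference sketch of the length decoding the removed method performed:
    int literalLen = (token & 0xFF) >>> 4;
    if (literalLen == 0x0F) {
      byte len;
      while ((len = in.readByte()) == (byte) 0xFF) {
        literalLen += 0xFF;   // each 0xFF escape byte adds 255
      }
      literalLen += len & 0xFF; // the terminating byte adds its own value
    }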
@@ -124,17 +124,16 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
     if (payloads)
       bits |= Lucene40TermVectorsReader.STORE_PAYLOAD_WITH_TERMVECTOR;
     tvf.writeByte(bits);
-
-    assert fieldCount <= numVectorFields;
-    if (fieldCount == numVectorFields) {
-      // last field of the document
-      // this is crazy because the file format is crazy!
-      for (int i = 1; i < fieldCount; i++) {
-        tvd.writeVLong(fps[i] - fps[i-1]);
-      }
-    }
   }
 
+  @Override
+  public void finishDocument() throws IOException {
+    assert fieldCount == numVectorFields;
+    for (int i = 1; i < fieldCount; i++) {
+      tvd.writeVLong(fps[i] - fps[i-1]);
+    }
+  }
+
   private final BytesRef lastTerm = new BytesRef(10);
 
   // NOTE: we override addProx, so we don't need to buffer when indexing.
@@ -222,20 +221,6 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
       }
 
       bufferedIndex++;
-
-      // dump buffer if we are done
-      if (bufferedIndex == bufferedFreq) {
-        if (payloads) {
-          tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
-        }
-        for (int i = 0; i < bufferedIndex; i++) {
-          if (offsets) {
-            tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
-            tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
-            lastOffset = offsetEndBuffer[i];
-          }
-        }
-      }
     } else if (positions) {
       // write position delta
       writePosition(position - lastPosition, payload);
@@ -248,6 +233,25 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
     }
   }
 
+  @Override
+  public void finishTerm() throws IOException {
+    if (bufferedIndex > 0) {
+      // dump buffer
+      assert positions && (offsets || payloads);
+      assert bufferedIndex == bufferedFreq;
+      if (payloads) {
+        tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
+      }
+      for (int i = 0; i < bufferedIndex; i++) {
+        if (offsets) {
+          tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
+          tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
+          lastOffset = offsetEndBuffer[i];
+        }
+      }
+    }
+  }
+
   private void writePosition(int delta, BytesRef payload) throws IOException {
     if (payloads) {
       int payloadLength = payload == null ? 0 : payload.length;
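Previously the buffered offsets and payloads were dumped from inside addPosition, keyed on the last position arriving; the new finishTerm() override makes that dump an explicit step with its own assertions. Sketched per term (values illustrative):

    startTerm(term, freq);   // buffering begins when offsets/payloads are enabled
    // addPosition(...) is called freq times and fills the buffers
    finishTerm();            // writes payload bytes, then per-position offset deltas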
@@ -108,6 +108,7 @@ final class StoredFieldsProcessor extends StoredFieldsConsumer {
       while(lastDocID < docID) {
         fieldsWriter.startDocument(0);
         lastDocID++;
+        fieldsWriter.finishDocument();
       }
     }
 
@@ -123,6 +124,7 @@ final class StoredFieldsProcessor extends StoredFieldsConsumer {
       for (int i = 0; i < numStoredFields; i++) {
         fieldsWriter.writeField(fieldInfos[i], storedFields[i]);
       }
+      fieldsWriter.finishDocument();
       lastDocID++;
     }
 
@@ -78,6 +78,7 @@ final class TermVectorsConsumer extends TermsHashConsumer {
   void fill(int docID) throws IOException {
     while(lastDocID < docID) {
       writer.startDocument(0);
+      writer.finishDocument();
      lastDocID++;
     }
   }
@@ -108,6 +109,7 @@ final class TermVectorsConsumer extends TermsHashConsumer {
     for (int i = 0; i < numVectorFields; i++) {
       perFields[i].finishDocument();
     }
+    writer.finishDocument();
 
     assert lastDocID == docState.docID: "lastDocID=" + lastDocID + " docState.docID=" + docState.docID;
 
@@ -182,7 +182,9 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField {
        }
        tv.addProx(freq, posReader, offReader);
      }
+      tv.finishTerm();
    }
+    tv.finishField();
 
    termsHashPerField.reset();
 
@@ -218,7 +218,7 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence, Clone
     if (start < 0 || end > length || start > end) {
       throw new IndexOutOfBoundsException();
     }
-    return new CharsRef(chars, offset + start, offset + end);
+    return new CharsRef(chars, offset + start, end - start);
   }
 
   /** @deprecated This comparator is only a transition mechanism */
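The one-character fix above matters because the third CharsRef constructor argument is a length, not an end offset. A worked example (values illustrative):

    char[] chars = "abcdef".toCharArray();
    CharsRef ref = new CharsRef(chars, 2, 4);   // the view "cdef"
    CharSequence sub = ref.subSequence(1, 3);   // should be "de"
    // old code: new CharsRef(chars, 3, 2 + 3) -> length 5, overruns the view
    // new code: new CharsRef(chars, 3, 3 - 1) -> length 2, correctly "de"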
@@ -84,11 +84,11 @@ public class Builder<T> {
   /**
    * Instantiates an FST/FSA builder without any pruning. A shortcut
    * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean,
-   * boolean, int, Outputs, FreezeTail, boolean)} with
+   * boolean, int, Outputs, FreezeTail, boolean, boolean)} with
    * pruning options turned off.
    */
   public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
-    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT);
+    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true);
   }
 
   /**
@@ -97,9 +97,9 @@ public class Builder<T> {
    */
   public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
                  boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
-                 FreezeTail<T> freezeTail, boolean willPackFST) {
+                 FreezeTail<T> freezeTail, boolean willPackFST, boolean allowArrayArcs) {
     this(inputType, minSuffixCount1, minSuffixCount2, doShareSuffix, doShareNonSingletonNodes,
-         shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT);
+         shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT, allowArrayArcs);
   }
 
   /**
@@ -143,10 +143,14 @@ public class Builder<T> {
    *
    * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
    *                                is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
+   *
+   * @param allowArrayArcs Pass false to disable the array arc optimization
+   *                       while building the FST; this will make the resulting
+   *                       FST smaller but slower to traverse.
    */
   public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
                  boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
-                 FreezeTail<T> freezeTail, boolean doPackFST, float acceptableOverheadRatio) {
+                 FreezeTail<T> freezeTail, boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
     this.minSuffixCount1 = minSuffixCount1;
     this.minSuffixCount2 = minSuffixCount2;
     this.freezeTail = freezeTail;
@@ -154,7 +158,7 @@ public class Builder<T> {
     this.shareMaxTailLength = shareMaxTailLength;
     this.doPackFST = doPackFST;
     this.acceptableOverheadRatio = acceptableOverheadRatio;
-    fst = new FST<T>(inputType, outputs, doPackFST, acceptableOverheadRatio);
+    fst = new FST<T>(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs);
     if (doShareSuffix) {
       dedupHash = new NodeHash<T>(fst);
     } else {
@@ -182,13 +186,6 @@ public class Builder<T> {
     return dedupHash == null ? 0 : fst.nodeCount;
   }
 
-  /** Pass false to disable the array arc optimization
-   *  while building the FST; this will make the resulting
-   *  FST smaller but slower to traverse. */
-  public void setAllowArrayArcs(boolean b) {
-    fst.setAllowArrayArcs(b);
-  }
-
   private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
     final int node;
     if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
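With the setter gone, callers choose the array-arc behavior up front; every Builder construction site in this commit gained the trailing boolean. A sketch of opting out of array arcs under the new signature (the outputs choice is illustrative):

    Builder<BytesRef> builder = new Builder<BytesRef>(
        FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE,
        ByteSequenceOutputs.getSingleton(), null,
        false,              // willPackFST
        PackedInts.COMPACT, // acceptableOverheadRatio
        false);             // allowArrayArcs: smaller FST, slower traversal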
@@ -33,6 +33,7 @@ import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
 import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
@@ -137,16 +138,18 @@ public final class FST<T> {
   // if non-null, this FST accepts the empty string and
   // produces this output
   T emptyOutput;
-  private byte[] emptyOutputBytes;
 
   // Not private to avoid synthetic access$NNN methods:
   byte[] bytes;
-  int byteUpto = 0;
 
   private int startNode = -1;
 
   public final Outputs<T> outputs;
 
+  // Used for the BIT_TARGET_NEXT optimization (whereby
+  // instead of storing the address of the target node for
+  // a given arc, we mark a single bit noting that the next
+  // node in the byte[] is the target node):
   private int lastFrozenNode;
 
   private final T NO_OUTPUT;
|
||||||
/** If arc has this label then that arc is final/accepted */
|
/** If arc has this label then that arc is final/accepted */
|
||||||
public static final int END_LABEL = -1;
|
public static final int END_LABEL = -1;
|
||||||
|
|
||||||
private boolean allowArrayArcs = true;
|
private final boolean allowArrayArcs;
|
||||||
|
|
||||||
private Arc<T> cachedRootArcs[];
|
private Arc<T> cachedRootArcs[];
|
||||||
|
|
||||||
|
@@ -262,9 +265,10 @@ public final class FST<T> {
 
   // make a new empty FST, for building; Builder invokes
   // this ctor
-  FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio) {
+  FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
     this.inputType = inputType;
     this.outputs = outputs;
+    this.allowArrayArcs = allowArrayArcs;
     bytes = new byte[128];
     NO_OUTPUT = outputs.getNoOutput();
     if (willPackFST) {
|
||||||
if (in.readByte() == 1) {
|
if (in.readByte() == 1) {
|
||||||
// accepts empty string
|
// accepts empty string
|
||||||
int numBytes = in.readVInt();
|
int numBytes = in.readVInt();
|
||||||
// messy
|
|
||||||
bytes = new byte[numBytes];
|
bytes = new byte[numBytes];
|
||||||
in.readBytes(bytes, 0, numBytes);
|
in.readBytes(bytes, 0, numBytes);
|
||||||
|
|
||||||
|
// De-serialize empty-string output:
|
||||||
BytesReader reader;
|
BytesReader reader;
|
||||||
if (packed) {
|
if (packed) {
|
||||||
reader = getBytesReader(0);
|
reader = new ForwardBytesReader(bytes, 0);
|
||||||
} else {
|
} else {
|
||||||
reader = getBytesReader(numBytes-1);
|
reader = new ReverseBytesReader(bytes, bytes.length-1);
|
||||||
}
|
}
|
||||||
emptyOutput = outputs.readFinalOutput(reader);
|
emptyOutput = outputs.readFinalOutput(reader);
|
||||||
} else {
|
} else {
|
||||||
|
@ -335,6 +340,11 @@ public final class FST<T> {
|
||||||
NO_OUTPUT = outputs.getNoOutput();
|
NO_OUTPUT = outputs.getNoOutput();
|
||||||
|
|
||||||
cacheRootArcs();
|
cacheRootArcs();
|
||||||
|
|
||||||
|
// NOTE: bogus because this is only used during
|
||||||
|
// building; we need to break out mutable FST from
|
||||||
|
// immutable
|
||||||
|
allowArrayArcs = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public INPUT_TYPE getInputType() {
|
public INPUT_TYPE getInputType() {
|
||||||
|
@ -412,26 +422,6 @@ public final class FST<T> {
|
||||||
} else {
|
} else {
|
||||||
emptyOutput = v;
|
emptyOutput = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: this is messy -- replace with sillyBytesWriter; maybe make
|
|
||||||
// bytes private
|
|
||||||
final int posSave = writer.getPosition();
|
|
||||||
outputs.writeFinalOutput(emptyOutput, writer);
|
|
||||||
emptyOutputBytes = new byte[writer.getPosition()-posSave];
|
|
||||||
|
|
||||||
if (!packed) {
|
|
||||||
// reverse
|
|
||||||
final int stopAt = (writer.getPosition() - posSave)/2;
|
|
||||||
int upto = 0;
|
|
||||||
while(upto < stopAt) {
|
|
||||||
final byte b = bytes[posSave + upto];
|
|
||||||
bytes[posSave+upto] = bytes[writer.getPosition()-upto-1];
|
|
||||||
bytes[writer.getPosition()-upto-1] = b;
|
|
||||||
upto++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.getPosition()-posSave);
|
|
||||||
writer.setPosition(posSave);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void save(DataOutput out) throws IOException {
|
public void save(DataOutput out) throws IOException {
|
||||||
|
@ -453,7 +443,27 @@ public final class FST<T> {
|
||||||
// TODO: really we should encode this as an arc, arriving
|
// TODO: really we should encode this as an arc, arriving
|
||||||
// to the root node, instead of special casing here:
|
// to the root node, instead of special casing here:
|
||||||
if (emptyOutput != null) {
|
if (emptyOutput != null) {
|
||||||
|
// Accepts empty string
|
||||||
out.writeByte((byte) 1);
|
out.writeByte((byte) 1);
|
||||||
|
|
||||||
|
// Serialize empty-string output:
|
||||||
|
RAMOutputStream ros = new RAMOutputStream();
|
||||||
|
outputs.writeFinalOutput(emptyOutput, ros);
|
||||||
|
|
||||||
|
byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()];
|
||||||
|
ros.writeTo(emptyOutputBytes, 0);
|
||||||
|
|
||||||
|
if (!packed) {
|
||||||
|
// reverse
|
||||||
|
final int stopAt = emptyOutputBytes.length/2;
|
||||||
|
int upto = 0;
|
||||||
|
while(upto < stopAt) {
|
||||||
|
final byte b = emptyOutputBytes[upto];
|
||||||
|
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
|
||||||
|
emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
|
||||||
|
upto++;
|
||||||
|
}
|
||||||
|
}
|
||||||
out.writeVInt(emptyOutputBytes.length);
|
out.writeVInt(emptyOutputBytes.length);
|
||||||
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
|
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
|
||||||
} else {
|
} else {
|
||||||
|
@ -1160,10 +1170,6 @@ public final class FST<T> {
|
||||||
return arcWithOutputCount;
|
return arcWithOutputCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setAllowArrayArcs(boolean v) {
|
|
||||||
allowArrayArcs = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Nodes will be expanded if their depth (distance from the root node) is
|
* Nodes will be expanded if their depth (distance from the root node) is
|
||||||
* <= this value and their number of arcs is >=
|
* <= this value and their number of arcs is >=
|
||||||
|
@ -1453,6 +1459,11 @@ public final class FST<T> {
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
NO_OUTPUT = outputs.getNoOutput();
|
NO_OUTPUT = outputs.getNoOutput();
|
||||||
writer = new DefaultBytesWriter();
|
writer = new DefaultBytesWriter();
|
||||||
|
|
||||||
|
// NOTE: bogus because this is only used during
|
||||||
|
// building; we need to break out mutable FST from
|
||||||
|
// immutable
|
||||||
|
allowArrayArcs = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Expert: creates an FST by packing this one. This
|
/** Expert: creates an FST by packing this one. This
|
||||||
|
|
|
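The net effect of the FST hunks above is that allowArrayArcs is now fixed at construction time: the setAllowArrayArcs setter is removed and both FST and Builder take the flag as a trailing constructor argument. A minimal sketch of the updated call site, mirroring the TestFSTs changes further down (outputs and doRewrite are assumed to be in scope):

    // The trailing boolean is the new allowArrayArcs flag; pass false to
    // disable fixed-size arc arrays (e.g. to trade lookup speed for FST size).
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        Integer.MAX_VALUE, outputs, null, doRewrite, true);
    FST<Long> fst = builder.finish();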
@@ -0,0 +1,227 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT;
+import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0;
+import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.LongsRef;
+
+/**
+ * Reader for sequences of longs written with {@link BlockPackedWriter}.
+ * @see BlockPackedWriter
+ * @lucene.internal
+ */
+public final class BlockPackedReader {
+
+  static long zigZagDecode(long n) {
+    return ((n >>> 1) ^ -(n & 1));
+  }
+
+  // same as DataInput.readVLong but supports negative values
+  static long readVLong(DataInput in) throws IOException {
+    byte b = in.readByte();
+    if (b >= 0) return b;
+    long i = b & 0x7FL;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 7;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 14;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 21;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 28;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 35;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 42;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 49;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0xFFL) << 56;
+    return i;
+  }
+
+  final DataInput in;
+  final int packedIntsVersion;
+  final long valueCount;
+  final int blockSize;
+  final LongsRef values;
+  byte[] blocks;
+  int off;
+  long ord;
+
+  /** Sole constructor.
+   * @param blockSize the number of values of a block, must be equal to the
+   *                  block size of the {@link BlockPackedWriter} which has
+   *                  been used to write the stream
+   */
+  public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) {
+    checkBlockSize(blockSize);
+    this.in = in;
+    this.packedIntsVersion = packedIntsVersion;
+    this.blockSize = blockSize;
+    this.values = new LongsRef(blockSize);
+    assert valueCount >= 0;
+    this.valueCount = valueCount;
+    off = blockSize;
+    ord = 0;
+  }
+
+  /** Skip exactly <code>count</code> values. */
+  public void skip(long count) throws IOException {
+    assert count >= 0;
+    if (ord + count > valueCount || ord + count < 0) {
+      throw new EOFException();
+    }
+
+    // 1. skip buffered values
+    final int skipBuffer = (int) Math.min(count, blockSize - off);
+    off += skipBuffer;
+    ord += skipBuffer;
+    count -= skipBuffer;
+    if (count == 0L) {
+      return;
+    }
+
+    // 2. skip as many blocks as necessary
+    assert off == blockSize;
+    while (count >= blockSize) {
+      final int token = in.readByte() & 0xFF;
+      final int bitsPerValue = token >>> BPV_SHIFT;
+      if (bitsPerValue > 64) {
+        throw new IOException("Corrupted");
+      }
+      if ((token & MIN_VALUE_EQUALS_0) == 0) {
+        readVLong(in);
+      }
+      final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue);
+      skipBytes(blockBytes);
+      ord += blockSize;
+      count -= blockSize;
+    }
+    if (count == 0L) {
+      return;
+    }
+
+    // 3. skip last values
+    assert count < blockSize;
+    refill();
+    ord += count;
+    off += count;
+  }
+
+  private void skipBytes(long count) throws IOException {
+    if (in instanceof IndexInput) {
+      final IndexInput iin = (IndexInput) in;
+      iin.seek(iin.getFilePointer() + count);
+    } else {
+      if (blocks == null) {
+        blocks = new byte[blockSize];
+      }
+      long skipped = 0;
+      while (skipped < count) {
+        final int toSkip = (int) Math.min(blocks.length, count - skipped);
+        in.readBytes(blocks, 0, toSkip);
+        skipped += toSkip;
+      }
+    }
+  }
+
+  /** Read the next value. */
+  public long next() throws IOException {
+    next(1);
+    assert values.length == 1;
+    return values.longs[values.offset];
+  }
+
+  /** Read between <tt>1</tt> and <code>count</code> values. */
+  public LongsRef next(int count) throws IOException {
+    assert count > 0;
+    if (ord == valueCount) {
+      throw new EOFException();
+    }
+    if (off == blockSize) {
+      refill();
+    }
+
+    count = Math.min(count, blockSize - off);
+    count = (int) Math.min(count, valueCount - ord);
+
+    values.offset = off;
+    values.length = count;
+    off += count;
+    ord += count;
+    return values;
+  }
+
+  private void refill() throws IOException {
+    final int token = in.readByte() & 0xFF;
+    final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0;
+    final int bitsPerValue = token >>> BPV_SHIFT;
+    if (bitsPerValue > 64) {
+      throw new IOException("Corrupted");
+    }
+    final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in));
+    assert minEquals0 || minValue != 0;
+
+    if (bitsPerValue == 0) {
+      Arrays.fill(values.longs, minValue);
+    } else {
+      final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue);
+      final int iterations = blockSize / decoder.valueCount();
+      final int blocksSize = iterations * 8 * decoder.blockCount();
+      if (blocks == null || blocks.length < blocksSize) {
+        blocks = new byte[blocksSize];
+      }
+
+      final int valueCount = (int) Math.min(this.valueCount - ord, blockSize);
+      final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue);
+      in.readBytes(blocks, 0, blocksCount);
+
+      decoder.decode(blocks, 0, values.longs, 0, iterations);
+
+      if (minValue != 0) {
+        for (int i = 0; i < valueCount; ++i) {
+          values.longs[i] += minValue;
+        }
+      }
+    }
+    off = 0;
+  }
+
+  /** Return the offset of the next value to read. */
+  public long ord() {
+    return ord;
+  }
+
+}
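One detail worth calling out in the reader above: zigZagDecode is the inverse of BlockPackedWriter.zigZagEncode, which interleaves negative and non-negative values so that longs of small magnitude get small unsigned codes (0, -1, 1, -2, 2 encode to 0, 1, 2, 3, 4). A worked check of one value (a sketch; both methods are package-private statics):

    // zigZagEncode(-2) = (-2 >> 63) ^ (-2 << 1) = -1 ^ -4 = 3
    // zigZagDecode(3)  = (3 >>> 1) ^ -(3 & 1)  =  1 ^ -1 = -2
    assert BlockPackedWriter.zigZagEncode(-2L) == 3L;
    assert BlockPackedReader.zigZagDecode(3L) == -2L;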
@@ -0,0 +1,164 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A writer for large sequences of longs.
+ * <p>
+ * The sequence is divided into fixed-size blocks and for each block, the
+ * difference between each value and the minimum value of the block is encoded
+ * using as few bits as possible. Memory usage of this class is proportional to
+ * the block size. Each block has an overhead between 1 and 10 bytes to store
+ * the minimum value and the number of bits per value of the block.
+ * @see BlockPackedReader
+ * @lucene.internal
+ */
+public final class BlockPackedWriter {
+
+  static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
+  static final int MIN_VALUE_EQUALS_0 = 1 << 0;
+  static final int BPV_SHIFT = 1;
+
+  static void checkBlockSize(int blockSize) {
+    if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
+      throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
+    }
+    if (blockSize % 64 != 0) {
+      throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize);
+    }
+  }
+
+  static long zigZagEncode(long n) {
+    return (n >> 63) ^ (n << 1);
+  }
+
+  // same as DataOutput.writeVLong but accepts negative values
+  static void writeVLong(DataOutput out, long i) throws IOException {
+    int k = 0;
+    while ((i & ~0x7FL) != 0L && k++ < 8) {
+      out.writeByte((byte)((i & 0x7FL) | 0x80L));
+      i >>>= 7;
+    }
+    out.writeByte((byte) i);
+  }
+
+  final DataOutput out;
+  final long[] values;
+  byte[] blocks;
+  int off;
+  long ord;
+  boolean finished;
+
+  /**
+   * Sole constructor.
+   * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+   */
+  public BlockPackedWriter(DataOutput out, int blockSize) {
+    checkBlockSize(blockSize);
+    this.out = out;
+    values = new long[blockSize];
+    off = 0;
+    ord = 0L;
+    finished = false;
+  }
+
+  private void checkNotFinished() {
+    if (finished) {
+      throw new IllegalStateException("Already finished");
+    }
+  }
+
+  /** Append a new long. */
+  public void add(long l) throws IOException {
+    checkNotFinished();
+    if (off == values.length) {
+      flush();
+    }
+    values[off++] = l;
+    ++ord;
+  }
+
+  /** Flush all buffered data to disk. This instance is not usable anymore
+   *  after this method has been called. */
+  public void finish() throws IOException {
+    checkNotFinished();
+    if (off > 0) {
+      flush();
+    }
+    finished = true;
+  }
+
+  private void flush() throws IOException {
+    assert off > 0;
+    long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+    for (int i = 0; i < off; ++i) {
+      min = Math.min(values[i], min);
+      max = Math.max(values[i], max);
+    }
+
+    final long delta = max - min;
+    final int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta);
+    if (bitsRequired == 64) {
+      // no need to delta-encode
+      min = 0L;
+    } else if (min > 0L) {
+      // make min as small as possible so that writeVLong requires fewer bytes
+      min = Math.max(0L, max - PackedInts.maxValue(bitsRequired));
+    }
+
+    final int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? MIN_VALUE_EQUALS_0 : 0);
+    out.writeByte((byte) token);
+
+    if (min != 0) {
+      writeVLong(out, zigZagEncode(min) - 1);
+    }
+
+    if (bitsRequired > 0) {
+      if (min != 0) {
+        for (int i = 0; i < off; ++i) {
+          values[i] -= min;
+        }
+      }
+      final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired);
+      final int iterations = values.length / encoder.valueCount();
+      final int blockSize = encoder.blockCount() * 8 * iterations;
+      if (blocks == null || blocks.length < blockSize) {
+        blocks = new byte[blockSize];
+      }
+      if (off < values.length) {
+        Arrays.fill(values, off, values.length, 0L);
+      }
+      encoder.encode(values, 0, blocks, 0, iterations);
+      final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired);
+      out.writeBytes(blocks, blockCount);
+    }
+
+    off = 0;
+  }
+
+  /** Return the number of values which have been added. */
+  public long ord() {
+    return ord;
+  }
+
+}
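Taken together, the two new classes give a simple streaming API. A minimal write-then-read round trip, assuming a Directory named dir and a long[] named values are in scope (the file name is illustrative; the randomized version of this appears in the TestPackedInts hunk below):

    IndexOutput out = dir.createOutput("values.bin", IOContext.DEFAULT);
    BlockPackedWriter writer = new BlockPackedWriter(out, 64);  // 64 values per block
    for (long value : values) {
      writer.add(value);         // buffers values, flushing one packed block at a time
    }
    writer.finish();             // flushes the last, possibly partial, block
    out.close();

    // blockSize and valueCount must match what was written
    IndexInput in = dir.openInput("values.bin", IOContext.DEFAULT);
    BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, 64, values.length);
    for (int i = 0; i < values.length; ++i) {
      assert values[i] == reader.next();
    }
    in.close();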
@@ -80,16 +80,6 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
   }

-  static byte[] copyCompressedData(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException {
-    GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(compressed.length);
-    decompressor.copyCompressedData(new ByteArrayDataInput(compressed), originalLength, out);
-    return Arrays.copyOf(out.bytes, out.length);
-  }
-
-  byte[] copyCompressedData(byte[] compressed, int originalLength) throws IOException {
-    return copyCompressedData(mode.newDecompressor(), compressed, originalLength);
-  }
-
   public void testDecompress() throws IOException {
     final int iterations = atLeast(10);
     for (int i = 0; i < iterations; ++i) {

@@ -117,17 +107,10 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     }
   }

-  public void testCopyCompressedData() throws IOException {
-    final byte[] decompressed = randomArray();
-    final byte[] compressed = compress(decompressed);
-    assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length));
-  }
-
   public byte[] test(byte[] decompressed) throws IOException {
     final byte[] compressed = compress(decompressed);
     final byte[] restored = decompress(compressed, decompressed.length);
     assertEquals(decompressed.length, restored.length);
-    assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length));
     return compressed;
   }
@@ -116,11 +116,28 @@ public class TestCharsRef extends LuceneTestCase {
   }

   // LUCENE-3590: fix off-by-one in subsequence, and fully obey interface
+  // LUCENE-4671: fix subSequence
   public void testCharSequenceSubSequence() {
-    CharSequence c = new CharsRef("abc");
+    CharSequence sequences[] = {
+      new CharsRef("abc"),
+      new CharsRef("0abc".toCharArray(), 1, 3),
+      new CharsRef("abc0".toCharArray(), 0, 3),
+      new CharsRef("0abc0".toCharArray(), 1, 3)
+    };
+
+    for (CharSequence c : sequences) {
+      doTestSequence(c);
+    }
+  }
+
+  private void doTestSequence(CharSequence c) {

     // slice
     assertEquals("a", c.subSequence(0, 1).toString());
+    // mid subsequence
+    assertEquals("b", c.subSequence(1, 2).toString());
+    // end subsequence
+    assertEquals("bc", c.subSequence(1, 3).toString());
     // empty subsequence
     assertEquals("", c.subSequence(0, 0).toString());
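The added cases matter because a CharsRef may be a window into a larger char[] with a non-zero offset; per the CharSequence interface, subSequence must slice relative to the view, not the backing array. A small illustration of the property the new cases pin down (a sketch, not part of the test itself):

    CharSequence c = new CharsRef("0abc".toCharArray(), 1, 3);  // a view of "abc"
    assert c.subSequence(1, 2).toString().equals("b");          // correct only if the offset is applied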
@@ -310,7 +310,7 @@ public class TestFSTs extends LuceneTestCase {

     final boolean doRewrite = random().nextBoolean();

-    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
+    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite, true);

     boolean storeOrd = random().nextBoolean();
     if (VERBOSE) {

@@ -453,8 +453,7 @@ public class TestFSTs extends LuceneTestCase {
       this.outputs = outputs;
       this.doPack = doPack;

-      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
-      builder.setAllowArrayArcs(!noArcArrays);
+      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack, !noArcArrays);
     }

     protected abstract T getOutput(IntsRef input, int ord) throws IOException;

@@ -1063,7 +1062,7 @@ public class TestFSTs extends LuceneTestCase {
   public void testFinalOutputOnEndState() throws Exception {
     final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);

-    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random().nextBoolean());
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random().nextBoolean(), true);
     builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
     builder.add(Util.toUTF32("station", new IntsRef()), 10L);
     final FST<Long> fst = builder.finish();

@@ -1078,7 +1077,7 @@ public class TestFSTs extends LuceneTestCase {
   public void testInternalFinalState() throws Exception {
     final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
     final boolean willRewrite = random().nextBoolean();
-    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite, true);
     builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
     builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
     final FST<Long> fst = builder.finish();

@@ -1101,7 +1100,7 @@ public class TestFSTs extends LuceneTestCase {
     final Long nothing = outputs.getNoOutput();
     final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

-    final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT);
+    final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, true);

     final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
@@ -27,6 +27,8 @@ import java.util.Locale;
 import java.util.Random;

 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;

@@ -875,4 +877,102 @@ public class TestPackedInts extends LuceneTestCase {
     in.close();
     dir.close();
   }
+  public void testBlockPackedReaderWriter() throws IOException {
+    final int iters = atLeast(2);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12);
+      final int valueCount = random().nextInt(1 << 18);
+      final long[] values = new long[valueCount];
+      long minValue = 0;
+      int bpv = 0;
+      for (int i = 0; i < valueCount; ++i) {
+        if (i % blockSize == 0) {
+          minValue = rarely() ? random().nextInt(256) : rarely() ? -5 : random().nextLong();
+          bpv = random().nextInt(65);
+        }
+        if (bpv == 0) {
+          values[i] = minValue;
+        } else if (bpv == 64) {
+          values[i] = random().nextLong();
+        } else {
+          values[i] = minValue + _TestUtil.nextLong(random(), 0, (1L << bpv) - 1);
+        }
+      }
+
+      final Directory dir = newDirectory();
+      final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
+      final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize);
+      for (int i = 0; i < valueCount; ++i) {
+        assertEquals(i, writer.ord());
+        writer.add(values[i]);
+      }
+      assertEquals(valueCount, writer.ord());
+      writer.finish();
+      assertEquals(valueCount, writer.ord());
+      final long fp = out.getFilePointer();
+      out.close();
+
+      DataInput in = dir.openInput("out.bin", IOContext.DEFAULT);
+      if (random().nextBoolean()) {
+        byte[] buf = new byte[(int) fp];
+        in.readBytes(buf, 0, (int) fp);
+        ((IndexInput) in).close();
+        in = new ByteArrayDataInput(buf);
+      }
+      final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+      for (int i = 0; i < valueCount; ) {
+        if (random().nextBoolean()) {
+          assertEquals("" + i, values[i], reader.next());
+          ++i;
+        } else {
+          final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024));
+          for (int j = 0; j < nextValues.length; ++j) {
+            assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]);
+          }
+          i += nextValues.length;
+        }
+        assertEquals(i, reader.ord());
+      }
+      assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+      try {
+        reader.next();
+        assertTrue(false);
+      } catch (IOException e) {
+        // OK
+      }
+
+      if (in instanceof ByteArrayDataInput) {
+        ((ByteArrayDataInput) in).setPosition(0);
+      } else {
+        ((IndexInput) in).seek(0L);
+      }
+      final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+      int i = 0;
+      while (true) {
+        final int skip = _TestUtil.nextInt(random(), 0, valueCount - i);
+        reader2.skip(skip);
+        i += skip;
+        assertEquals(i, reader2.ord());
+        if (i == valueCount) {
+          break;
+        } else {
+          assertEquals(values[i], reader2.next());
+          ++i;
+        }
+      }
+      assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+      try {
+        reader2.skip(1);
+        assertTrue(false);
+      } catch (IOException e) {
+        // OK
+      }
+
+      if (in instanceof IndexInput) {
+        ((IndexInput) in).close();
+      }
+      dir.close();
+    }
+  }
+
 }
@@ -81,5 +81,12 @@
       </links>
     </invoke-module-javadoc>
   </target>

+  <target name="run-encoding-benchmark" depends="compile-test">
+    <java classname="org.apache.lucene.util.encoding.EncodingSpeed" fork="true" failonerror="true">
+      <classpath refid="test.classpath" />
+      <classpath path="${build.dir}/classes/test" />
+    </java>
+  </target>
+
 </project>
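Assuming this target behaves like the others in the same build file, the encoding micro-benchmark can now be invoked by name from the module directory, e.g. with ant run-encoding-benchmark; the two nested classpath elements make both the test classpath and the compiled test classes visible to the forked JVM.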
@@ -39,7 +39,7 @@ import org.apache.lucene.store.Directory;
  *
  * @lucene.experimental
  */
-public class AssociationIndexer {
+public class CategoryAssociationsIndexer {

   /**
    * Create an index, and adds to it sample documents and categories.

@@ -75,13 +75,11 @@ public class AssociationIndexer {
         ++nFacetsAdded;
       }
       // and also those with associations
-      CategoryPath[] associationsPaths = AssociationUtils.categories[docNum];
-      CategoryAssociation[] associationsValues = AssociationUtils.associations[docNum];
+      CategoryPath[] associationsPaths = CategoryAssociationsUtils.categories[docNum];
+      CategoryAssociation[] associationsValues = CategoryAssociationsUtils.associations[docNum];
       for (int i = 0; i < associationsPaths.length; i++) {
         associations.setAssociation(associationsPaths[i], associationsValues[i]);
-        ExampleUtils.log("\t $$$$ Association: ("
-            + associationsPaths[i] + "," + associationsValues[i]
-            + ")");
+        ExampleUtils.log("\t $$$$ Association: (" + associationsPaths[i] + "," + associationsValues[i] + ")");
         ++nFacetsAdded;
       }
@@ -31,15 +31,15 @@ import org.apache.lucene.facet.search.results.FacetResult;
  *
  * @lucene.experimental
  */
-public class AssociationMain {
+public class CategoryAssociationsMain {

   /**
    * Driver for the simple sample.
    * @throws Exception on error (no detailed exception handling here for sample simplicity
    */
   public static void main(String[] args) throws Exception {
-    new AssociationMain().runSumIntAssociationSample();
-    new AssociationMain().runSumFloatAssociationSample();
+    new CategoryAssociationsMain().runSumIntAssociationSample();
+    new CategoryAssociationsMain().runSumFloatAssociationSample();
     ExampleUtils.log("DONE");
   }

@@ -51,10 +51,10 @@ public class AssociationMain {

     // index the sample documents
     ExampleUtils.log("index the sample documents...");
-    AssociationIndexer.index(indexDir, taxoDir);
+    CategoryAssociationsIndexer.index(indexDir, taxoDir);

     ExampleUtils.log("search the sample documents...");
-    List<FacetResult> facetRes = AssociationSearcher.searchSumIntAssociation(indexDir, taxoDir);
+    List<FacetResult> facetRes = CategoryAssociationsSearcher.searchSumIntAssociation(indexDir, taxoDir);

     ExampleResult res = new ExampleResult();
     res.setFacetResults(facetRes);

@@ -69,10 +69,10 @@ public class AssociationMain {

     // index the sample documents
     ExampleUtils.log("index the sample documents...");
-    AssociationIndexer.index(indexDir, taxoDir);
+    CategoryAssociationsIndexer.index(indexDir, taxoDir);

     ExampleUtils.log("search the sample documents...");
-    List<FacetResult> facetRes = AssociationSearcher.searchSumFloatAssociation(indexDir, taxoDir);
+    List<FacetResult> facetRes = CategoryAssociationsSearcher.searchSumFloatAssociation(indexDir, taxoDir);

     ExampleResult res = new ExampleResult();
     res.setFacetResults(facetRes);
@@ -37,18 +37,15 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
  *
  * @lucene.experimental
  */
-public class AssociationSearcher {
+public class CategoryAssociationsSearcher {

   /** Search an index with a sum of int-association. */
-  public static List<FacetResult> searchSumIntAssociation(Directory indexDir,
-      Directory taxoDir) throws Exception {
+  public static List<FacetResult> searchSumIntAssociation(Directory indexDir, Directory taxoDir) throws Exception {
     // prepare index reader
     IndexReader indexReader = DirectoryReader.open(indexDir);
     TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);

-    AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest(
-        new CategoryPath("tags"), 10);
+    AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest(new CategoryPath("tags"), 10);

     List<FacetResult> res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest);

     // close readers

@@ -59,14 +56,12 @@ public class AssociationSearcher {
   }

   /** Search an index with a sum of float-association. */
-  public static List<FacetResult> searchSumFloatAssociation(Directory indexDir,
-      Directory taxoDir) throws Exception {
+  public static List<FacetResult> searchSumFloatAssociation(Directory indexDir, Directory taxoDir) throws Exception {
     // prepare index reader
     IndexReader indexReader = DirectoryReader.open(indexDir);
     TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);

-    AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest(
-        new CategoryPath("genre"), 10);
+    AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest(new CategoryPath("genre"), 10);

     List<FacetResult> res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest);
@@ -25,7 +25,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
 /**
  * @lucene.experimental
  */
-public class AssociationUtils {
+public class CategoryAssociationsUtils {

   /**
    * Categories: categories[D][N] == category-path with association no. N for
@@ -1,11 +1,7 @@
 package org.apache.lucene.facet.example.simple;

-import java.util.ArrayList;
-import java.util.List;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

 import org.apache.lucene.facet.example.ExampleUtils;
 import org.apache.lucene.facet.taxonomy.CategoryPath;

@@ -1,92 +0,0 @@
-package org.apache.lucene.facet.associations;
-
-import java.io.IOException;
-import java.util.HashMap;
-
-import org.apache.lucene.facet.index.CategoryListBuilder;
-import org.apache.lucene.facet.index.params.CategoryListParams;
-import org.apache.lucene.facet.index.params.FacetIndexingParams;
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
-import org.apache.lucene.store.ByteArrayDataOutput;
-import org.apache.lucene.util.BytesRef;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * A {@link CategoryListBuilder} which encodes category-association value pairs
- * in addition to regular counting list category ordinals. Every
- * category-association pair is written under the respective association's
- * {@link CategoryAssociation#getCategoryListID()}.
- */
-public class AssociationsCategoryListBuilder extends CategoryListBuilder {
-
-  private final CategoryAssociationsContainer associations;
-  private final HashMap<String,BytesRef> perAssociationBytes = new HashMap<String,BytesRef>();
-  private final ByteArrayDataOutput output = new ByteArrayDataOutput();
-
-  public AssociationsCategoryListBuilder(CategoryAssociationsContainer associations,
-      CategoryListParams categoryListParams, FacetIndexingParams indexingParams, TaxonomyWriter taxoWriter) {
-    super(categoryListParams, indexingParams, taxoWriter);
-    this.associations = associations;
-  }
-
-  @Override
-  public void handle(int ordinal, CategoryPath cp) throws IOException {
-    super.handle(ordinal, cp);
-
-    // build per-association key BytesRef
-    CategoryAssociation association = associations.getAssociation(cp);
-    if (association == null) {
-      // it is ok to set a null association for a category - it's treated as a
-      // regular category in that case.
-      return;
-    }
-
-    BytesRef bytes = perAssociationBytes.get(association.getCategoryListID());
-    if (bytes == null) {
-      bytes = new BytesRef();
-      perAssociationBytes.put(association.getCategoryListID(), bytes);
-    }
-
-    int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded();
-    if (bytes.bytes.length - bytes.length < maxBytesNeeded) {
-      bytes.grow(bytes.bytes.length + maxBytesNeeded);
-    }
-
-    // reset the output to write from bytes.length (current position) until the end
-    output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
-    output.writeInt(ordinal);
-
-    // encode the association bytes
-    association.serialize(output);
-
-    // update BytesRef
-    bytes.length = output.getPosition();
-  }
-
-  @Override
-  public HashMap<String,BytesRef> finish() {
-    // build the ordinals list
-    HashMap<String,BytesRef> result = super.finish();
-    // add per association bytes
-    result.putAll(perAssociationBytes);
-    return result;
-  }
-
-}
@@ -7,7 +7,7 @@ import java.util.Map;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
-import org.apache.lucene.facet.index.CategoryListBuilder;
+import org.apache.lucene.facet.index.CountingListBuilder;
 import org.apache.lucene.facet.index.DrillDownStream;
 import org.apache.lucene.facet.index.FacetFields;
 import org.apache.lucene.facet.index.params.CategoryListParams;

@@ -15,6 +15,8 @@ import org.apache.lucene.facet.index.params.FacetIndexingParams;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -94,15 +96,16 @@ public class AssociationsFacetFields extends FacetFields {
     return categoryLists;
   }

-  /**
-   * Returns a {@link CategoryListBuilder} for encoding the given categories and
-   * associations.
-   */
   @Override
-  protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams,
-      Iterable<CategoryPath> categories) {
-    return new AssociationsCategoryListBuilder((CategoryAssociationsContainer) categories, categoryListParams,
-        indexingParams, taxonomyWriter);
+  protected Map<String,BytesRef> getCategoryListData(CategoryListParams categoryListParams, IntsRef ordinals,
+      Iterable<CategoryPath> categories) throws IOException {
+    AssociationsListBuilder associations = new AssociationsListBuilder((CategoryAssociationsContainer) categories);
+    CountingListBuilder counting = new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter);
+    // CountingListBuilder modifies the ordinals array, by e.g. adding parent ordinals, sorting etc.
+    // Therefore first build the associations list and only afterwards the counting list.
+    final Map<String,BytesRef> res = associations.build(ordinals, categories);
+    res.putAll(counting.build(ordinals, categories));
+    return res;
   }

   @Override
@@ -0,0 +1,89 @@
+package org.apache.lucene.facet.associations;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.facet.index.CategoryListBuilder;
+import org.apache.lucene.facet.index.CountingListBuilder;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link AssociationsListBuilder} which encodes category-association value pairs.
+ * Every category-association pair is written under the respective association's
+ * {@link CategoryAssociation#getCategoryListID()}.
+ * <p>
+ * <b>NOTE:</b> associations list do not encode the counting list data. You
+ * should use {@link CountingListBuilder} to build that information and then
+ * merge the results of both {@link #build(IntsRef, Iterable)}.
+ */
+public class AssociationsListBuilder implements CategoryListBuilder {
+
+  private final CategoryAssociationsContainer associations;
+  private final ByteArrayDataOutput output = new ByteArrayDataOutput();
+
+  public AssociationsListBuilder(CategoryAssociationsContainer associations) {
+    this.associations = associations;
+  }
+
+  @Override
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException {
+    final HashMap<String,BytesRef> res = new HashMap<String,BytesRef>();
+    int idx = 0;
+    for (CategoryPath cp : categories) {
+      // build per-association key BytesRef
+      CategoryAssociation association = associations.getAssociation(cp);
+
+      if (association == null) {
+        // it is ok to set a null association for a category - it's treated as a
+        // regular category in that case.
+        ++idx;
+        continue;
+      }
+
+      BytesRef bytes = res.get(association.getCategoryListID());
+      if (bytes == null) {
+        bytes = new BytesRef(32);
+        res.put(association.getCategoryListID(), bytes);
+      }
+
+      int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded() + bytes.length;
+      if (bytes.bytes.length < maxBytesNeeded) {
+        bytes.grow(maxBytesNeeded);
+      }
+
+      // reset the output to write from bytes.length (current position) until the end
+      output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
+      output.writeInt(ordinals.ints[idx++]);
+
+      // encode the association bytes
+      association.serialize(output);
+
+      // update BytesRef
+      bytes.length = output.getPosition();
+    }
+
+    return res;
+  }
+
+}
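For each category list ID, the BytesRef built above is a concatenation of (ordinal, association) pairs: a 4-byte int ordinal followed by the association's serialized bytes. A sketch of decoding one such payload, mirroring the loop in AssociationsPayloadIterator in the next hunk (the payload and association variables are assumed to be in scope):

    ByteArrayDataInput in = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
    while (!in.eof()) {
      int ordinal = in.readInt();    // category ordinal written via output.writeInt(...)
      association.deserialize(in);   // the association value for that ordinal
      // handle (ordinal, association) here
    }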
@@ -53,20 +53,22 @@ public abstract class AssociationsPayloadIterator<T extends CategoryAssociation>
   }

   /**
-   * Skip to the requested document. Returns true iff the document has categort
-   * association values and they were read successfully.
+   * Skip to the requested document. Returns true iff the document has category
+   * association values and they were read successfully. Associations are
+   * handled through {@link #handleAssociation(int, CategoryAssociation)} by
+   * extending classes.
    */
-  public boolean setNextDoc(int docId) throws IOException {
+  protected final boolean setNextDoc(int docID) throws IOException {
     if (!hasAssociations) { // there are no associations at all
       return false;
     }

-    if (!pi.setdoc(docId)) { // no associations for the requested document
+    BytesRef bytes = pi.getPayload(docID);
+    if (bytes == null) { // no associations for the requested document
       return false;
     }

-    BytesRef associations = pi.getPayload();
-    ByteArrayDataInput in = new ByteArrayDataInput(associations.bytes, associations.offset, associations.length);
+    ByteArrayDataInput in = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
     while (!in.eof()) {
       int ordinal = in.readInt();
       association.deserialize(in);
@@ -55,5 +55,10 @@ public class CategoryAssociationsContainer implements Iterable<CategoryPath> {
   public void clear() {
     categoryAssociations.clear();
   }

+  @Override
+  public String toString() {
+    return categoryAssociations.toString();
+  }
+
 }
@@ -71,5 +71,10 @@ public class CategoryFloatAssociation implements CategoryAssociation {
   public float getValue() {
     return value;
   }

+  @Override
+  public String toString() {
+    return getClass().getSimpleName() + "(" + value + ")";
+  }
+
 }
@@ -72,4 +72,9 @@ public class CategoryIntAssociation implements CategoryAssociation {
     return value;
   }

+  @Override
+  public String toString() {
+    return getClass().getSimpleName() + "(" + value + ")";
+  }
+
 }
@@ -40,23 +40,17 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato
   protected void handleAssociation(int ordinal, CategoryFloatAssociation association) {
     ordinalAssociations.put(ordinal, association.getValue());
   }
 
-  @Override
-  public boolean setNextDoc(int docId) throws IOException {
-    ordinalAssociations.clear();
-    return super.setNextDoc(docId);
-  }
-
   /**
-   * Get the float association value for the given ordinal, or
-   * {@link Float#NaN} in case the ordinal has no association value.
+   * Returns the float association values of the categories that are associated
+   * with the given document, or {@code null} if the document has no
+   * associations.
+   * <p>
+   * <b>NOTE:</b> you are not expected to modify the returned map.
    */
-  public float getAssociation(int ordinal) {
-    if (ordinalAssociations.containsKey(ordinal)) {
-      return ordinalAssociations.get(ordinal);
-    }
-
-    return Float.NaN;
+  public IntToFloatMap getAssociations(int docID) throws IOException {
+    ordinalAssociations.clear();
+    return setNextDoc(docID) ? ordinalAssociations : null;
   }
 
 }
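A hedged usage sketch of the reworked iterator. The constructor shape mirrors the IntAssociationsPayloadIterator constructor shown in the next hunk, and the field name "$assoc" is purely hypothetical; the point is that getAssociations(docID) replaces the old setNextDoc(docID)/getAssociation(ordinal) pair:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.collections.IntToFloatMap;

final class FloatAssociationsSketch {
  static void readOne(IndexReader reader, int docID, int ordinal) throws IOException {
    FloatAssociationsPayloadIterator fapi =
        new FloatAssociationsPayloadIterator(reader, "$assoc", new CategoryFloatAssociation());
    IntToFloatMap values = fapi.getAssociations(docID); // null => doc has no associations
    if (values != null && values.containsKey(ordinal)) {
      float assoc = values.get(ordinal); // replaces the old getAssociation()/Float.NaN protocol
    }
  }
}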
@@ -31,12 +31,6 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
 
   private final IntToIntMap ordinalAssociations = new IntToIntMap();
 
-  /**
-   * The long-special-value returned for ordinals which have no associated int
-   * value. It is not in the int range of values making it a valid mark.
-   */
-  public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1;
-
   public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association)
       throws IOException {
     super(reader, field, association);
@@ -47,22 +41,16 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
     ordinalAssociations.put(ordinal, association.getValue());
   }
 
-  @Override
-  public boolean setNextDoc(int docId) throws IOException {
-    ordinalAssociations.clear();
-    return super.setNextDoc(docId);
-  }
-
   /**
-   * Get the integer association value for the given ordinal, or
-   * {@link #NO_ASSOCIATION} in case the ordinal has no association value.
+   * Returns the integer association values of the categories that are
+   * associated with the given document, or {@code null} if the document has no
+   * associations.
+   * <p>
+   * <b>NOTE:</b> you are not expected to modify the returned map.
    */
-  public long getAssociation(int ordinal) {
-    if (ordinalAssociations.containsKey(ordinal)) {
-      return ordinalAssociations.get(ordinal);
-    }
-
-    return NO_ASSOCIATION;
+  public IntToIntMap getAssociations(int docID) throws IOException {
+    ordinalAssociations.clear();
+    return setNextDoc(docID) ? ordinalAssociations : null;
   }
 
 }
@@ -1,19 +1,11 @@
 package org.apache.lucene.facet.index;
 
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map.Entry;
+import java.util.Map;
 
-import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
-import org.apache.lucene.facet.index.params.CategoryListParams;
-import org.apache.lucene.facet.index.params.FacetIndexingParams;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
-import org.apache.lucene.facet.util.PartitionsUtils;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnsafeByteArrayOutputStream;
-import org.apache.lucene.util.encoding.IntEncoder;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -33,149 +25,14 @@ import org.apache.lucene.util.encoding.IntEncoder;
  */
 
 /**
- * Builds a category list by encoding the category ordinals into one or more
- * {@link BytesRef}. Each {@link BytesRef} corresponds to a set of ordinals that
- * belong to the same partition. When partitions are not enabled (i.e.
- * {@link FacetIndexingParams#getPartitionSize()} returns
- * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this
- * class.
+ * Builds a category list data by encoding the appropriate information for every
+ * category and ordinal given to {@link #build(IntsRef, Iterable)}.
+ *
+ * @lucene.experimental
  */
-public class CategoryListBuilder {
-
-  /** Specializes encoding ordinals when partitions are enabled/disabled. */
-  private static abstract class OrdinalsEncoder {
-    OrdinalsEncoder() {}
-    public abstract void encode(int ordinal);
-    public abstract HashMap<String,BytesRef> finish();
-  }
-
-  private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder {
-
-    private final IntEncoder encoder;
-    private final UnsafeByteArrayOutputStream ubaos;
-    private final String name;
-
-    NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) {
-      name = categoryListParams.getTerm().text();
-      encoder = categoryListParams.createEncoder();
-      ubaos = new UnsafeByteArrayOutputStream();
-      encoder.reInit(ubaos);
-    }
-
-    @Override
-    public void encode(int ordinal) {
-      try {
-        encoder.encode(ordinal);
-      } catch (IOException e) {
-        // shouldn't happen as we're writing to byte[]
-        throw new RuntimeException("unexpected exception", e);
-      }
-    }
-
-    @Override
-    public HashMap<String,BytesRef> finish() {
-      try {
-        encoder.close();
-      } catch (IOException e) {
-        // shouldn't happen as we're writing to byte[]
-        throw new RuntimeException("unexpected exception", e);
-      }
-      HashMap<String,BytesRef> result = new HashMap<String,BytesRef>();
-      result.put(name, new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length()));
-      return result;
-    }
-
-  }
-
-  private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder {
-
-    private final FacetIndexingParams indexingParams;
-    private final CategoryListParams categoryListParams;
-    private final int partitionSize;
-    private final HashMap<String,IntEncoder> partitionEncoder = new HashMap<String,IntEncoder>();
-    private final HashMap<String,UnsafeByteArrayOutputStream> partitionBytes = new HashMap<String,UnsafeByteArrayOutputStream>();
-
-    PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) {
-      this.indexingParams = indexingParams;
-      this.categoryListParams = categoryListParams;
-      this.partitionSize = indexingParams.getPartitionSize();
-    }
-
-    @Override
-    public void encode(int ordinal) {
-      final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal);
-      IntEncoder encoder = partitionEncoder.get(name);
-      if (encoder == null) {
-        encoder = categoryListParams.createEncoder();
-        final UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream();
-        encoder.reInit(ubaos);
-        partitionEncoder.put(name, encoder);
-        partitionBytes.put(name, ubaos);
-      }
-      try {
-        encoder.encode(ordinal % partitionSize);
-      } catch (IOException e) {
-        // shouldn't happen as we're writing to byte[]
-        throw new RuntimeException("unexpected exception", e);
-      }
-    }
-
-    @Override
-    public HashMap<String,BytesRef> finish() {
-      // finish encoding
-      IOUtils.closeWhileHandlingException(partitionEncoder.values());
-
-      HashMap<String,BytesRef> bytes = new HashMap<String,BytesRef>();
-      for (Entry<String,UnsafeByteArrayOutputStream> e : partitionBytes.entrySet()) {
-        UnsafeByteArrayOutputStream ubaos = e.getValue();
-        bytes.put(e.getKey(), new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length()));
-      }
-      return bytes;
-    }
-
-  }
-
-  private final TaxonomyWriter taxoWriter;
-  private final OrdinalsEncoder ordinalsEncoder;
-  private final OrdinalPolicy ordinalPolicy;
-
-  public CategoryListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams,
-      TaxonomyWriter taxoWriter) {
-    this.taxoWriter = taxoWriter;
-    this.ordinalPolicy = indexingParams.getOrdinalPolicy();
-    if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) {
-      ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams);
-    } else {
-      ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams);
-    }
-  }
-
-  /**
-   * Encodes the given ordinal as well as any of its parent ordinals (per
-   * {@link OrdinalPolicy}).
-   */
-  public void handle(int ordinal, CategoryPath cp) throws IOException {
-    ordinalsEncoder.encode(ordinal);
-
-    // add all parent ordinals, per OrdinalPolicy
-    int parent = taxoWriter.getParent(ordinal);
-    while (parent > 0) {
-      if (ordinalPolicy.shouldAdd(parent)) {
-        ordinalsEncoder.encode(parent);
-      }
-      parent = taxoWriter.getParent(parent);
-    }
-  }
-
-  /**
-   * Returns the encoded ordinals data. Every returned {@link BytesRef}
-   * corresponds to a single partition (as defined by
-   * {@link FacetIndexingParams#getPartitionSize()}) and the key denotes the
-   * partition ID. When no partitions are defined, the returned map includes
-   * only one value.
-   */
-  public HashMap<String,BytesRef> finish() {
-    return ordinalsEncoder.finish();
-  }
-
+public interface CategoryListBuilder {
+
+  /** Returns the encoded ordinals data. */
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException;
+
 }
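To make the new interface contract concrete, here is a hedged sketch of the simplest possible CategoryListBuilder: a single list, no partitions and no parent expansion, bulk-encoded with the IntEncoder API this change introduces. The class and the key name "$mylist" are hypothetical; only build()'s signature and the bulk encode(IntsRef, BytesRef) call come from the diff:

import java.io.IOException;
import java.util.Collections;
import java.util.Map;

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntEncoder;

final class SingleListBuilderSketch implements CategoryListBuilder {
  private final IntEncoder encoder; // e.g. obtained from CategoryListParams.createEncoder()

  SingleListBuilderSketch(IntEncoder encoder) {
    this.encoder = encoder;
  }

  @Override
  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException {
    BytesRef bytes = new BytesRef(128);
    encoder.encode(ordinals, bytes); // bulk-encode all ordinals in one call
    return Collections.singletonMap("$mylist", bytes);
  }
}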
@@ -0,0 +1,160 @@
+package org.apache.lucene.facet.index;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.index.params.FacetIndexingParams;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.util.PartitionsUtils;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.encoding.IntEncoder;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link CategoryListBuilder} which builds a counting list data by encoding
+ * the category ordinals into one or more {@link BytesRef}. Each
+ * {@link BytesRef} corresponds to a set of ordinals that belong to the same
+ * partition. When partitions are not enabled (i.e.
+ * {@link FacetIndexingParams#getPartitionSize()} returns
+ * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this
+ * class.
+ * <p>
+ * Counting lists are used usually for computing the weight of categories by
+ * summing their number of occurrences (hence counting) in a result set.
+ */
+public class CountingListBuilder implements CategoryListBuilder {
+
+  /** Specializes encoding ordinals when partitions are enabled/disabled. */
+  private static abstract class OrdinalsEncoder {
+    OrdinalsEncoder() {}
+    public abstract Map<String,BytesRef> encode(IntsRef ordinals);
+  }
+
+  private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder {
+
+    private final IntEncoder encoder;
+    private final String name;
+
+    NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) {
+      name = categoryListParams.getTerm().text();
+      encoder = categoryListParams.createEncoder();
+    }
+
+    @Override
+    public Map<String,BytesRef> encode(IntsRef ordinals) {
+      final BytesRef bytes = new BytesRef(128); // should be enough for most common applications
+      encoder.encode(ordinals, bytes);
+      return Collections.singletonMap(name, bytes);
+    }
+
+  }
+
+  private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder {
+
+    private final FacetIndexingParams indexingParams;
+    private final CategoryListParams categoryListParams;
+    private final int partitionSize;
+    private final HashMap<String,IntEncoder> partitionEncoder = new HashMap<String,IntEncoder>();
+
+    PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) {
+      this.indexingParams = indexingParams;
+      this.categoryListParams = categoryListParams;
+      this.partitionSize = indexingParams.getPartitionSize();
+    }
+
+    @Override
+    public HashMap<String,BytesRef> encode(IntsRef ordinals) {
+      // build the partitionOrdinals map
+      final HashMap<String,IntsRef> partitionOrdinals = new HashMap<String,IntsRef>();
+      for (int i = 0; i < ordinals.length; i++) {
+        int ordinal = ordinals.ints[i];
+        final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal);
+        IntsRef partitionOrds = partitionOrdinals.get(name);
+        if (partitionOrds == null) {
+          partitionOrds = new IntsRef(32);
+          partitionOrdinals.put(name, partitionOrds);
+          partitionEncoder.put(name, categoryListParams.createEncoder());
+        }
+        partitionOrds.ints[partitionOrds.length++] = ordinal % partitionSize;
+      }
+
+      HashMap<String,BytesRef> partitionBytes = new HashMap<String,BytesRef>();
+      for (Entry<String,IntsRef> e : partitionOrdinals.entrySet()) {
+        String name = e.getKey();
+        final IntEncoder encoder = partitionEncoder.get(name);
+        final BytesRef bytes = new BytesRef(128); // should be enough for most common applications
+        encoder.encode(e.getValue(), bytes);
+        partitionBytes.put(name, bytes);
+      }
+      return partitionBytes;
+    }
+
+  }
+
+  private final OrdinalsEncoder ordinalsEncoder;
+  private final TaxonomyWriter taxoWriter;
+  private final OrdinalPolicy ordinalPolicy;
+
+  public CountingListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams,
+      TaxonomyWriter taxoWriter) {
+    this.taxoWriter = taxoWriter;
+    this.ordinalPolicy = indexingParams.getOrdinalPolicy();
+    if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) {
+      ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams);
+    } else {
+      ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams);
+    }
+  }
+
+  /**
+   * Every returned {@link BytesRef} corresponds to a single partition (as
+   * defined by {@link FacetIndexingParams#getPartitionSize()}) and the key
+   * denotes the partition ID. When no partitions are defined, the returned map
+   * contains only one value.
+   * <p>
+   * <b>NOTE:</b> the {@code ordinals} array is modified by adding parent
+   * ordinals to it. Also, some encoders may sort the array and remove duplicate
+   * ordinals. Therefore you may want to invoke this method after you finished
+   * processing the array for other purposes.
+   */
+  @Override
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException {
+    int upto = ordinals.length; // since we add ordinals to IntsRef, iterate upto original length
+
+    for (int i = 0; i < upto; i++) {
+      int ordinal = ordinals.ints[i];
+      int parent = taxoWriter.getParent(ordinal);
+      while (parent > 0) {
+        if (ordinalPolicy.shouldAdd(parent)) {
+          ordinals.ints[ordinals.length++] = parent;
+        }
+        parent = taxoWriter.getParent(parent);
+      }
+    }
+    return ordinalsEncoder.encode(ordinals);
+  }
+
+}
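A hedged usage sketch of the new builder, mirroring what FacetFields does in the hunks that follow (the wrapper class and variable names are assumed; build() both appends parent ordinals and encodes, per the javadoc above):

import java.io.IOException;
import java.util.Map;

import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

final class CountingListSketch {
  static Map<String,BytesRef> encode(CategoryListParams clp, FacetIndexingParams iparams,
      TaxonomyWriter taxoWriter, Iterable<CategoryPath> categories) throws IOException {
    IntsRef ordinals = new IntsRef(32);
    for (CategoryPath cp : categories) {
      int ordinal = taxoWriter.addCategory(cp);   // also registers the parents in the taxonomy
      ordinals.grow(ordinals.length + cp.length); // room for the ordinal plus its parents
      ordinals.ints[ordinals.length++] = ordinal;
    }
    // NOTE: build() modifies 'ordinals' by appending parent ordinals.
    return new CountingListBuilder(clp, iparams, taxoWriter).build(ordinals, categories);
  }
}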
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.index;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -21,6 +22,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -69,7 +71,7 @@ public class FacetFields {
       return true;
     }
 
-    void setCategoriesData(HashMap<String,BytesRef> categoriesData) {
+    void setCategoriesData(Map<String,BytesRef> categoriesData) {
       this.categoriesData = categoriesData.entrySet().iterator();
     }
 
@@ -132,6 +134,9 @@ public class FacetFields {
    */
   protected Map<CategoryListParams,Iterable<CategoryPath>> createCategoryListMapping(
       Iterable<CategoryPath> categories) {
+    if (indexingParams.getAllCategoryListParams().size() == 1) {
+      return Collections.singletonMap(indexingParams.getCategoryListParams(null), categories);
+    }
     HashMap<CategoryListParams,Iterable<CategoryPath>> categoryLists =
         new HashMap<CategoryListParams,Iterable<CategoryPath>>();
     for (CategoryPath cp : categories) {
@@ -147,10 +152,15 @@ public class FacetFields {
     return categoryLists;
   }
 
-  /** Returns a {@link CategoryListBuilder} for encoding the given categories. */
-  protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams,
-      Iterable<CategoryPath> categories /* needed for AssociationsFacetFields */) {
-    return new CategoryListBuilder(categoryListParams, indexingParams, taxonomyWriter);
+  /**
+   * Returns the category list data, as a mapping from key to {@link BytesRef}
+   * which includes the encoded data. Every ordinal in {@code ordinals}
+   * corrspond to a {@link CategoryPath} returned from {@code categories}.
+   */
+  protected Map<String,BytesRef> getCategoryListData(CategoryListParams categoryListParams,
+      IntsRef ordinals, Iterable<CategoryPath> categories /* needed for AssociationsFacetFields */)
+      throws IOException {
+    return new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter).build(ordinals, categories);
   }
 
   /**
@@ -185,17 +195,25 @@ public class FacetFields {
 
     // for each CLP we add a different field for drill-down terms as well as for
     // counting list data.
+    IntsRef ordinals = new IntsRef(32); // should be enough for most common applications
     for (Entry<CategoryListParams, Iterable<CategoryPath>> e : categoryLists.entrySet()) {
       final CategoryListParams clp = e.getKey();
       final String field = clp.getTerm().field();
 
-      // add the counting list data
-      CategoryListBuilder categoriesPayloadBuilder = getCategoryListBuilder(clp, e.getValue());
+      // build category list data
+      ordinals.length = 0; // reset
+      int maxNumOrds = 0;
       for (CategoryPath cp : e.getValue()) {
         int ordinal = taxonomyWriter.addCategory(cp);
-        categoriesPayloadBuilder.handle(ordinal , cp);
+        maxNumOrds += cp.length; // ordinal and potentially all parents
+        if (ordinals.ints.length < maxNumOrds) {
+          ordinals.grow(maxNumOrds);
+        }
+        ordinals.ints[ordinals.length++] = ordinal;
       }
-      HashMap<String,BytesRef> categoriesData = categoriesPayloadBuilder.finish();
+      Map<String,BytesRef> categoriesData = getCategoryListData(clp, ordinals, e.getValue());
 
+      // add the counting list data
       CountingListStream ts = new CountingListStream();
       ts.setCategoriesData(categoriesData);
       doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE));
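A short worked example of the maxNumOrds accounting above (hedged; the varargs CategoryPath constructor form is an assumption): a path of length k has k - 1 ancestors, so at most k ordinals can be appended per category, which is exactly what cp.length reserves.

// For cp = a/b/c, cp.length == 3. The worst case contributes three ordinals:
// ordinal("a/b/c") in the loop above, plus ordinal("a/b") and ordinal("a")
// later, when CountingListBuilder.build() walks the parents -- 3 == cp.length.
CategoryPath cp = new CategoryPath("a", "b", "c"); // constructor form assumed
int reserved = cp.length; // 3 slots: the category itself and up to 2 parents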
@@ -17,10 +17,7 @@ package org.apache.lucene.facet.index;
  * limitations under the License.
  */
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -36,6 +33,7 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.encoding.IntDecoder;
 import org.apache.lucene.util.encoding.IntEncoder;
 
@@ -187,7 +185,7 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader {
   private class OrdinalMappingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
     private final IntEncoder encoder;
     private final IntDecoder decoder;
-    private final ByteArrayOutputStream os = new ByteArrayOutputStream();
+    private final IntsRef ordinals = new IntsRef(32);
     private final BytesRef payloadOut = new BytesRef();
 
     public OrdinalMappingDocsAndPositionsEnum(DocsAndPositionsEnum in, CategoryListParams params) {
@@ -202,21 +200,14 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader {
       if (payload == null) {
         return payload;
       } else {
-        InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length);
-        decoder.reInit(is);
-        os.reset();
-        encoder.reInit(os);
-        long ordinal;
-        while ((ordinal = decoder.decode()) != IntDecoder.EOS) {
-          int newOrdinal = ordinalMap[(int)ordinal];
-          encoder.encode(newOrdinal);
+        decoder.decode(payload, ordinals);
+
+        // map the ordinals
+        for (int i = 0; i < ordinals.length; i++) {
+          ordinals.ints[i] = ordinalMap[ordinals.ints[i]];
         }
-        encoder.close();
-        // TODO (Facet): avoid copy?
-        byte out[] = os.toByteArray();
-        payloadOut.bytes = out;
-        payloadOut.offset = 0;
-        payloadOut.length = out.length;
+
+        encoder.encode(ordinals, payloadOut);
         return payloadOut;
       }
     }
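The hunk above compresses to a single decode-map-encode pattern. A hedged standalone sketch; 'decoder' and 'encoder' are any matching IntDecoder/IntEncoder pair (as created via createMatchingDecoder() elsewhere in this change), and 'ordinalMap' maps old ordinals to merged ones:

import java.io.IOException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;

final class RemapSketch {
  static BytesRef remap(IntDecoder decoder, IntEncoder encoder, int[] ordinalMap,
      BytesRef payload) throws IOException {
    IntsRef ordinals = new IntsRef(32);
    decoder.decode(payload, ordinals);                 // payload bytes -> ordinals
    for (int i = 0; i < ordinals.length; i++) {
      ordinals.ints[i] = ordinalMap[ordinals.ints[i]]; // rewrite each ordinal in place
    }
    BytesRef out = new BytesRef();
    encoder.encode(ordinals, out);                     // ordinals -> payload bytes
    return out;
  }
}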
@@ -7,7 +7,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 
 import org.apache.lucene.facet.search.CategoryListIterator;
-import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
+import org.apache.lucene.facet.search.PayloadCategoryListIteraor;
 import org.apache.lucene.facet.search.TotalFacetCounts;
 import org.apache.lucene.facet.util.PartitionsUtils;
 import org.apache.lucene.util.encoding.DGapIntEncoder;
@@ -142,7 +142,7 @@ public class CategoryListParams implements Serializable {
       int partition) throws IOException {
     String categoryListTermStr = PartitionsUtils.partitionName(this, partition);
     Term payloadTerm = new Term(term.field(), categoryListTermStr);
-    return new PayloadIntDecodingIterator(reader, payloadTerm,
+    return new PayloadCategoryListIteraor(reader, payloadTerm,
         createEncoder().createMatchingDecoder());
   }
 
@@ -83,18 +83,9 @@ public class FacetIndexingParams {
   }
 
   /**
-   * The name of the category list to put this category in, or {@code null} if
-   * this category should not be aggregatable.
-   * <p>
-   * By default, all categories are written to the same category list, but
-   * applications which know in advance that in some situations only parts of
-   * the category hierarchy needs to be counted can divide the categories into
-   * two or more different category lists.
-   * <p>
-   * If {@code null} is returned for a category, it means that this category
-   * should not appear in any category list, and thus weights for it cannot be
-   * aggregated. This category can still be used for drill-down, even though the
-   * its weight is unknown.
+   * Returns the {@link CategoryListParams} for this {@link CategoryPath}. The
+   * default implementation returns the same {@link CategoryListParams} for all
+   * categories (even if {@code category} is {@code null}).
    *
    * @see PerDimensionIndexingParams
    */
@@ -78,7 +78,9 @@ public class PerDimensionIndexingParams extends FacetIndexingParams {
 
   /**
    * Returns the {@link CategoryListParams} for the corresponding dimension
-   * which is returned by {@code category.getComponent(0)}.
+   * which is returned by {@code category.getComponent(0)}. If {@code category}
+   * is {@code null}, or was not specified in the map given to the constructor,
+   * returns the default {@link CategoryListParams}.
    */
   @Override
   public CategoryListParams getCategoryListParams(CategoryPath category) {
@@ -2,6 +2,8 @@ package org.apache.lucene.facet.search;
 
 import java.io.IOException;
 
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -20,20 +22,10 @@ import java.io.IOException;
  */
 
 /**
- * An interface for iterating over a "category list", i.e., the list of
- * categories per document.
+ * An interface for obtaining the category ordinals of documents.
  * <p>
- * <b>NOTE:</b>
- * <ul>
- * <li>This class operates as a key to a Map. Appropriate implementation of
- * <code>hashCode()</code> and <code>equals()</code> must be provided.
- * <li>{@link #init()} must be called before you consume any categories, or call
- * {@link #skipTo(int)}.
- * <li>{@link #skipTo(int)} must be called before any calls to
- * {@link #nextCategory()}.
- * <li>{@link #nextCategory()} returns values < {@link Integer#MAX_VALUE}, so
- * you can use it as a stop condition.
- * </ul>
+ * <b>NOTE:</b> this class operates as a key to a map, and therefore you should
+ * implement {@code equals()} and {@code hashCode()} for proper behavior.
  *
  * @lucene.experimental
  */
@@ -41,29 +33,20 @@ public interface CategoryListIterator {
 
   /**
    * Initializes the iterator. This method must be called before any calls to
-   * {@link #skipTo(int)}, and its return value indicates whether there are
-   * any relevant documents for this iterator. If it returns false, any call
-   * to {@link #skipTo(int)} will return false as well.<br>
-   * <b>NOTE:</b> calling this method twice may result in skipping over
-   * documents for some implementations. Also, calling it again after all
-   * documents were consumed may yield unexpected behavior.
+   * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are
+   * any relevant documents for this iterator.
    */
   public boolean init() throws IOException;
 
   /**
-   * Skips forward to document docId. Returns true iff this document exists
-   * and has any categories. This method must be called before calling
-   * {@link #nextCategory()} for a particular document.<br>
-   * <b>NOTE:</b> Users should call this method with increasing docIds, and
-   * implementations can assume that this is the case.
+   * Stores the category ordinals of the given document ID in the given
+   * {@link IntsRef}, starting at position 0 upto {@link IntsRef#length}. Grows
+   * the {@link IntsRef} if it is not large enough.
+   *
+   * <p>
+   * <b>NOTE:</b> if the requested document does not category ordinals
+   * associated with it, {@link IntsRef#length} is set to zero.
    */
-  public boolean skipTo(int docId) throws IOException;
-
-  /**
-   * Returns the next category for the current document that is set through
-   * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}.
-   * No assumptions can be made on the order of the categories.
-   */
-  public long nextCategory() throws IOException;
+  public void getOrdinals(int docID, IntsRef ints) throws IOException;
 
 }
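Since the old javadoc usage example was removed along with the API, here is a hedged replacement sketch under the new contract ('cli' is any CategoryListIterator, 'disi' any DocIdSetIterator over the matching documents; both are assumed to be obtained elsewhere):

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.IntsRef;

final class CategoryListIteratorSketch {
  static void consume(CategoryListIterator cli, DocIdSetIterator disi) throws IOException {
    if (!cli.init()) {
      return; // no relevant documents for this iterator
    }
    IntsRef ordinals = new IntsRef(32);
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      cli.getOrdinals(doc, ordinals); // fills positions [0, ordinals.length); 0 => no categories
      for (int i = 0; i < ordinals.length; i++) {
        int ordinal = ordinals.ints[i];
        // ... aggregate 'ordinal' for 'doc'
      }
    }
  }
}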
@@ -5,7 +5,7 @@ import java.io.IOException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnsafeByteArrayInputStream;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.encoding.IntDecoder;
 
 /*
@@ -26,44 +26,21 @@ import org.apache.lucene.util.encoding.IntDecoder;
  */
 
 /**
- * A payload deserializer comes with its own working space (buffer). One need to
- * define the {@link IndexReader} and {@link Term} in which the payload resides.
- * The iterator then consumes the payload information of each document and
- * decodes it into categories. A typical use case of this class is:
- *
- * <pre class="prettyprint">
- * IndexReader reader = [open your reader];
- * Term t = new Term("field", "where-payload-exists");
- * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
- * if (!cli.init()) {
- *   // it means there are no payloads / documents associated with that term.
- *   // Usually a sanity check. However, init() must be called.
- * }
- * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
- * int doc;
- * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- *   cli.setdoc(doc);
- *   long category;
- *   while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
- *   }
- * }
- * </pre>
+ * A {@link CategoryListIterator} which reads the category ordinals from a
+ * payload.
  *
  * @lucene.experimental
  */
-public class PayloadIntDecodingIterator implements CategoryListIterator {
+public class PayloadCategoryListIteraor implements CategoryListIterator {
 
-  private final UnsafeByteArrayInputStream ubais;
   private final IntDecoder decoder;
 
   private final IndexReader indexReader;
   private final Term term;
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+  public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
     pi = new PayloadIterator(indexReader, term);
-    ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
     this.term = term;
@@ -72,10 +49,10 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
 
   @Override
   public boolean equals(Object other) {
-    if (!(other instanceof PayloadIntDecodingIterator)) {
+    if (!(other instanceof PayloadCategoryListIteraor)) {
       return false;
     }
-    PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other;
+    PayloadCategoryListIteraor that = (PayloadCategoryListIteraor) other;
     if (hashCode != that.hashCode) {
       return false;
     }
@@ -95,21 +72,12 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   }
 
   @Override
-  public long nextCategory() throws IOException {
-    return decoder.decode();
-  }
-
-  @Override
-  public boolean skipTo(int docId) throws IOException {
-    if (!pi.setdoc(docId)) {
-      return false;
+  public void getOrdinals(int docID, IntsRef ints) throws IOException {
+    ints.length = 0;
+    BytesRef payload = pi.getPayload(docID);
+    if (payload != null) {
+      decoder.decode(payload, ints);
     }
-
-    // Initializing the decoding mechanism with the new payload data
-    BytesRef data = pi.getPayload();
-    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
-    decoder.reInit(ubais);
-    return true;
   }
 
 }
@@ -34,9 +34,9 @@ import org.apache.lucene.util.BytesRef;
  * A utility class for iterating through a posting list of a given term and
  * retrieving the payload of the first position in every document. For
  * efficiency, this class does not check if documents passed to
- * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * {@link #getPayload(int)} are deleted, since it is usually used to iterate on
  * payloads of documents that matched a query. If you need to skip over deleted
- * documents, you should do so before calling {@link #setdoc(int)}.
+ * documents, you should do so before calling {@link #getPayload(int)}.
  *
  * @lucene.experimental
  */
@@ -84,8 +84,8 @@ public class PayloadIterator {
 
   /**
    * Initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
-   * or the category list has no documents.
+   * {@link #getPayload(int)}. Returns {@code false} if no category list is
+   * found, or the category list has no documents.
    */
   public boolean init() throws IOException {
     nextSegment();
@@ -93,30 +93,29 @@ public class PayloadIterator {
   }
 
   /**
-   * Skip forward to document docId. Return true if this document exists and
-   * has any payload.
-   * <P>
-   * Users should call this method with increasing docIds, and implementations
-   * can assume that this is the case.
+   * Returns the {@link BytesRef payload} of the given document, or {@code null}
+   * if the document does not exist, there are no more documents in the posting
+   * list, or the document exists but has not payload. You should call
+   * {@link #init()} before the first call to this method.
    */
-  public boolean setdoc(int docId) throws IOException {
+  public BytesRef getPayload(int docID) throws IOException {
     if (!hasMore) {
-      return false;
+      return null;
     }
 
     // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
-    int localDocID = docId - curDocBase;
+    int localDocID = docID - curDocBase;
 
     if (curDocID > localDocID) {
       // document does not exist
-      return false;
+      return null;
     }
 
     if (curDocID < localDocID) {
       // look for the document either in that segment, or others
       while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
         nextSegment(); // also updates curDocID
-        localDocID = docId - curDocBase;
+        localDocID = docID - curDocBase;
         // nextSegment advances to nextDoc, so check if we still need to advance
         if (curDocID >= localDocID) {
           break;
@@ -127,7 +126,7 @@ public class PayloadIterator {
       // 1. we iterated over all segments (hasMore=false)
       // 2. current segment advanced to a doc, either requested or higher
       if (!hasMore || curDocID != localDocID) {
-        return false;
+        return null;
       }
     }
 
@@ -135,12 +134,7 @@ public class PayloadIterator {
     assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
     int pos = currentDPE.nextPosition();
     assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
-    data = currentDPE.getPayload();
-    return data != null;
-  }
-
-  public BytesRef getPayload() {
-    return data;
+    return currentDPE.getPayload();
   }
 
 }
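A hedged sketch of the new single-call protocol. Note that the implementation above still only advances forward through the posting list, so callers should pass non-decreasing docIDs:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;

final class PayloadIteratorSketch {
  static void read(IndexReader reader, Term term, int docID) throws IOException {
    PayloadIterator pi = new PayloadIterator(reader, term);
    if (!pi.init()) {
      return; // no category list, or the category list has no documents
    }
    BytesRef payload = pi.getPayload(docID); // null: missing doc, exhausted, or no payload
    if (payload != null) {
      // decode payload.bytes[payload.offset .. payload.offset + payload.length)
    }
  }
}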
@@ -10,6 +10,7 @@ import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
 
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.facet.search.params.FacetSearchParams;
@@ -231,9 +232,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
           facetArrays.free(); // to get a cleared array for this partition
         }
 
-        HashMap<CategoryListIterator, Aggregator> categoryLists = getCategoryListMap(
-            facetArrays, partition);
+        HashMap<CategoryListIterator, Aggregator> categoryLists = getCategoryListMap(facetArrays, partition);
 
+        IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps
         for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) {
          CategoryListIterator categoryList = entry.getKey();
          if (!categoryList.init()) {
@@ -244,14 +245,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
           ScoredDocIDsIterator iterator = docids.iterator();
           while (iterator.next()) {
             int docID = iterator.getDocID();
-            if (!categoryList.skipTo(docID)) {
+            categoryList.getOrdinals(docID, ordinals);
+            if (ordinals.length == 0) {
               continue;
             }
-            categorator.setNextDoc(docID, iterator.getScore());
-            long ordinal;
-            while ((ordinal = categoryList.nextCategory()) <= Integer.MAX_VALUE) {
-              categorator.aggregate((int) ordinal);
-            }
+            categorator.aggregate(docID, iterator.getScore(), ordinals);
           }
         }
       }
@@ -2,6 +2,8 @@ package org.apache.lucene.facet.search.aggregator;
 
 import java.io.IOException;
 
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -36,16 +38,9 @@ import java.io.IOException;
 public interface Aggregator {
 
   /**
-   * Specify the document (and its score in the search) that the following
-   * {@link #aggregate(int)} calls will pertain to.
+   * Aggregate the ordinals of the given document ID (and its score). The given
+   * ordinals offset is always zero.
    */
-  void setNextDoc(int docid, float score) throws IOException;
-
-  /**
-   * Collect (and do whatever an implementation deems appropriate) the
-   * category given by its ordinal. This category belongs to a document
-   * given earlier by {@link #setNextDoc(int, float)}.
-   */
-  void aggregate(int ordinal);
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException;
 
 }
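A minimal, hypothetical Aggregator under the new bulk contract, to show how the old setNextDoc() + aggregate(ordinal) pair folds into a single callback (this class is illustrative only; the concrete implementations follow in the next hunks):

import java.io.IOException;

import org.apache.lucene.util.IntsRef;

final class MaxOrdinalAggregator implements Aggregator {
  private int maxOrdinal = -1;

  @Override
  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
    // per the javadoc above, the ordinals offset is always zero
    for (int i = 0; i < ordinals.length; i++) {
      maxOrdinal = Math.max(maxOrdinal, ordinals.ints[i]);
    }
  }
}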
@@ -1,5 +1,9 @@
 package org.apache.lucene.facet.search.aggregator;
 
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,9 +33,12 @@ public class ComplementCountingAggregator extends CountingAggregator {
   }
 
   @Override
-  public void aggregate(int ordinal) {
-    assert counterArray[ordinal]!=0:"complement aggregation: count is about to become negative for ordinal "+ordinal;
-    --counterArray[ordinal];
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+    for (int i = 0; i < ordinals.length; i++) {
+      int ord = ordinals.ints[i];
+      assert counterArray[ord] != 0 : "complement aggregation: count is about to become negative for ordinal " + ord;
+      --counterArray[ord];
+    }
   }
 
 }
@@ -1,5 +1,9 @@
 package org.apache.lucene.facet.search.aggregator;
 
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -27,21 +31,17 @@ package org.apache.lucene.facet.search.aggregator;
 public class CountingAggregator implements Aggregator {
 
   protected int[] counterArray;
 
-  @Override
-  public void aggregate(int ordinal) {
-    ++counterArray[ordinal];
-  }
-
-  @Override
-  public void setNextDoc(int docid, float score) {
-    // There's nothing for us to do here since we only increment the count by 1
-    // in this aggregator.
-  }
-
   public CountingAggregator(int[] counterArray) {
     this.counterArray = counterArray;
   }
 
+  @Override
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+    for (int i = 0; i < ordinals.length; i++) {
+      counterArray[ordinals.ints[i]]++;
+    }
+  }
+
   @Override
   public boolean equals(Object obj) {
@@ -54,8 +54,7 @@ public class CountingAggregator implements Aggregator {
 
   @Override
   public int hashCode() {
-    int hashCode = counterArray == null ? 0 : counterArray.hashCode();
-
-    return hashCode;
+    return counterArray == null ? 0 : counterArray.hashCode();
   }
 
 }
@@ -1,5 +1,9 @@
 package org.apache.lucene.facet.search.aggregator;
 
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -26,7 +30,6 @@ package org.apache.lucene.facet.search.aggregator;
 public class ScoringAggregator implements Aggregator {
 
   private final float[] scoreArray;
-  private float score;
   private final int hashCode;
 
   public ScoringAggregator(float[] counterArray) {
@@ -35,10 +38,12 @@ public class ScoringAggregator implements Aggregator {
   }
 
   @Override
-  public void aggregate(int ordinal) {
-    scoreArray[ordinal] += score;
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+    for (int i = 0; i < ordinals.length; i++) {
+      scoreArray[ordinals.ints[i]] += score;
+    }
   }
 
   @Override
   public boolean equals(Object obj) {
     if (obj == null || obj.getClass() != this.getClass()) {
@@ -53,8 +58,4 @@ public class ScoringAggregator implements Aggregator {
     return hashCode;
   }
 
-  @Override
-  public void setNextDoc(int docid, float score) {
-    this.score = score;
-  }
 }
@@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator;
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.collections.IntToFloatMap;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,13 +50,18 @@ public class AssociationFloatSumAggregator implements Aggregator {
   }
 
   @Override
-  public void aggregate(int ordinal) {
-    float association = associations.getAssociation(ordinal);
-    if (!Float.isNaN(association)) {
-      sumArray[ordinal] += association;
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+    IntToFloatMap values = associations.getAssociations(docID);
+    if (values != null) {
+      for (int i = 0; i < ordinals.length; i++) {
+        int ord = ordinals.ints[i];
+        if (values.containsKey(ord)) {
+          sumArray[ord] += values.get(ord);
+        }
+      }
     }
   }
 
   @Override
   public boolean equals(Object obj) {
     if (obj == null || obj.getClass() != this.getClass()) {
@@ -69,9 +76,4 @@ public class AssociationFloatSumAggregator implements Aggregator {
     return field.hashCode();
   }
 
-  @Override
-  public void setNextDoc(int docid, float score) throws IOException {
-    associations.setNextDoc(docid);
-  }
-
 }
@@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator;
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.collections.IntToIntMap;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,13 +50,18 @@ public class AssociationIntSumAggregator implements Aggregator {
   }
 
   @Override
-  public void aggregate(int ordinal) {
-    long association = associations.getAssociation(ordinal);
-    if (association != IntAssociationsPayloadIterator.NO_ASSOCIATION) {
-      sumArray[ordinal] += association;
+  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+    IntToIntMap values = associations.getAssociations(docID);
+    if (values != null) {
+      for (int i = 0; i < ordinals.length; i++) {
+        int ord = ordinals.ints[i];
+        if (values.containsKey(ord)) {
+          sumArray[ord] += values.get(ord);
+        }
+      }
     }
   }
 
   @Override
   public boolean equals(Object obj) {
     if (obj == null || obj.getClass() != this.getClass()) {
@@ -69,9 +76,4 @@ public class AssociationIntSumAggregator implements Aggregator {
     return field.hashCode();
   }
 
-  @Override
-  public void setNextDoc(int docid, float score) throws IOException {
-    associations.setNextDoc(docid);
-  }
-
 }
@@ -2,13 +2,12 @@ package org.apache.lucene.facet.search.cache;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.index.params.FacetIndexingParams;
 import org.apache.lucene.facet.search.CategoryListIterator;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.util.collections.IntArray;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -56,33 +55,26 @@ public class CategoryListData {
   protected CategoryListData() {
   }
 
-  /**
-   * Compute category list data for caching for faster iteration.
-   */
+  /** Compute category list data for caching for faster iteration. */
   CategoryListData(IndexReader reader, TaxonomyReader taxo,
       FacetIndexingParams iparams, CategoryListParams clp) throws IOException {
 
     final int maxDoc = reader.maxDoc();
     int[][][]dpf = new int[maxDoc][][];
     int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize());
-    IntArray docCategories = new IntArray();
-    for (int part=0; part<numPartitions; part++) {
+    IntsRef ordinals = new IntsRef(32);
+    for (int part = 0; part < numPartitions; part++) {
       CategoryListIterator cli = clp.createCategoryListIterator(reader, part);
       if (cli.init()) {
-        for (int doc=0; doc<maxDoc; doc++) {
-          if (cli.skipTo(doc)) {
-            docCategories.clear(false);
-            if (dpf[doc]==null) {
+        for (int doc = 0; doc < maxDoc; doc++) {
+          cli.getOrdinals(doc, ordinals);
+          if (ordinals.length > 0) {
+            if (dpf[doc] == null) {
               dpf[doc] = new int[numPartitions][];
             }
-            long category;
-            while ((category = cli.nextCategory()) <= Integer.MAX_VALUE) {
-              docCategories.addToArray((int)category);
-            }
-            final int size = docCategories.size();
-            dpf[doc][part] = new int[size];
-            for (int i=0; i<size; i++) {
-              dpf[doc][part][i] = docCategories.get(i);
+            dpf[doc][part] = new int[ordinals.length];
+            for (int i = 0; i < ordinals.length; i++) {
+              dpf[doc][part][i] = ordinals.ints[i];
             }
           }
         }
       }
@@ -98,14 +90,11 @@ public class CategoryListData {
     return new RAMCategoryListIterator(partition, docPartitionCategories);
   }
 
-  /**
-   * Internal: category list iterator over uncompressed category info in RAM
-   */
+  /** Internal: category list iterator over uncompressed category info in RAM */
   private static class RAMCategoryListIterator implements CategoryListIterator {
 
     private final int part;
     private final int[][][] dpc;
-    private int currDoc = -1;
-    private int nextCategoryIndex = -1;
 
     RAMCategoryListIterator(int part, int[][][] docPartitionCategories) {
       this.part = part;
@@ -114,25 +103,22 @@ public class CategoryListData {
 
     @Override
     public boolean init() throws IOException {
-      return dpc!=null && dpc.length>part;
+      return dpc != null && dpc.length > part;
     }
 
     @Override
-    public long nextCategory() throws IOException {
-      if (nextCategoryIndex >= dpc[currDoc][part].length) {
-        return 1L+Integer.MAX_VALUE;
+    public void getOrdinals(int docID, IntsRef ints) throws IOException {
+      ints.length = 0;
+      if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) {
+        if (ints.ints.length < dpc[docID][part].length) {
+          ints.grow(dpc[docID][part].length);
+        }
+        ints.length = 0;
+        for (int i = 0; i < dpc[docID][part].length; i++) {
+          ints.ints[ints.length++] = dpc[docID][part][i];
+        }
       }
-      return dpc[currDoc][part][nextCategoryIndex++];
-    }
-
-    @Override
-    public boolean skipTo(int docId) throws IOException {
-      final boolean res = dpc.length>docId && dpc[docId]!=null && dpc[docId][part]!=null;
-      if (res) {
-        currDoc = docId;
-        nextCategoryIndex = 0;
-      }
-      return res;
     }
   }
 
 }
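Aside: the CategoryListData constructor above shows the new consumption pattern — getOrdinals(doc, ordinals) replaces the old skipTo()/nextCategory() loop, and one IntsRef is reused across documents. A self-contained sketch of that pattern (the consume method and the per-ordinal hook are hypothetical):

    import java.io.IOException;
    import org.apache.lucene.facet.search.CategoryListIterator;
    import org.apache.lucene.util.IntsRef;

    public class OrdinalsConsumer {
      // Hypothetical walk over all documents using the new bulk contract.
      static void consume(CategoryListIterator cli, int maxDoc) throws IOException {
        IntsRef ordinals = new IntsRef(32); // reused for every document
        if (!cli.init()) {
          return;
        }
        for (int doc = 0; doc < maxDoc; doc++) {
          cli.getOrdinals(doc, ordinals); // fills ordinals.ints[0..ordinals.length)
          for (int i = 0; i < ordinals.length; i++) {
            int ord = ordinals.ints[i];   // one category ordinal of this doc
            // ... aggregate/count ord here
          }
        }
      }
    }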
@@ -48,8 +48,7 @@ public class CountFacetRequest extends FacetRequest {
 
   @Override
   public Aggregator createAggregator(boolean useComplements,
-      FacetArrays arrays, IndexReader reader,
-      TaxonomyReader taxonomy) {
+      FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) {
     // we rely on that, if needed, result is cleared by arrays!
     int[] a = arrays.getIntArray();
     if (useComplements) {
@@ -5,6 +5,7 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.lucene.facet.search.CategoryListIterator;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -33,16 +34,13 @@ public class MultiCategoryListIterator implements CategoryListIterator {
 
   private final CategoryListIterator[] iterators;
   private final List<CategoryListIterator> validIterators;
-  private final List<CategoryListIterator> perDocValidIterators;
 
   /** Receives the iterators to iterate on */
   public MultiCategoryListIterator(CategoryListIterator... iterators) {
     this.iterators = iterators;
     this.validIterators = new ArrayList<CategoryListIterator>();
-    this.perDocValidIterators = new ArrayList<CategoryListIterator>();
   }
 
-  /** Fails if all given iterators fail to init */
   @Override
   public boolean init() throws IOException {
     for (CategoryListIterator cli : iterators) {
@@ -52,35 +50,17 @@ public class MultiCategoryListIterator implements CategoryListIterator {
     }
     return !validIterators.isEmpty();
   }
 
-  /**
-   * Return a value larger than {@link Integer#MAX_VALUE} only if all
-   * iterators are exhausted
-   */
   @Override
-  public long nextCategory() throws IOException {
-    while (!perDocValidIterators.isEmpty()) {
-      long value = perDocValidIterators.get(0).nextCategory();
-      if (value <= Integer.MAX_VALUE) {
-        return value;
-      }
-      perDocValidIterators.remove(0);
-    }
-    return 0x100000000L;
-  }
-
-  /**
-   * Fails only if skipTo on all the provided iterators returned {@code false}
-   */
-  @Override
-  public boolean skipTo(int docId) throws IOException {
-    perDocValidIterators.clear();
+  public void getOrdinals(int docID, IntsRef ints) throws IOException {
+    IntsRef tmp = new IntsRef(ints.length);
     for (CategoryListIterator cli : validIterators) {
-      if (cli.skipTo(docId)) {
-        perDocValidIterators.add(cli);
+      cli.getOrdinals(docID, tmp);
+      if (ints.ints.length < ints.length + tmp.length) {
+        ints.grow(ints.length + tmp.length);
       }
+      ints.length += tmp.length;
     }
-    return !perDocValidIterators.isEmpty();
   }
 
 }
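Aside: getOrdinals() above grows the destination IntsRef and advances its length for each merged iterator. For reference, the general grow-then-append idiom for IntsRef — including the element copy via System.arraycopy, which is the piece callers typically need alongside the capacity/length bookkeeping shown in the diff — looks like this (illustrative only, not code from this patch):

    import org.apache.lucene.util.IntsRef;

    public class IntsRefAppend {
      // General grow-then-append idiom for accumulating one IntsRef into another.
      static void append(IntsRef dest, IntsRef src) {
        if (dest.ints.length < dest.length + src.length) {
          dest.grow(dest.length + src.length); // ensure capacity
        }
        System.arraycopy(src.ints, src.offset, dest.ints, dest.length, src.length);
        dest.length += src.length;
      }
    }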
@@ -1,229 +0,0 @@
-package org.apache.lucene.util;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is encoded as follows:
- * <ul>
- * <li>If it is less than 127 and non-negative (i.e., if the number uses only 7 bits), it is encoded as
- *  as single byte: 0bbbbbbb.
- * <li>If its highest nonzero bit is greater than bit 6 (0x40), it is represented as a series of
- * bytes, each byte's
- * 7 LSB containing bits from the original value, with the MSB set for all but the last
- * byte. The first encoded byte contains the highest nonzero bits from the
- * original; the second byte contains the next 7 MSB; and so on, with the last byte
- * containing the 7 LSB of the original.
- * </ul>
- * Examples:
- * <ol>
- * <li>n = 117 = 1110101: This has fewer than 8 significant bits, and so is encoded as
- * 01110101 = 0x75.
- * <li>n = 100000 = (binary) 11000011010100000. This has 17 significant bits, and so needs
- * three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, then split it into chunks of 7
- * and add an MSB, 0 for the last byte, 1 for the others: 1|0000110 1|0001101 0|0100000
- * = 0x86 0x8D 0x20.
- * </ol>
- * This encoder/decoder will correctly handle any 32-bit integer, but for negative numbers,
- * and positive numbers with more than 28 significant bits, encoding requires 5 bytes; this
- * is not an efficient encoding scheme for large
- * positive numbers or any negative number.
- * <p>
- * <b>Compatibility:</b><br>
- * This class has been used in products that have shipped to customers, and is needed to
- * decode legacy data. Do not modify this class in ways that will break compatibility.
- *
- * @lucene.experimental
- */
-public class Vint8 {
-
-  /**
-   * Because Java lacks call-by-reference, this class boxes the decoding position, which
-   * is initially set by the caller, and returned after decoding, incremented by the number
-   * of bytes processed.
-   */
-  public static class Position {
-    /**
-     * Creates a position value set to zero.
-     */
-    public Position() {
-      // The initial position is zero by default.
-    }
-    /**
-     * Creates a position set to {@code initialPosition}.
-     * @param initialPosition The starting decoding position in the source buffer.
-     */
-    public Position(int initialPosition) {
-      this.pos = initialPosition;
-    }
-    /**
-     * The value passed by reference.
-     */
-    public int pos;
-  }
-
-  /**
-   * Returns the number of bytes needed to encode {@code number}.
-   * @param number The number whose encoded length is needed.
-   * @return The number of bytes needed to encode {@code number}.
-   */
-  public static int bytesNeeded(int number) {
-    if ((number & ~0x7F) == 0) {
-      return 1;
-    } else if ((number & ~0x3FFF) == 0) {
-      return 2;
-    } else if ((number & ~0x1FFFFF) == 0) {
-      return 3;
-    } else if ((number & ~0xFFFFFFF) == 0) {
-      return 4;
-    } else {
-      return 5;
-    }
-  }
-
-  /**
-   * The maximum number of bytes needed to encode a number using {@code Vint8}.
-   */
-  public static final int MAXIMUM_BYTES_NEEDED = 5;
-
-  /**
-   * Encodes {@code number} to {@code out}.
-   * @param number The value to be written in encoded form, to {@code out}.
-   * @param out The output stream receiving the encoded bytes.
-   * @exception IOException If there is a problem writing to {@code out}.
-   */
-  public static void encode(int number, OutputStream out) throws IOException {
-    if ((number & ~0x7F) == 0) {
-      out.write(number);
-    } else if ((number & ~0x3FFF) == 0) {
-      out.write(0x80 | (number >> 7));
-      out.write(0x7F & number);
-    } else if ((number & ~0x1FFFFF) == 0) {
-      out.write(0x80 | (number >> 14));
-      out.write(0x80 | (number >> 7));
-      out.write(0x7F & number);
-    } else if ((number & ~0xFFFFFFF) == 0) {
-      out.write(0x80 | (number >> 21));
-      out.write(0x80 | (number >> 14));
-      out.write(0x80 | (number >> 7));
-      out.write(0x7F & number);
-    } else {
-      out.write(0x80 | (number >> 28));
-      out.write(0x80 | (number >> 21));
-      out.write(0x80 | (number >> 14));
-      out.write(0x80 | (number >> 7));
-      out.write(0x7F & number);
-    }
-  }
-
-  /**
-   * Encodes {@code number} into {@code dest}, starting at offset {@code start} from
-   * the beginning of the array. This method assumes {@code dest} is large enough to
-   * hold the required number of bytes.
-   * @param number The number to be encoded.
-   * @param dest The destination array.
-   * @param start The starting offset in the array.
-   * @return The number of bytes used in the array.
-   */
-  public static int encode(int number, byte[] dest, int start) {
-    if ((number & ~0x7F) == 0) {
-      dest[start] = (byte) number;
-      return 1;
-    } else if ((number & ~0x3FFF) == 0) {
-      dest[start] = (byte) (0x80 | ((number & 0x3F80) >> 7));
-      dest[start + 1] = (byte) (number & 0x7F);
-      return 2;
-    } else if ((number & ~0x1FFFFF) == 0) {
-      dest[start] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
-      dest[start + 1] = (byte) (0x80 | ((number & 0x3F80) >> 7));
-      dest[start + 2] = (byte) (number & 0x7F);
-      return 3;
-    } else if ((number & ~0xFFFFFFF) == 0) {
-      dest[start] = (byte) (0x80 | ((number & 0xFE00000) >> 21));
-      dest[start + 1] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
-      dest[start + 2] = (byte) (0x80 | ((number & 0x3F80) >> 7));
-      dest[start + 3] = (byte) (number & 0x7F);
-      return 4;
-    } else {
-      dest[start] = (byte) (0x80 | ((number & 0xF0000000) >> 28));
-      dest[start + 1] = (byte) (0x80 | ((number & 0xFE00000) >> 21));
-      dest[start + 2] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
-      dest[start + 3] = (byte) (0x80 | ((number & 0x3F80) >> 7));
-      dest[start + 4] = (byte) (number & 0x7F);
-      return 5;
-    }
-  }
-
-  /**
-   * Decodes a 32-bit integer from {@code bytes}, beginning at offset {@code pos.pos}.
-   * The decoded value is returned, and {@code pos.pos} is incremented by the number of
-   * bytes processed.
-   * @param bytes The byte array containing an encoded value.
-   * @param pos On entry, the starting position in the array; on return, one greater
-   * than the position of the last byte decoded in the call.
-   * @return The decoded value.
-   */
-  public static int decode(byte[] bytes, Position pos) {
-    int value = 0;
-    while (true) {
-      byte first = bytes[pos.pos];
-      ++pos.pos;
-      value |= first & 0x7F;
-      if ((first & 0x80) == 0) {
-        return value;
-      }
-      value <<= 7;
-    }
-  }
-
-  /**
-   * Decodes a 32-bit integer from bytes read from {@code in}. Bytes are read,
-   * one at a time, from {@code in}, and it is assumed they represent a 32-bit
-   * integer encoded using this class's encoding scheme. The decoded value is
-   * returned.
-   * @param in The input stream containing the encoded bytes.
-   * @return The decoded value.
-   * @exception EOFException If the stream ends before a value has been decoded.
-   */
-  public static int decode(InputStream in) throws IOException {
-    int value = 0;
-    while (true) {
-      int first = in.read();
-      if (first < 0) {
-        throw new EOFException();
-      }
-      value |= first & 0x7F;
-      if ((first & 0x80) == 0) {
-        return value;
-      }
-      value <<= 7;
-    }
-  }
-
-  /**
-   * The default ctor is made private because all methods of this class are static.
-   */
-  private Vint8() {
-    // Just making it impossible to instantiate.
-  }
-
-}
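Aside: the deleted Vint8 class documents its scheme with two worked examples (117 encodes to 0x75; 100000 encodes to 0x86 0x8D 0x20). A minimal, self-contained sketch reproducing just those examples, assuming values below 2^21 — an illustration of the scheme, not code from this patch:

    import java.io.ByteArrayOutputStream;

    public class Vint8Example {
      // Truncated re-implementation of the 7-bits-per-byte scheme above.
      static void encode(int number, ByteArrayOutputStream out) {
        if ((number & ~0x7F) == 0) {
          out.write(number);                // single byte, MSB clear
        } else if ((number & ~0x3FFF) == 0) {
          out.write(0x80 | (number >> 7));  // high 7 bits, MSB set
          out.write(0x7F & number);         // low 7 bits, MSB clear
        } else if ((number & ~0x1FFFFF) == 0) {
          out.write(0x80 | (number >> 14));
          out.write(0x80 | (number >> 7));
          out.write(0x7F & number);
        }
        // wider values continue the same pattern, up to 5 bytes
      }

      public static void main(String[] args) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        encode(117, out);    // -> 0x75
        encode(100000, out); // -> 0x86 0x8D 0x20
        for (byte b : out.toByteArray()) {
          System.out.printf("0x%02X ", b & 0xFF); // mask to avoid sign extension
        }
        // prints: 0x75 0x86 0x8D 0x20
      }
    }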
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -27,38 +27,31 @@ import java.io.OutputStream;
  * read more on the two implementations {@link FourFlagsIntEncoder} and
  * {@link EightFlagsIntEncoder}.
  * <p>
- * Extensions of this class need to implement {@link #encode(int)} in order to
- * build the proper indicator (flags). When enough values were accumulated
- * (typically the batch size), extensions can call {@link #encodeChunk()} to
- * flush the indicator and the rest of the values.
+ * Extensions of this class need to implement {@link #encode(IntsRef, BytesRef)}
+ * in order to build the proper indicator (flags). When enough values were
+ * accumulated (typically the batch size), extensions can call
+ * {@link #encodeChunk(BytesRef)} to flush the indicator and the rest of the
+ * values.
  * <p>
  * <b>NOTE:</b> flags encoders do not accept values ≤ 0 (zero) in their
- * {@link #encode(int)}. For performance reasons they do not check that
- * condition, however if such value is passed the result stream may be corrupt
- * or an exception will be thrown. Also, these encoders perform the best when
- * there are many consecutive small values (depends on the encoder
+ * {@link #encode(IntsRef, BytesRef)}. For performance reasons they do not check
+ * that condition, however if such value is passed the result stream may be
+ * corrupt or an exception will be thrown. Also, these encoders perform the best
+ * when there are many consecutive small values (depends on the encoder
  * implementation). If that is not the case, the encoder will occupy 1 more byte
  * for every <i>batch</i> number of integers, over whatever
  * {@link VInt8IntEncoder} would have occupied. Therefore make sure to check
  * whether your data fits into the conditions of the specific encoder.
  * <p>
  * For the reasons mentioned above, these encoders are usually chained with
- * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder} in the following
- * manner: <code><pre class="prettyprint">
- * IntEncoder fourFlags =
- * new SortingEncoderFilter(new UniqueValuesIntEncoder(new DGapIntEncoder(new FlagsIntEncoderImpl())));
- * </pre></code>
+ * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder}.
  *
  * @lucene.experimental
  */
 public abstract class ChunksIntEncoder extends IntEncoder {
 
   /** Holds the values which must be encoded, outside the indicator. */
-  protected final int[] encodeQueue;
-  protected int encodeQueueSize = 0;
+  protected final IntsRef encodeQueue;
 
-  /** Encoder used to encode values outside the indicator. */
-  protected final IntEncoder encoder = new VInt8IntEncoder();
-
   /** Represents bits flag byte. */
   protected int indicator = 0;
 
@@ -67,39 +60,33 @@ public abstract class ChunksIntEncoder extends IntEncoder {
   protected byte ordinal = 0;
 
   protected ChunksIntEncoder(int chunkSize) {
-    encodeQueue = new int[chunkSize];
+    encodeQueue = new IntsRef(chunkSize);
   }
 
   /**
    * Encodes the values of the current chunk. First it writes the indicator, and
    * then it encodes the values outside the indicator.
    */
-  protected void encodeChunk() throws IOException {
-    out.write(indicator);
-    for (int i = 0; i < encodeQueueSize; ++i) {
-      encoder.encode(encodeQueue[i]);
+  protected void encodeChunk(BytesRef buf) {
+    // ensure there's enough room in the buffer
+    int maxBytesRequired = buf.length + 1 + encodeQueue.length * 4; /* indicator + at most 4 bytes per positive VInt */
+    if (buf.bytes.length < maxBytesRequired) {
+      buf.grow(maxBytesRequired);
     }
-    encodeQueueSize = 0;
-    ordinal = 0;
-    indicator = 0;
+
+    buf.bytes[buf.length++] = ((byte) indicator);
+    for (int i = 0; i < encodeQueue.length; i++) {
+      VInt8.encode(encodeQueue.ints[i], buf);
+    }
+
+    reset();
   }
 
   @Override
-  public void close() throws IOException {
-    if (ordinal != 0) {
-      encodeChunk();
-    }
-    encoder.close();
-    super.close();
-  }
-
-  @Override
-  public void reInit(OutputStream out) {
-    encoder.reInit(out);
-    super.reInit(out);
+  protected void reset() {
     ordinal = 0;
     indicator = 0;
-    encodeQueueSize = 0;
+    encodeQueue.length = 0;
   }
 
 }
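Aside: encodeChunk() above sizes the buffer for the worst case before writing. The arithmetic in miniature — a helper mirroring the diff's maxBytesRequired computation; it assumes queued values stay below 2^28 so each VInt8 needs at most 4 bytes:

    public class ChunkSizing {
      // One indicator byte plus at most 4 VInt8 bytes per queued value.
      static int maxBytesRequired(int bytesInBuffer, int queuedValues) {
        return bytesInBuffer + 1 + queuedValues * 4;
      }

      public static void main(String[] args) {
        // A full EightFlags chunk can add at most 1 + 8 * 4 = 33 bytes
        // when every value escapes the indicator.
        System.out.println(maxBytesRequired(0, 8)); // 33
      }
    }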
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,10 +21,8 @@ import java.io.InputStream;
  */
 
 /**
- * An {@link IntDecoder} which wraps another {@link IntDecoder} and reverts the
- * d-gap that was encoded by {@link DGapIntEncoder}. The wrapped decoder
- * performs the actual decoding, while this class simply adds the decoded value
- * to the previous value.
+ * An {@link IntDecoder} which wraps another decoder and reverts the d-gap that
+ * was encoded by {@link DGapIntEncoder}.
  *
  * @lucene.experimental
  */
@@ -32,26 +30,23 @@ public class DGapIntDecoder extends IntDecoder {
 
   private final IntDecoder decoder;
 
-  private int prev = 0;
-
   public DGapIntDecoder(IntDecoder decoder) {
     this.decoder = decoder;
   }
 
   @Override
-  public long decode() throws IOException {
-    long decode = decoder.decode();
-    if (decode == EOS) {
-      return EOS;
-    }
-
-    return prev += decode;
+  protected void reset() {
+    decoder.reset();
   }
 
   @Override
-  public void reInit(InputStream in) {
-    decoder.reInit(in);
-    prev = 0;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    decoder.doDecode(buf, values, upto);
+    int prev = 0;
+    for (int i = 0; i < values.length; i++) {
+      values.ints[i] += prev;
+      prev = values.ints[i];
+    }
   }
 
   @Override
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -27,7 +27,7 @@ import java.io.OutputStream;
  * space) if the values are 'close' to each other.
  * <p>
  * <b>NOTE:</b> this encoder assumes the values are given to
- * {@link #encode(int)} in an ascending sorted manner, which ensures only
+ * {@link #encode(IntsRef, BytesRef)} in an ascending sorted manner, which ensures only
  * positive values are encoded and thus yields better performance. If you are
  * not sure whether the values are sorted or not, it is possible to chain this
  * encoder with {@link SortingIntEncoder} to ensure the values will be
@@ -37,17 +37,20 @@ import java.io.OutputStream;
  */
 public class DGapIntEncoder extends IntEncoderFilter {
 
-  private int prev = 0;
-
   /** Initializes with the given encoder. */
   public DGapIntEncoder(IntEncoder encoder) {
     super(encoder);
   }
 
   @Override
-  public void encode(int value) throws IOException {
-    encoder.encode(value - prev);
-    prev = value;
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    int prev = 0;
+    for (int i = values.offset; i < upto; i++) {
+      int tmp = values.ints[i];
+      values.ints[i] -= prev;
+      prev = tmp;
+    }
+    encoder.doEncode(values, buf, upto);
   }
 
   @Override
@@ -55,12 +58,6 @@ public class DGapIntEncoder extends IntEncoderFilter {
     return new DGapIntDecoder(encoder.createMatchingDecoder());
   }
 
-  @Override
-  public void reInit(OutputStream out) {
-    super.reInit(out);
-    prev = 0;
-  }
-
   @Override
   public String toString() {
     return "DGap (" + encoder.toString() + ")";
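Aside: doEncode() above rewrites the sorted input into deltas before delegating to the wrapped encoder, so the downstream VInt stage sees small integers. The transform in isolation (illustrative, plain arrays):

    public class DGapExample {
      // In-place d-gap transform mirroring doEncode above:
      // [3, 7, 10, 50] becomes [3, 4, 3, 40]; decoding is a running sum.
      static void dgapInPlace(int[] values) {
        int prev = 0;
        for (int i = 0; i < values.length; i++) {
          int tmp = values[i];
          values[i] -= prev; // store the gap from the previous value
          prev = tmp;
        }
      }

      public static void main(String[] args) {
        int[] v = { 3, 7, 10, 50 };
        dgapInPlace(v);
        System.out.println(java.util.Arrays.toString(v)); // [3, 4, 3, 40]
      }
    }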
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,20 +21,17 @@ import java.io.InputStream;
  */
 
 /**
- * Decodes data which was encoded by {@link EightFlagsIntEncoder}. Scans
- * the <code>indicator</code>, one flag (1-bits) at a time, and decodes extra
- * data using {@link VInt8IntDecoder}.
+ * Decodes values encoded with {@link EightFlagsIntEncoder}.
  *
- * @see EightFlagsIntEncoder
  * @lucene.experimental
  */
 public class EightFlagsIntDecoder extends IntDecoder {
 
-  /**
+  /*
   * Holds all combinations of <i>indicator</i> for fast decoding (saves time
   * on real-time bit manipulation)
   */
-  private static final byte[][] decodeTable = new byte[256][8];
+  private static final byte[][] DECODE_TABLE = new byte[256][8];
 
   /** Generating all combinations of <i>indicator</i> into separate flags. */
   static {
@@ -42,45 +39,36 @@ public class EightFlagsIntDecoder extends IntDecoder {
       --i;
       for (int j = 8; j != 0;) {
         --j;
-        decodeTable[i][j] = (byte) ((i >>> j) & 0x1);
+        DECODE_TABLE[i][j] = (byte) ((i >>> j) & 0x1);
       }
     }
   }
 
-  private final IntDecoder decoder = new VInt8IntDecoder();
-
-  /** The indicator for decoding a chunk of 8 integers. */
-  private int indicator;
-
-  /** Used as an ordinal of 0 - 7, as the decoder decodes chunks of 8 integers. */
-  private int ordinal = 0;
-
   @Override
-  public long decode() throws IOException {
-    // If we've decoded 8 integers, read the next indicator.
-    if ((ordinal & 0x7) == 0) {
-      indicator = in.read();
-      if (indicator < 0) {
-        return EOS;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    while (buf.offset < upto) {
+      // read indicator
+      int indicator = buf.bytes[buf.offset++] & 0xFF;
+      int ordinal = 0;
+
+      int capacityNeeded = values.length + 8;
+      if (values.ints.length < capacityNeeded) {
+        values.grow(capacityNeeded);
       }
-      ordinal = 0;
-    }
 
-    if (decodeTable[indicator][ordinal++] == 0) {
-      // decode the value from the stream.
-      long decode = decoder.decode();
-      return decode == EOS ? EOS : decode + 2;
+      // process indicator, until we read 8 values, or end-of-buffer
+      while (ordinal != 8) {
+        if (DECODE_TABLE[indicator][ordinal++] == 0) {
+          if (buf.offset == upto) { // end of buffer
+            return;
+          }
+          // decode the value from the stream.
+          values.ints[values.length++] = VInt8.decode(buf) + 2;
+        } else {
+          values.ints[values.length++] = 1;
+        }
+      }
     }
-
-    return 1;
-  }
-
-  @Override
-  public void reInit(InputStream in) {
-    super.reInit(in);
-    decoder.reInit(in);
-    ordinal = 0;
-    indicator = 0;
   }
 
   @Override
@@ -1,6 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,14 +21,15 @@ import java.io.IOException;
  */
 
 /**
- * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group starts with a single
- * byte (called indicator) which represents 8 - 1 bit flags, where the value:
+ * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group
+ * starts with a single byte (called indicator) which represents 8 - 1 bit
+ * flags, where the value:
  * <ul>
  * <li>1 means the encoded value is '1'
  * <li>0 means the value is encoded using {@link VInt8IntEncoder}, and the
  * encoded bytes follow the indicator.<br>
- * Since value 0 is illegal, and 1 is encoded in the indicator, the actual
- * value that is encoded is <code>value-2</code>, which saves some more bits.
+ * Since value 0 is illegal, and 1 is encoded in the indicator, the actual value
+ * that is encoded is <code>value-2</code>, which saves some more bits.
  * </ul>
  * Encoding example:
  * <ul>
@@ -46,28 +48,36 @@ import java.io.IOException;
  */
 public class EightFlagsIntEncoder extends ChunksIntEncoder {
 
-  /**
+  /*
   * Holds all combinations of <i>indicator</i> flags for fast encoding (saves
   * time on bit manipulation at encode time)
   */
-  private static byte[] encodeTable = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 };
+  private static final byte[] ENCODE_TABLE = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 };
 
   public EightFlagsIntEncoder() {
     super(8);
   }
 
   @Override
-  public void encode(int data) throws IOException {
-    if (data == 1) {
-      indicator |= encodeTable[ordinal];
-    } else {
-      encodeQueue[encodeQueueSize++] = data - 2;
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    for (int i = values.offset; i < upto; i++) {
+      int value = values.ints[i];
+      if (value == 1) {
+        indicator |= ENCODE_TABLE[ordinal];
+      } else {
+        encodeQueue.ints[encodeQueue.length++] = value - 2;
+      }
+      ++ordinal;
+
+      // encode the chunk and the indicator
+      if (ordinal == 8) {
+        encodeChunk(buf);
+      }
     }
-    ++ordinal;
 
-    // If 8 values were encoded thus far, 'flush' them including the indicator.
-    if ((ordinal & 0x7) == 0) {
-      encodeChunk();
+    // encode remaining values
+    if (ordinal != 0) {
+      encodeChunk(buf);
     }
   }
 
@@ -78,7 +88,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder {
 
   @Override
   public String toString() {
-    return "EightFlags (" + encoder.toString() + ")";
+    return "EightFlags (VInt)";
   }
 
 }
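Aside: a worked example of the EightFlags indicator byte built by doEncode() above (standalone illustration; ENCODE_TABLE[ordinal] is equivalent to 1 << ordinal):

    public class EightFlagsExample {
      public static void main(String[] args) {
        // Encoding the chunk [1, 1, 5, 1, 1, 1, 1, 1]: each value equal to 1
        // sets its flag bit; the 5 leaves bit 2 clear and is appended after
        // the indicator as VInt8(5 - 2) = 0x03.
        int indicator = 0;
        int[] chunk = { 1, 1, 5, 1, 1, 1, 1, 1 };
        for (int ordinal = 0; ordinal < 8; ordinal++) {
          if (chunk[ordinal] == 1) {
            indicator |= 1 << ordinal; // same effect as ENCODE_TABLE[ordinal]
          }
        }
        System.out.printf("indicator = 0x%02X%n", indicator); // 0xFB (1111_1011)
      }
    }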
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,11 +21,8 @@ import java.io.InputStream;
  */
 
 /**
- * Decodes data which was encoded by {@link FourFlagsIntEncoder}. Scans
- * the <code>indicator</code>, one flag (1-bits) at a time, and decodes extra
- * data using {@link VInt8IntDecoder}.
+ * Decodes values encoded with {@link FourFlagsIntEncoder}.
  *
- * @see FourFlagsIntEncoder
 * @lucene.experimental
 */
 public class FourFlagsIntDecoder extends IntDecoder {
@@ -34,7 +31,7 @@ public class FourFlagsIntDecoder extends IntDecoder {
   * Holds all combinations of <i>indicator</i> for fast decoding (saves time
   * on real-time bit manipulation)
   */
-  private final static byte[][] decodeTable = new byte[256][4];
+  private final static byte[][] DECODE_TABLE = new byte[256][4];
 
   /** Generating all combinations of <i>indicator</i> into separate flags. */
   static {
@@ -42,46 +39,36 @@ public class FourFlagsIntDecoder extends IntDecoder {
       --i;
       for (int j = 4; j != 0;) {
         --j;
-        decodeTable[i][j] = (byte) ((i >>> (j << 1)) & 0x3);
+        DECODE_TABLE[i][j] = (byte) ((i >>> (j << 1)) & 0x3);
       }
     }
   }
 
-  private final IntDecoder decoder = new VInt8IntDecoder();
-
-  /** The indicator for decoding a chunk of 4 integers. */
-  private int indicator;
-
-  /** Used as an ordinal of 0 - 3, as the decoder decodes chunks of 4 integers. */
-  private int ordinal = 0;
-
   @Override
-  public long decode() throws IOException {
-    // If we've decoded 8 integers, read the next indicator.
-    if ((ordinal & 0x3) == 0) {
-      indicator = in.read();
-      if (indicator < 0) {
-        return EOS;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    while (buf.offset < upto) {
+      // read indicator
+      int indicator = buf.bytes[buf.offset++] & 0xFF;
+      int ordinal = 0;
+
+      int capacityNeeded = values.length + 4;
+      if (values.ints.length < capacityNeeded) {
+        values.grow(capacityNeeded);
       }
-      ordinal = 0;
-    }
 
-    byte decodeVal = decodeTable[indicator][ordinal++];
-    if (decodeVal == 0) {
-      // decode the value from the stream.
-      long decode = decoder.decode();
-      return decode == EOS ? EOS : decode + 4;
+      while (ordinal != 4) {
+        byte decodeVal = DECODE_TABLE[indicator][ordinal++];
+        if (decodeVal == 0) {
+          if (buf.offset == upto) { // end of buffer
+            return;
+          }
+          // decode the value from the stream.
+          values.ints[values.length++] = VInt8.decode(buf) + 4;
+        } else {
+          values.ints[values.length++] = decodeVal;
+        }
+      }
     }
-
-    return decodeVal;
-  }
-
-  @Override
-  public void reInit(InputStream in) {
-    super.reInit(in);
-    decoder.reInit(in);
-    ordinal = 0;
-    indicator = 0;
   }
 
   @Override
@@ -1,6 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,11 +49,11 @@ import java.io.IOException;
  */
 public class FourFlagsIntEncoder extends ChunksIntEncoder {
 
-  /**
+  /*
   * Holds all combinations of <i>indicator</i> flags for fast encoding (saves
   * time on bit manipulation @ encode time)
   */
-  private static byte[][] encodeTable = new byte[][] {
+  private static final byte[][] ENCODE_TABLE = new byte[][] {
     new byte[] { 0x00, 0x00, 0x00, 0x00 },
     new byte[] { 0x01, 0x04, 0x10, 0x40 },
     new byte[] { 0x02, 0x08, 0x20, (byte) 0x80 },
@@ -63,26 +64,26 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
     super(4);
   }
 
-  /**
-   * Small values (<=3) are stored in the <code>indicator</code> while larger
-   * values are saved for later encoding in the {@link #encodeQueue}. Since
-   * Vint8 will only encode values larger or equal to 4, the values saves for
-   * encoded are transformed to (value - 4).<br>
-   * When a chunk is ready (got 4 values), the {@link #encodeChunk()}
-   * takes control.
-   */
   @Override
-  public void encode(int data) throws IOException {
-    if (data <= 3) {
-      indicator |= encodeTable[data][ordinal];
-    } else {
-      encodeQueue[encodeQueueSize++] = data - 4;
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    for (int i = values.offset; i < upto; i++) {
+      int value = values.ints[i];
+      if (value <= 3) {
+        indicator |= ENCODE_TABLE[value][ordinal];
+      } else {
+        encodeQueue.ints[encodeQueue.length++] = value - 4;
+      }
+      ++ordinal;
+
+      // encode the chunk and the indicator
+      if (ordinal == 4) {
+        encodeChunk(buf);
+      }
     }
-    ++ordinal;
 
-    // If 4 values were encoded thus far, 'flush' them including the indicator.
-    if ((ordinal & 0x3) == 0) {
-      encodeChunk();
+    // encode remaining values
+    if (ordinal != 0) {
+      encodeChunk(buf);
    }
   }
 
@@ -93,7 +94,7 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
 
   @Override
   public String toString() {
-    return "FourFlags (" + encoder.toString() + ")";
+    return "FourFlags (VInt)";
   }
 
 }
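Aside: the FourFlags indicator packs two bits per ordinal. A worked example (standalone illustration; shifting the value by 2 * ordinal matches the ENCODE_TABLE rows shown above for values 1 and 2, and is assumed to extend to value 3):

    public class FourFlagsExample {
      public static void main(String[] args) {
        // Encoding the chunk [2, 1, 7, 3]: values 1..3 are stored directly in
        // the indicator; 7 escapes (bits 00) and is appended after the
        // indicator as VInt8(7 - 4) = 0x03.
        int indicator = 0;
        int[] chunk = { 2, 1, 7, 3 };
        for (int ordinal = 0; ordinal < 4; ordinal++) {
          if (chunk[ordinal] <= 3) {
            indicator |= chunk[ordinal] << (ordinal * 2); // mirrors ENCODE_TABLE
          }
        }
        System.out.printf("indicator = 0x%02X%n", indicator); // 0xC6 (11_00_01_10)
      }
    }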
@@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,33 +21,50 @@ import java.io.InputStream;
  */
 
 /**
- * Decodes integers from a set {@link InputStream}. For re-usability, the
- * decoder's input stream can be set by ({@link #reInit(InputStream)}).
- * By design, Decoders are NOT thread-safe.
+ * Decodes integers from a set {@link BytesRef}.
 *
 * @lucene.experimental
 */
 public abstract class IntDecoder {
 
-  /** A special long value which is used to indicate end-of-stream has reached. */
-  public static final long EOS = 0x100000000L;
-
-  /** Input stream from which the encoded bytes are read */
-  protected InputStream in;
-
-  /** Sets the input stream from which the encoded data is read. */
-  public void reInit(InputStream in) {
-    this.in = in;
+  /**
+   * Performs the actual decoding. Values should be read from
+   * {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and
+   * length are set to 0 and the encoder is expected to update
+   * {@link IntsRef#length}, but not {@link IntsRef#offset}.
+   *
+   * <p>
+   * <b>NOTE:</b> it is ok to use the buffer's offset as the current position in
+   * the buffer (and modify it), it will be reset by
+   * {@link #decode(BytesRef, IntsRef)}.
+   */
+  protected abstract void doDecode(BytesRef buf, IntsRef values, int upto);
+
+  /**
+   * Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders
+   * can reset their state.
+   */
+  protected void reset() {
+    // do nothing by default
   }
 
   /**
-   * Decodes data received from the input stream, and returns one decoded
-   * integer. If end of stream is reached, {@link #EOS} is returned.
-   *
-   * @return one decoded integer as long or {@link #EOS} if end-of-stream
-   *         reached.
-   * @throws IOException if an I/O error occurs
+   * Decodes the values from the buffer into the given {@link IntsRef}. Note
+   * that {@code values.offset} and {@code values.length} are set to 0.
   */
-  public abstract long decode() throws IOException;
+  public final void decode(BytesRef buf, IntsRef values) {
+    values.offset = values.length = 0; // must do that because we cannot grow() them otherwise
+
+    // some decoders may use the buffer's offset as a position index, so save
+    // current offset.
+    int bufOffset = buf.offset;
+
+    reset();
+    doDecode(buf, values, buf.offset + buf.length);
+    assert values.offset == 0 : "offset should not have been modified by the decoder.";
+
+    // fix offset
+    buf.offset = bufOffset;
+  }
 
 }
@ -1,8 +1,7 @@
|
||||||
package org.apache.lucene.util.encoding;
|
package org.apache.lucene.util.encoding;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import java.io.IOException;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import java.io.OutputStream;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -22,94 +21,47 @@ import java.io.OutputStream;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encodes integers to a set {@link OutputStream}. Extending classes need to
|
* Encodes integers to a set {@link BytesRef}. For convenience, each encoder
|
||||||
* override {@link #encode(int)} to encode the value using their encoding
|
* implements {@link #createMatchingDecoder()} for easy access to the matching
|
||||||
* algorithm. The default implementation of {@link #close()} closes the set
|
* decoder.
|
||||||
* {@link OutputStream}.
|
|
||||||
* <p>
|
|
||||||
* The default {@link #IntEncoder() constructor} is provided for convenience
|
|
||||||
* only. One must call {@link #reInit(OutputStream)} before calling
|
|
||||||
* {@link #encode(int)} or {@link #close()}.
|
|
||||||
* <p>
|
|
||||||
* For convenience, each encoder implements {@link #createMatchingDecoder()} for
|
|
||||||
- * easy access to the matching decoder.
- * <p>
- * <b>NOTE:</b> some implementations may buffer the encoded values in memory
- * (such as {@link IntEncoderFilter} implementations) and encoding will happen
- * only upon calling {@link #close()}. Therefore it is important to always call
- * {@link #close()} on the encoder at hand.
- * <p>
- * <b>NOTE:</b> encoders are usually not thread safe, unless specifically
- * documented otherwise by an implementation.
  *
  * @lucene.experimental
  */
-public abstract class IntEncoder implements Closeable {
+public abstract class IntEncoder {

-  protected OutputStream out = null;
+  public IntEncoder() {}

   /**
-   * Default constructor, provided here for robustness: if in the future a
-   * constructor with parameters will be added, this might break custom
-   * implementations of this class which call this implicit constructor. So we
-   * make it explicit to avoid any such issue in the future.
+   * Performs the actual encoding. Values should be read from
+   * {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that
+   * {@code buf's} offset and length are set to 0 and the encoder is expected to
+   * update {@link BytesRef#length}, but not {@link BytesRef#offset}.
    */
-  public IntEncoder() {
+  protected abstract void doEncode(IntsRef values, BytesRef buf, int upto);
+
+  /**
+   * Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders
+   * can reset their state.
+   */
+  protected void reset() {
+    // do nothing by default
   }

   /**
-   * Instructs the encoder to finish the encoding process. This method closes
-   * the output stream which was specified by {@link #reInit(OutputStream)
-   * reInit}. An implementation may do here additional cleanup required to
-   * complete the encoding, such as flushing internal buffers, etc.<br>
-   * Once this method was called, no further calls to {@link #encode(int)
-   * encode} should be made before first calling {@link #reInit(OutputStream)
-   * reInit}.
-   * <p>
-   * <b>NOTE:</b> overriding classes should make sure they either call
-   * <code>super.close()</code> or close the output stream themselves.
+   * Encodes the values to the given buffer. Note that the buffer's offset and
+   * length are set to 0.
    */
-  @Override
-  public void close() throws IOException {
-    if (out != null) {
-      out.close();
-    }
+  public final void encode(IntsRef values, BytesRef buf) {
+    buf.offset = buf.length = 0;
+    reset();
+    doEncode(values, buf, values.offset + values.length);
+    assert buf.offset == 0 : "offset should not have been modified by the encoder.";
   }

   /**
-   * Encodes an integer to the output stream given in
-   * {@link #reInit(OutputStream) reInit}
-   */
-  public abstract void encode(int value) throws IOException;
-
-  /**
-   * Returns an {@link IntDecoder} which matches this encoder. Every encoder
-   * must return an {@link IntDecoder} and <code>null</code> is not a valid
-   * value. If an encoder is just a filter, it should at least return its
-   * wrapped encoder's matching decoder.
-   * <p>
-   * <b>NOTE:</b> this method should create a new instance of the matching
-   * decoder and leave the instance sharing to the caller. Returning the same
-   * instance over and over is risky because encoders and decoders are not
-   * thread safe.
+   * Returns an {@link IntDecoder} which can decode the values that were encoded
+   * with this encoder.
    */
   public abstract IntDecoder createMatchingDecoder();

-  /**
-   * Reinitializes the encoder with the give {@link OutputStream}. For
-   * re-usability it can be changed without the need to reconstruct a new
-   * object.
-   * <p>
-   * <b>NOTE:</b> after calling {@link #close()}, one <u><i>must</i></u> call
-   * this method even if the output stream itself hasn't changed. An example
-   * case is that the output stream wraps a byte[], and the output stream itself
-   * is reset, but its instance hasn't changed. Some implementations of
-   * {@link IntEncoder} may write some metadata about themselves to the output
-   * stream, and therefore it is imperative that one calls this method before
-   * encoding any data.
-   */
-  public void reInit(OutputStream out) {
-    this.out = out;
-  }
-
 }
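The new IntEncoder above leaves subclasses only two required hooks, doEncode(IntsRef, BytesRef, int) and createMatchingDecoder(), plus the optional reset(). A minimal caller-side sketch of the new contract (editor's illustration, not part of the commit; the class name BulkEncodeExample is hypothetical, the encoder classes are the ones in this patch):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.VInt8IntEncoder;

public class BulkEncodeExample {
  public static void main(String[] args) {
    // values to encode; IntsRef carries the array plus offset/length
    IntsRef values = new IntsRef(new int[] { 5, 8, 13 }, 0, 3);
    BytesRef buf = new BytesRef();
    IntEncoder encoder = new VInt8IntEncoder();
    // encode() is final: it zeroes buf.offset/length, calls reset(), then doEncode()
    encoder.encode(values, buf);
    System.out.println("encoded " + buf.length + " bytes");
  }
}
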
@ -1,7 +1,5 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.OutputStream;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,17 +19,7 @@ import java.io.OutputStream;
  */

 /**
- * An abstract implementation of {@link IntEncoder} which is served as a filter
- * on the values to encode. An encoder filter wraps another {@link IntEncoder}
- * which does the actual encoding. This allows for chaining filters and
- * encoders, such as: <code><pre class="prettyprint">
- * new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEnoder()));
- * {@link UniqueValuesIntEncoder} followed by {@link DGapIntEncoder}
- * </pre></code>
- * <p>
- * The default implementation implements {@link #close()} by closing the wrapped
- * encoder and {@link #reInit(OutputStream)} by re-initializing the wrapped
- * encoder.
+ * An abstract implementation of {@link IntEncoder} which wraps another encoder.
  *
  * @lucene.experimental
  */
@ -44,15 +32,8 @@ public abstract class IntEncoderFilter extends IntEncoder {
   }

   @Override
-  public void close() throws IOException {
-    // There is no need to call super.close(), since we don't pass the output
-    // stream to super.
-    encoder.close();
-  }
-
-  @Override
-  public void reInit(OutputStream out) {
-    encoder.reInit(out);
+  public void reset() {
+    encoder.reset();
   }

 }
@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,54 +21,65 @@ import java.io.InputStream;
  */

 /**
- * Decodes data which was encoded by {@link NOnesIntEncoder}. Uses a
- * {@link FourFlagsIntDecoder} to perform the actual encoding and translates the
- * values back as described in {@link NOnesIntEncoder}.
+ * Decodes values encoded encoded with {@link NOnesIntEncoder}.
  *
- * @see NOnesIntEncoder
  * @lucene.experimental
  */
 public class NOnesIntDecoder extends FourFlagsIntDecoder {

-  /** Number of consecutive '1's to generate upon decoding a '2'. */
-  private int n;
-
-  private int onesCounter;
+  // Number of consecutive '1's to generate upon decoding a '2'
+  private final int n;
+
+  private final IntsRef internalBuffer;

   /**
    * Constructs a decoder with a given N (Number of consecutive '1's which are
    * translated into a single target value '2'.
    */
   public NOnesIntDecoder(int n) {
     this.n = n;
+    // initial size (room for 100 integers)
+    internalBuffer = new IntsRef(100);
   }

   @Override
-  public long decode() throws IOException {
-    // If we read '2', we should return n '1's.
-    if (onesCounter > 0) {
-      --onesCounter;
-      return 1;
-    }
-
-    long decode = super.decode();
-    if (decode == 1) {
-      return 1;
-    }
-    if (decode == 2) {
-      onesCounter = n - 1;
-      return 1;
-    }
-    if (decode == 3) {
-      return 2;
-    }
-    return decode == EOS ? EOS : decode - 1;
+  protected void reset() {
+    internalBuffer.length = 0;
+    super.reset();
   }

   @Override
-  public void reInit(InputStream in) {
-    super.reInit(in);
-    onesCounter = 0;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    super.doDecode(buf, internalBuffer, upto);
+    if (values.ints.length < internalBuffer.length) {
+      // need space for internalBuffer.length to internalBuffer.length*N,
+      // grow mildly at first
+      values.grow(internalBuffer.length * n/2);
+    }
+
+    for (int i = 0; i < internalBuffer.length; i++) {
+      int decode = internalBuffer.ints[i];
+      if (decode == 1) {
+        if (values.length == values.ints.length) {
+          values.grow(values.length + 10); // grow by few items, however not too many
+        }
+        // 1 is 1
+        values.ints[values.length++] = 1;
+      } else if (decode == 2) {
+        if (values.length + n >= values.ints.length) {
+          values.grow(values.length + n); // grow by few items, however not too many
+        }
+        // '2' means N 1's
+        for (int j = 0; j < n; j++) {
+          values.ints[values.length++] = 1;
+        }
+      } else {
+        if (values.length == values.ints.length) {
+          values.grow(values.length + 10); // grow by few items, however not too many
+        }
+        // any other value is val-1
+        values.ints[values.length++] = decode - 1;
+      }
+    }
   }

   @Override
@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -50,11 +50,10 @@ import java.io.OutputStream;
  */
 public class NOnesIntEncoder extends FourFlagsIntEncoder {

+  private final IntsRef internalBuffer;
+
   /** Number of consecutive '1's to be translated into single target value '2'. */
-  private int n;
-
-  /** Counts the number of consecutive ones seen. */
-  private int onesCounter = 0;
+  private final int n;

   /**
    * Constructs an encoder with a given value of N (N: Number of consecutive
@ -62,38 +61,48 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
    */
   public NOnesIntEncoder(int n) {
     this.n = n;
+    internalBuffer = new IntsRef(n);
   }

   @Override
-  public void close() throws IOException {
-    // We might have ones in our buffer, encode them as neccesary.
-    while (onesCounter-- > 0) {
-      super.encode(1);
-    }
-    super.close();
-  }
-
-  @Override
-  public void encode(int value) throws IOException {
-    if (value == 1) {
-      // Increment the number of consecutive ones seen so far
-      if (++onesCounter == n) {
-        super.encode(2);
-        onesCounter = 0;
-      }
-      return;
-    }
-    // If it's not one - there might have been ones we had to encode prior to
-    // this value
-    while (onesCounter > 0) {
-      --onesCounter;
-      super.encode(1);
-    }
-    // encode value + 1 --> the translation.
-    super.encode(value + 1);
+  protected void reset() {
+    internalBuffer.length = 0;
+    super.reset();
+  }
+
+  @Override
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    // make sure the internal buffer is large enough
+    if (values.length > internalBuffer.ints.length) {
+      internalBuffer.grow(values.length);
+    }
+
+    int onesCounter = 0;
+    for (int i = values.offset; i < upto; i++) {
+      int value = values.ints[i];
+      if (value == 1) {
+        // every N 1's should be encoded as '2'
+        if (++onesCounter == n) {
+          internalBuffer.ints[internalBuffer.length++] = 2;
+          onesCounter = 0;
+        }
+      } else {
+        // there might have been 1's that we need to encode
+        while (onesCounter > 0) {
+          --onesCounter;
+          internalBuffer.ints[internalBuffer.length++] = 1;
+        }
+        // encode value as value+1
+        internalBuffer.ints[internalBuffer.length++] = value + 1;
+      }
+    }
+    // there might have been 1's that we need to encode
+    while (onesCounter > 0) {
+      --onesCounter;
+      internalBuffer.ints[internalBuffer.length++] = 1;
+    }
+    super.doEncode(internalBuffer, buf, internalBuffer.length);
   }

   @Override
@ -101,12 +110,6 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
     return new NOnesIntDecoder(n);
   }

-  @Override
-  public void reInit(OutputStream out) {
-    super.reInit(out);
-    onesCounter = 0;
-  }
-
   @Override
   public String toString() {
     return "NOnes (" + n + ") (" + super.toString() + ")";
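To make the NOnes transformation concrete, here is a small worked trace (editor's sketch, not part of the commit; the class name NOnesTrace is hypothetical):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.NOnesIntEncoder;

public class NOnesTrace {
  public static void main(String[] args) {
    // input: 1 1 1 1 1 2 3 with n = 3
    // stream handed on to FourFlagsIntEncoder: 2 1 1 3 4
    //   - the first three 1's fold into a single '2'
    //   - the remaining two 1's are flushed unchanged when the '2' arrives
    //   - other values are shifted to value+1 (2 -> 3, 3 -> 4)
    IntEncoder encoder = new NOnesIntEncoder(3);
    BytesRef buf = new BytesRef();
    encoder.encode(new IntsRef(new int[] { 1, 1, 1, 1, 1, 2, 3 }, 0, 7), buf);
    System.out.println("encoded " + buf.length + " bytes");
  }
}
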
@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.StreamCorruptedException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,41 +21,24 @@ import java.io.StreamCorruptedException;
  */

 /**
- * A simple stream decoder which can decode values encoded with
- * {@link SimpleIntEncoder}.
+ * Decodes values encoded with {@link SimpleIntEncoder}.
  *
  * @lucene.experimental
  */
 public class SimpleIntDecoder extends IntDecoder {

-  /**
-   * reusable buffer - allocated only once as this is not a thread-safe object
-   */
-  private byte[] buffer = new byte[4];
-
   @Override
-  public long decode() throws IOException {
-    // we need exactly 4 bytes to decode an int in this decoder impl, otherwise, throw an exception
-    int offset = 0;
-    while (offset < 4) {
-      int nRead = in.read(buffer, offset, 4 - offset);
-      if (nRead == -1) {
-        if (offset > 0) {
-          throw new StreamCorruptedException(
-              "Need 4 bytes for decoding an int, got only " + offset);
-        }
-        return EOS;
-      }
-      offset += nRead;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    while (buf.offset < upto) {
+      if (values.length == values.ints.length) {
+        values.grow(values.length + 10); // grow by few items, however not too many
+      }
+      values.ints[values.length++] =
+          ((buf.bytes[buf.offset++] & 0xFF) << 24) |
+          ((buf.bytes[buf.offset++] & 0xFF) << 16) |
+          ((buf.bytes[buf.offset++] & 0xFF) << 8) |
+          (buf.bytes[buf.offset++] & 0xFF);
     }
-
-    int v = buffer[3] & 0xff;
-    v |= (buffer[2] << 8) & 0xff00;
-    v |= (buffer[1] << 16) & 0xff0000;
-    v |= (buffer[0] << 24) & 0xff000000;
-
-    return v;
   }

   @Override
@ -1,6 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -26,22 +27,21 @@ import java.io.IOException;
  */
 public class SimpleIntEncoder extends IntEncoder {

-  /**
-   * This method makes sure the value wasn't previously encoded by checking
-   * against the Set. If the value wasn't encoded, it's added to the Set, and
-   * encoded with {#link Vint8#encode}
-   *
-   * @param value
-   *          an integer to be encoded
-   * @throws IOException
-   *           possibly thrown by the OutputStream
-   */
   @Override
-  public void encode(int value) throws IOException {
-    out.write(value >>> 24);
-    out.write((value >> 16) & 0xFF);
-    out.write((value >> 8) & 0xFF);
-    out.write(value & 0xFF);
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    // ensure there's enough room in the buffer
+    int bytesNeeded = values.length * 4;
+    if (buf.bytes.length < bytesNeeded) {
+      buf.grow(bytesNeeded);
+    }
+
+    for (int i = values.offset; i < upto; i++) {
+      int value = values.ints[i];
+      buf.bytes[buf.length++] = (byte) (value >>> 24);
+      buf.bytes[buf.length++] = (byte) ((value >> 16) & 0xFF);
+      buf.bytes[buf.length++] = (byte) ((value >> 8) & 0xFF);
+      buf.bytes[buf.length++] = (byte) (value & 0xFF);
+    }
   }

   @Override
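The rewritten SimpleIntEncoder keeps the same layout as before, four big-endian bytes per value, just written into a BytesRef instead of an OutputStream. A quick sanity sketch (editor's illustration; SimpleLayoutCheck is a hypothetical name):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.SimpleIntEncoder;

public class SimpleLayoutCheck {
  public static void main(String[] args) {
    BytesRef buf = new BytesRef();
    new SimpleIntEncoder().encode(new IntsRef(new int[] { 0x01020304 }, 0, 1), buf);
    // buf.length == 4; bytes come out 0x01 0x02 0x03 0x04, most significant first
    for (int i = 0; i < buf.length; i++) {
      System.out.printf("%02x ", buf.bytes[i]);
    }
  }
}
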
@ -1,9 +1,10 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.OutputStream;
 import java.util.Arrays;

+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@ -23,47 +24,21 @@ import java.util.Arrays;

 /**
  * An {@link IntEncoderFilter} which sorts the values to encode in ascending
- * order before encoding them. Encoding therefore happens upon calling
- * {@link #close()}. Since this encoder is usually chained with another encoder
- * that relies on sorted values, it does not offer a default constructor.
+ * order before encoding them.
  *
  * @lucene.experimental
  */
 public class SortingIntEncoder extends IntEncoderFilter {

-  private float grow = 2.0f;
-  private int index = 0;
-  private int[] set = new int[1024];
-
   /** Initializes with the given encoder. */
   public SortingIntEncoder(IntEncoder encoder) {
     super(encoder);
   }

   @Override
-  public void close() throws IOException {
-    if (index == 0) {
-      return;
-    }
-
-    Arrays.sort(set, 0, index);
-    for (int i = 0; i < index; i++) {
-      encoder.encode(set[i]);
-    }
-    encoder.close();
-    index = 0;
-
-    super.close();
-  }
-
-  @Override
-  public void encode(int value) throws IOException {
-    if (index == set.length) {
-      int[] newSet = new int[(int) (set.length * grow)];
-      System.arraycopy(set, 0, newSet, 0, set.length);
-      set = newSet;
-    }
-    set[index++] = value;
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    Arrays.sort(values.ints, values.offset, upto);
+    encoder.doEncode(values, buf, upto);
   }

   @Override
@ -71,12 +46,6 @@ public class SortingIntEncoder extends IntEncoderFilter {
     return encoder.createMatchingDecoder();
   }

-  @Override
-  public void reInit(OutputStream out) {
-    super.reInit(out);
-    index = 0;
-  }
-
   @Override
   public String toString() {
     return "Sorting (" + encoder.toString() + ")";
@ -1,7 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -22,7 +22,7 @@ import java.io.OutputStream;

 /**
  * An {@link IntEncoderFilter} which ensures only unique values are encoded. The
- * implementation assumes the values given to {@link #encode(int)} are sorted.
+ * implementation assumes the values given to {@link #encode(IntsRef, BytesRef)} are sorted.
  * If this is not the case, you can chain this encoder with
  * {@link SortingIntEncoder}.
  *
@ -30,26 +30,23 @@ import java.io.OutputStream;
  */
 public final class UniqueValuesIntEncoder extends IntEncoderFilter {

-  /**
-   * Denotes an illegal value which we can use to init 'prev' to. Since all
-   * encoded values are integers, this value is init to MAX_INT+1 and is of type
-   * long. Therefore we are guaranteed not to get this value in encode.
-   */
-  private static final long ILLEGAL_VALUE = Integer.MAX_VALUE + 1;
-
-  private long prev = ILLEGAL_VALUE;
-
   /** Constructs a new instance with the given encoder. */
   public UniqueValuesIntEncoder(IntEncoder encoder) {
     super(encoder);
   }

   @Override
-  public void encode(int value) throws IOException {
-    if (prev != value) {
-      encoder.encode(value);
-      prev = value;
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    int prev = values.ints[values.offset];
+    int idx = values.offset + 1;
+    for (int i = idx; i < upto; i++) {
+      if (values.ints[i] != prev) {
+        values.ints[idx++] = values.ints[i];
+        prev = values.ints[i];
+      }
     }
+    values.length = idx - values.offset;
+    encoder.doEncode(values, buf, idx);
   }

   @Override
@ -57,12 +54,6 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter {
     return encoder.createMatchingDecoder();
   }

-  @Override
-  public void reInit(OutputStream out) {
-    super.reInit(out);
-    prev = ILLEGAL_VALUE;
-  }
-
   @Override
   public String toString() {
     return "Unique (" + encoder.toString() + ")";
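Note that the new doEncode deduplicates in place: it compacts the run of values inside the IntsRef itself and then delegates with the shortened upto. A small sketch for sorted input (editor's illustration; UniqueValuesSketch is a hypothetical name):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
import org.apache.lucene.util.encoding.VInt8IntEncoder;

public class UniqueValuesSketch {
  public static void main(String[] args) {
    IntEncoder unique = new UniqueValuesIntEncoder(new VInt8IntEncoder());
    BytesRef buf = new BytesRef();
    // sorted input 1 1 2 2 3 is compacted in place to 1 2 3 before delegation
    unique.encode(new IntsRef(new int[] { 1, 1, 2, 2, 3 }, 0, 5), buf);
    System.out.println("encoded " + buf.length + " bytes for 3 unique values");
  }
}
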
@ -0,0 +1,138 @@
+package org.apache.lucene.util.encoding;
+
+import org.apache.lucene.util.BytesRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is
+ * encoded as follows:
+ * <ul>
+ * <li>If it is less than 127 and non-negative (i.e., if the number uses only 7
+ * bits), it is encoded as as single byte: 0bbbbbbb.
+ * <li>If its highest nonzero bit is greater than bit 6 (0x40), it is
+ * represented as a series of bytes, each byte's 7 LSB containing bits from the
+ * original value, with the MSB set for all but the last byte. The first encoded
+ * byte contains the highest nonzero bits from the original; the second byte
+ * contains the next 7 MSB; and so on, with the last byte containing the 7 LSB
+ * of the original.
+ * </ul>
+ * Examples:
+ * <ol>
+ * <li>n = 117 = 1110101: This has fewer than 8 significant bits, and so is
+ * encoded as 01110101 = 0x75.
+ * <li>n = 100000 = (binary) 11000011010100000. This has 17 significant bits,
+ * and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits,
+ * then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the
+ * others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20.
+ * </ol>
+ * {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly
+ * handle any 32-bit integer, but for negative numbers, and positive numbers
+ * with more than 28 significant bits, encoding requires 5 bytes; this is not an
+ * efficient encoding scheme for large positive numbers or any negative number.
+ *
+ * @lucene.experimental
+ */
+public class VInt8 {
+
+  /** The maximum number of bytes needed to encode an integer. */
+  public static final int MAXIMUM_BYTES_NEEDED = 5;
+
+  /**
+   * Decodes an int from the given bytes, starting at {@link BytesRef#offset}.
+   * Returns the decoded bytes and updates {@link BytesRef#offset}.
+   */
+  public static int decode(BytesRef bytes) {
+    /*
+    This is the original code of this method, but a Hotspot bug
+    corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975)
+    so the loop was unwounded here too, to be on the safe side
+    int value = 0;
+    while (true) {
+      byte first = bytes.bytes[bytes.offset++];
+      value |= first & 0x7F;
+      if ((first & 0x80) == 0) {
+        return value;
+      }
+      value <<= 7;
+    }
+    */
+
+    // byte 1
+    byte b = bytes.bytes[bytes.offset++];
+    if (b >= 0) return b;
+
+    // byte 2
+    int value = b & 0x7F;
+    b = bytes.bytes[bytes.offset++];
+    value = (value << 7) | b & 0x7F;
+    if (b >= 0) return value;
+
+    // byte 3
+    b = bytes.bytes[bytes.offset++];
+    value = (value << 7) | b & 0x7F;
+    if (b >= 0) return value;
+
+    // byte 4
+    b = bytes.bytes[bytes.offset++];
+    value = (value << 7) | b & 0x7F;
+    if (b >= 0) return value;
+
+    // byte 5
+    b = bytes.bytes[bytes.offset++];
+    return (value << 7) | b & 0x7F;
+  }
+
+  /**
+   * Encodes the given number into bytes, starting at {@link BytesRef#length}.
+   * Assumes that the array is large enough.
+   */
+  public static void encode(int value, BytesRef bytes) {
+    if ((value & ~0x7F) == 0) {
+      bytes.bytes[bytes.length] = (byte) value;
+      bytes.length++;
+    } else if ((value & ~0x3FFF) == 0) {
+      bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+      bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F);
+      bytes.length += 2;
+    } else if ((value & ~0x1FFFFF) == 0) {
+      bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+      bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+      bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F);
+      bytes.length += 3;
+    } else if ((value & ~0xFFFFFFF) == 0) {
+      bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
+      bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+      bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+      bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F);
+      bytes.length += 4;
+    } else {
+      bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28));
+      bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
+      bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+      bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+      bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F);
+      bytes.length += 5;
+    }
+  }
+
+  private VInt8() {
+    // Just making it impossible to instantiate.
+  }
+
+}
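A round-trip sketch using the worked example from the javadoc above (editor's illustration; VInt8RoundTrip is a hypothetical name):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.encoding.VInt8;

public class VInt8RoundTrip {
  public static void main(String[] args) {
    BytesRef bytes = new BytesRef(new byte[VInt8.MAXIMUM_BYTES_NEEDED]);
    bytes.length = 0;                // encode() appends starting at bytes.length
    VInt8.encode(100000, bytes);     // writes 0x86 0x8D 0x20, bytes.length == 3
    bytes.offset = 0;
    int value = VInt8.decode(bytes); // 100000; advances bytes.offset past the vint
    System.out.println(value);
  }
}
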
@ -1,6 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -20,33 +21,19 @@ import java.io.IOException;
  */

 /**
- * An {@link IntDecoder} which can decode values encoded by
- * {@link VInt8IntEncoder}.
+ * Decodes values encoded by {@link VInt8IntEncoder}.
  *
  * @lucene.experimental
  */
 public class VInt8IntDecoder extends IntDecoder {

-  private boolean legalEOS = true;
-
   @Override
-  public long decode() throws IOException {
-    int value = 0;
-    while (true) {
-      int first = in.read();
-      if (first < 0) {
-        if (!legalEOS) {
-          throw new IOException("Unexpected End-Of-Stream");
-        }
-        return EOS;
-      }
-      value |= first & 0x7F;
-      if ((first & 0x80) == 0) {
-        legalEOS = true;
-        return value;
-      }
-      legalEOS = false;
-      value <<= 7;
+  protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+    while (buf.offset < upto) {
+      if (values.length == values.ints.length) {
+        values.grow(values.length + 10); // grow by few items, however not too many
+      }
+      values.ints[values.length++] = VInt8.decode(buf);
     }
   }

@ -1,6 +1,7 @@
 package org.apache.lucene.util.encoding;

-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@ -49,27 +50,14 @@ import java.io.IOException;
 public class VInt8IntEncoder extends IntEncoder {

   @Override
-  public void encode(int value) throws IOException {
-    if ((value & ~0x7F) == 0) {
-      out.write(value);
-    } else if ((value & ~0x3FFF) == 0) {
-      out.write(0x80 | (value >> 7));
-      out.write(0x7F & value);
-    } else if ((value & ~0x1FFFFF) == 0) {
-      out.write(0x80 | (value >> 14));
-      out.write(0x80 | (value >> 7));
-      out.write(0x7F & value);
-    } else if ((value & ~0xFFFFFFF) == 0) {
-      out.write(0x80 | (value >> 21));
-      out.write(0x80 | (value >> 14));
-      out.write(0x80 | (value >> 7));
-      out.write(0x7F & value);
-    } else {
-      out.write(0x80 | (value >> 28));
-      out.write(0x80 | (value >> 21));
-      out.write(0x80 | (value >> 14));
-      out.write(0x80 | (value >> 7));
-      out.write(0x7F & value);
-    }
+  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+    int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt
+    if (buf.bytes.length < maxBytesNeeded) {
+      buf.grow(maxBytesNeeded);
+    }
+
+    for (int i = values.offset; i < upto; i++) {
+      VInt8.encode(values.ints[i], buf);
+    }
   }

@ -25,49 +25,8 @@ mechanisms to create new ones. The super class for all encoders is
 encoders there is a matching {@link
 org.apache.lucene.util.encoding.IntDecoder} implementation (not all
 encoders need a decoder).
-<p>An encoder encodes the integers that are passed to {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} into a
-set output stream (see {@link
-org.apache.lucene.util.encoding.IntEncoder#reInit(OutputStream)
-reInit}). One should always call {@link
-org.apache.lucene.util.encoding.IntEncoder#close() close} when all
-integers have been encoded, to ensure proper finish by the encoder. Some
-encoders buffer values in-memory and encode in batches in order to
-optimize the encoding, and not closing them may result in loss of
-information or corrupt stream.
-<p>A proper and typical usage of an encoder looks like this:
-<blockquote><pre class="prettyprint"><code>
-int[] data = <the values to encode>
-IntEncoder encoder = new VInt8IntEncoder();
-OutputStream out = new ByteArrayOutputStream();
-encoder.reInit(out);
-for (int val : data) {
-  encoder.encode(val);
-}
-encoder.close();
-
-// Print the bytes in binary
-byte[] bytes = out.toByteArray();
-for (byte b : bytes) {
-  System.out.println(Integer.toBinaryString(b));
-}
-</code></pre></blockquote>
-Each encoder also implements {@link
-org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
-createMatchingDecoder} which returns the matching decoder for this encoder.
-As mentioned above, not all encoders have a matching decoder (like some
-encoder filters which are explained next), however every encoder should
-return a decoder following a call to that method. To complete the
-example above, one can easily iterate over the decoded values like this:
-<blockquote><pre class="prettyprint"><code>
-IntDecoder d = e.createMatchingDecoder();
-d.reInit(new ByteArrayInputStream(bytes));
-long val;
-while ((val = d.decode()) != IntDecoder.EOS) {
-  System.out.println(val);
-}
-</code></pre></blockquote>
-<p>Some encoders don't perform any encoding at all, or do not include an
+<p>
+Some encoders don't perform any encoding at all, or do not include an
 encoding logic. Those are called {@link
 org.apache.lucene.util.encoding.IntEncoderFilter}s. A filter is an
 encoder which delegates the encoding task to a given encoder, however
@ -76,91 +35,6 @@ example is {@link org.apache.lucene.util.encoding.DGapIntEncoder}
 which encodes the gaps between values rather than the values themselves.
 Another example is {@link
 org.apache.lucene.util.encoding.SortingIntEncoder} which sorts all the
-values in ascending order before they are sent for encoding. This
-encoder aggregates the values in its {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} implementation
-and decoding only happens upon calling {@link
-org.apache.lucene.util.encoding.IntEncoder#close() close}.
-<h4>Extending IntEncoder</h4>
-Extending {@link org.apache.lucene.util.encoding.IntEncoder} is a very
-easy task. One only needs to implement {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} and
-{@link org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
-createMatchingDecoder} as the base implementation takes care of
-re-initializing the output stream and closing it. The following example
-illustrates how can one write an encoder (and a matching decoder) which
-'tags' the stream with type/ID of the encoder. Such tagging is important
-in scenarios where an application uses different encoders for different
-streams, and wants to manage some sort of mapping between an encoder ID
-to an IntEncoder/Decoder implementation, so a proper decoder will be
-initialized on the fly:
-<blockquote><pre class="prettyprint"><code>
-public class TaggingIntEncoder extends IntEncoderFilter {
-
-  public TaggingIntEncoder(IntEncoder encoder) {
-    super(encoder);
-  }
-
-  @Override
-  public void encode(int value) throws IOException {
-    encoder.encode(value);
-  }
-
-  @Override
-  public IntDecoder createMatchingDecoder() {
-    return new TaggingIntDecoder();
-  }
-
-  @Override
-  public void reInit(OutputStream out) {
-    super.reInit(os);
-    // Assumes the application has a static EncodersMap class which is able to
-    // return a unique ID for a given encoder.
-    int encoderID = EncodersMap.getID(encoder);
-    this.out.write(encoderID);
-  }
-
-  @Override
-  public String toString() {
-    return "Tagging (" + encoder.toString() + ")";
-  }
-
-}
-</code></pre></blockquote>
-And the matching decoder:
-<blockquote><pre class="prettyprint"><code>
-public class TaggingIntDecoder extends IntDecoder {
-
-  // Will be initialized upon calling reInit.
-  private IntDecoder decoder;
-
-  @Override
-  public void reInit(InputStream in) {
-    super.reInit(in);
-
-    // Read the ID of the encoder that tagged this stream.
-    int encoderID = in.read();
-
-    // Assumes EncodersMap can return the proper IntEncoder given the ID.
-    decoder = EncodersMap.getEncoder(encoderID).createMatchingDecoder();
-  }
-
-  @Override
-  public long decode() throws IOException {
-    return decoder.decode();
-  }
-
-  @Override
-  public String toString() {
-    return "Tagging (" + decoder == null ? "none" : decoder.toString() + ")";
-  }
-
-}
-</code></pre></blockquote>
-The example implements <code>TaggingIntEncoder</code> as a filter over another
-encoder. Even though it does not do any filtering on the actual values, it feels
-right to present it as a filter. Anyway, this is just an example code and one
-can choose to implement it however it makes sense to the application. For
-simplicity, error checking was omitted from the sample code.
 </body>
 </html>
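The usage examples removed from package.html above were written against the old stream API. A corresponding sketch against the new bulk API (editor's illustration, not part of the commit; ChainExample is a hypothetical name, and it assumes IntDecoder gained a decode(BytesRef, IntsRef) counterpart to encode, as the doDecode signatures in this patch suggest):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.DGapIntEncoder;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.SortingIntEncoder;
import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
import org.apache.lucene.util.encoding.VInt8IntEncoder;

public class ChainExample {
  public static void main(String[] args) {
    // filters chain exactly as before: sort, drop duplicates, d-gap, then VInt8
    IntEncoder encoder = new SortingIntEncoder(
        new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
    BytesRef buf = new BytesRef();
    encoder.encode(new IntsRef(new int[] { 9, 2, 4, 2 }, 0, 4), buf);

    // assumed decode(BytesRef, IntsRef) entry point on the matching decoder
    IntsRef decoded = new IntsRef();
    encoder.createMatchingDecoder().decode(buf, decoded); // 2, 4, 9
    System.out.println(decoded);
  }
}
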
@ -4,7 +4,7 @@ import org.junit.Test;

 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.facet.example.ExampleResult;
-import org.apache.lucene.facet.example.association.AssociationMain;
+import org.apache.lucene.facet.example.association.CategoryAssociationsMain;
 import org.apache.lucene.facet.search.results.FacetResultNode;

 /*
@ -35,8 +35,8 @@ public class TestAssociationExample extends LuceneTestCase {

   @Test
   public void testAssociationExamples() throws Exception {
-    assertExampleResult(new AssociationMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS);
-    assertExampleResult(new AssociationMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS);
+    assertExampleResult(new CategoryAssociationsMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS);
+    assertExampleResult(new CategoryAssociationsMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS);
   }

   private void assertExampleResult(ExampleResult res, double[] expectedResults) {
@ -19,8 +19,8 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnsafeByteArrayOutputStream;
 import org.apache.lucene.util.encoding.DGapIntEncoder;
 import org.apache.lucene.util.encoding.IntEncoder;
 import org.apache.lucene.util.encoding.SortingIntEncoder;
@ -49,17 +49,19 @@ public class CategoryListIteratorTest extends LuceneTestCase {

   private static final class DataTokenStream extends TokenStream {

+    private final PayloadAttribute payload = addAttribute(PayloadAttribute.class);
+    private final BytesRef buf;
+    private final IntEncoder encoder;
+    private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+
     private int idx;
-    private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
-    private byte[] buf = new byte[20];
-    UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buf);
-    IntEncoder encoder;
     private boolean exhausted = false;
-    private CharTermAttribute term = addAttribute(CharTermAttribute.class);

     public DataTokenStream(String text, IntEncoder encoder) {
       this.encoder = encoder;
       term.setEmpty().append(text);
+      buf = new BytesRef();
+      payload.setPayload(buf);
     }

     public void setIdx(int idx) {
@ -73,30 +75,26 @@ public class CategoryListIteratorTest extends LuceneTestCase {
         return false;
       }

-      int[] values = data[idx];
-      ubaos.reInit(buf);
-      encoder.reInit(ubaos);
-      for (int val : values) {
-        encoder.encode(val);
-      }
-      encoder.close();
-      payload.setPayload(new BytesRef(buf, 0, ubaos.length()));
-
+      // must copy because encoders may change the buffer
+      encoder.encode(IntsRef.deepCopyOf(data[idx]), buf);
       exhausted = true;
       return true;
     }

   }

-  static final int[][] data = new int[][] {
-    new int[] { 1, 2 }, new int[] { 3, 4 }, new int[] { 1, 3 }, new int[] { 1, 2, 3, 4 },
+  static final IntsRef[] data = new IntsRef[] {
+    new IntsRef(new int[] { 1, 2 }, 0, 2),
+    new IntsRef(new int[] { 3, 4 }, 0, 2),
+    new IntsRef(new int[] { 1, 3 }, 0, 2),
+    new IntsRef(new int[] { 1, 2, 3, 4 }, 0, 4)
   };

   @Test
-  public void testPayloadIntDecodingIterator() throws Exception {
+  public void testPayloadCategoryListIteraor() throws Exception {
     Directory dir = newDirectory();
-    DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder(
-        new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))));
+    final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
+    DataTokenStream dts = new DataTokenStream("1",encoder);
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT,
         new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy()));
     for (int i = 0; i < data.length; i++) {
@ -108,21 +106,21 @@ public class CategoryListIteratorTest extends LuceneTestCase {
     IndexReader reader = writer.getReader();
     writer.close();

-    CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term(
-        "f","1"), dts.encoder.createMatchingDecoder());
+    IntsRef ordinals = new IntsRef();
+    CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
     cli.init();
     int totalCategories = 0;
     for (int i = 0; i < data.length; i++) {
       Set<Integer> values = new HashSet<Integer>();
       for (int j = 0; j < data[i].length; j++) {
-        values.add(data[i][j]);
+        values.add(data[i].ints[j]);
       }
-      cli.skipTo(i);
-      long cat;
-      while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) {
-        assertTrue("expected category not found: " + cat, values.contains((int) cat));
-        totalCategories ++;
+      cli.getOrdinals(i, ordinals);
+      assertTrue("no ordinals for document " + i, ordinals.length > 0);
+      for (int j = 0; j < ordinals.length; j++) {
+        assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
       }
+      totalCategories += ordinals.length;
     }
     assertEquals("Missing categories!",10,totalCategories);
     reader.close();
@ -135,8 +133,8 @@ public class CategoryListIteratorTest extends LuceneTestCase {
   @Test
   public void testPayloadIteratorWithInvalidDoc() throws Exception {
     Directory dir = newDirectory();
-    DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder(
-        new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))));
+    final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
+    DataTokenStream dts = new DataTokenStream("1", encoder);
     // this test requires that no payloads ever be randomly present!
     final Analyzer noPayloadsAnalyzer = new Analyzer() {
       @Override
@ -162,30 +160,27 @@ public class CategoryListIteratorTest extends LuceneTestCase {
     IndexReader reader = writer.getReader();
     writer.close();

-    CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term(
-        "f","1"), dts.encoder.createMatchingDecoder());
+    IntsRef ordinals = new IntsRef();
+    CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
     assertTrue("Failed to initialize payload iterator", cli.init());
-    int totalCats = 0;
+    int totalCategories = 0;
     for (int i = 0; i < data.length; i++) {
-      // doc no. i
       Set<Integer> values = new HashSet<Integer>();
       for (int j = 0; j < data[i].length; j++) {
-        values.add(data[i][j]);
+        values.add(data[i].ints[j]);
       }
-      boolean hasDoc = cli.skipTo(i);
-      if (hasDoc) {
-        assertTrue("Document " + i + " must not have a payload!", i == 0);
-        long cat;
-        while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) {
-          assertTrue("expected category not found: " + cat, values.contains((int) cat));
-          ++totalCats;
+      cli.getOrdinals(i, ordinals);
+      if (i == 0) {
+        assertTrue("document 0 must have a payload", ordinals.length > 0);
+        for (int j = 0; j < ordinals.length; j++) {
+          assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
         }
+        totalCategories += ordinals.length;
       } else {
-        assertFalse("Document " + i + " must have a payload!", i == 0);
+        assertTrue("only document 0 should have a payload", ordinals.length == 0);
       }
     }
-    assertEquals("Wrong number of total categories!", 2, totalCats);
+    assertEquals("Wrong number of total categories!", 2, totalCategories);

     reader.close();
     dir.close();
@ -90,7 +90,9 @@ public class DrillDownTest extends LuceneTestCase {
         paths.add(new CategoryPath("b"));
       }
       FacetFields facetFields = new FacetFields(taxoWriter);
-      facetFields.addFields(doc, paths);
+      if (paths.size() > 0) {
+        facetFields.addFields(doc, paths);
+      }
       writer.addDocument(doc);
     }

@ -6,6 +6,7 @@ import java.util.List;
 import java.util.Map;

 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.util.IntsRef;

 import org.junit.After;
 import org.junit.Before;
@ -118,18 +119,17 @@ public class TestCategoryListCache extends FacetTestBase {
       @Override
       public CategoryListIterator iterator(int partition) throws IOException {
         final CategoryListIterator it = cld.iterator(partition);
         return new CategoryListIterator() {
           @Override
-          public boolean skipTo(int docId) throws IOException {
-            return it.skipTo(docId);
-          }
-          @Override
-          public long nextCategory() throws IOException {
-            long res = it.nextCategory();
-            if (res>Integer.MAX_VALUE) {
-              return res;
+          public void getOrdinals(int docID, IntsRef ints) throws IOException {
+            it.getOrdinals(docID, ints);
+            for (int i = 0; i < ints.length; i++) {
+              if (ints.ints[i] > 1) {
+                ints.ints[i]--;
+              } else {
+                ints.ints[i]++;
+              }
             }
-            return res>1 ? res-1 : res+1;
           }
           @Override
           public boolean init() throws IOException {
@@ -0,0 +1,126 @@
package org.apache.lucene.facet.search.params;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

import org.apache.lucene.document.Document;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.PerDimensionIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.PayloadCategoryListIteraor;
import org.apache.lucene.facet.search.cache.CategoryListCache;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.MultiCategoryListIterator;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.encoding.IntDecoder;
import org.junit.Test;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class MultiCategoryListIteratorTest extends LuceneTestCase {

  @Test
  public void testMultipleCategoryLists() throws Exception {
    Random random = random();
    int numDimensions = atLeast(random, 2); // at least 2 dimensions
    String[] dimensions = new String[numDimensions];
    for (int i = 0; i < numDimensions; i++) {
      dimensions[i] = "dim" + i;
    }

    // build the PerDimensionIndexingParams
    HashMap<CategoryPath,CategoryListParams> clps = new HashMap<CategoryPath,CategoryListParams>();
    for (String dim : dimensions) {
      CategoryPath cp = new CategoryPath(dim);
      CategoryListParams clp = new CategoryListParams(new Term("$" + dim, CategoryListParams.DEFAULT_TERM.bytes()));
      clps.put(cp, clp);
    }
    PerDimensionIndexingParams indexingParams = new PerDimensionIndexingParams(clps);

    // index some documents
    Directory indexDir = newDirectory();
    Directory taxoDir = newDirectory();
    IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, null).setMaxBufferedDocs(2));
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    FacetFields facetFields = new FacetFields(taxoWriter, indexingParams);
    int ndocs = atLeast(random, 10);
    for (int i = 0; i < ndocs; i++) {
      Document doc = new Document();
      int numCategories = random.nextInt(numDimensions) + 1;
      ArrayList<CategoryPath> categories = new ArrayList<CategoryPath>();
      for (int j = 0; j < numCategories; j++) {
        String dimension = dimensions[random.nextInt(dimensions.length)];
        categories.add(new CategoryPath(dimension, Integer.toString(i)));
      }
      facetFields.addFields(doc, categories);
      indexWriter.addDocument(doc);
    }
    IOUtils.close(indexWriter, taxoWriter);

    // test the multi iterator
    CategoryListCache clCache = null;
    if (random.nextBoolean()) {
      clCache = new CategoryListCache();
    }

    DirectoryReader indexReader = DirectoryReader.open(indexDir);
    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    CategoryListIterator[] iterators = new CategoryListIterator[numDimensions];
    for (int i = 0; i < iterators.length; i++) {
      CategoryListParams clp = indexingParams.getCategoryListParams(new CategoryPath(dimensions[i]));
      IntDecoder decoder = clp.createEncoder().createMatchingDecoder();
      if (clCache != null && random.nextBoolean()) {
        clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams);
        iterators[i] = clCache.get(clp).iterator(0); // no partitions
      } else {
        iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder);
      }
    }
    MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators);
    assertTrue("failed to init multi-iterator", cli.init());
    IntsRef ordinals = new IntsRef();
    int maxDoc = indexReader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      cli.getOrdinals(i, ordinals);
      assertTrue("document " + i + " does not have categories", ordinals.length > 0);
      for (int j = 0; j < ordinals.length; j++) {
        CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
        assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp);
        if (cp.length == 2) {
          assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1]));
        }
      }
    }

    IOUtils.close(indexReader, taxoReader);
    IOUtils.close(indexDir, taxoDir);
  }

}
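The test above drives MultiCategoryListIterator with one per-dimension iterator. A minimal sketch (my reconstruction, not the Lucene class itself) of the merge such a multi-iterator implies: collect the bulk ordinals of every sub-iterator for a document into one result buffer.

  // Hedged sketch of a multi-iterator merge; the real MultiCategoryListIterator
  // may differ. Assumes org.apache.lucene.util.IntsRef and java.io.IOException.
  static void getOrdinals(CategoryListIterator[] iterators, int doc, IntsRef result)
      throws IOException {
    result.length = 0;
    IntsRef scratch = new IntsRef(32);
    for (CategoryListIterator it : iterators) {
      it.getOrdinals(doc, scratch);                    // bulk-decode one category list
      if (result.ints.length < result.length + scratch.length) {
        result.grow(result.length + scratch.length);   // make room before appending
      }
      System.arraycopy(scratch.ints, scratch.offset, result.ints, result.length, scratch.length);
      result.length += scratch.length;
    }
  }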
@@ -1,271 +0,0 @@
package org.apache.lucene.facet.search.params;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetResultsHandler;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.StandardFacetsAccumulator;
import org.apache.lucene.facet.search.TopKFacetResultsHandler;
import org.apache.lucene.facet.search.cache.CategoryListCache;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Test faceted search with creation of multiple category list iterators by the
 * same CLP, depending on the provided facet request
 */
public class MultiIteratorsPerCLParamsTest extends LuceneTestCase {

  CategoryPath[][] perDocCategories = new CategoryPath[][] {
      { new CategoryPath("author", "Mark Twain"),
        new CategoryPath("date", "2010") },
      { new CategoryPath("author", "Robert Frost"),
        new CategoryPath("date", "2009") },
      { new CategoryPath("author", "Artur Miller"),
        new CategoryPath("date", "2010") },
      { new CategoryPath("author", "Edgar Allan Poe"),
        new CategoryPath("date", "2009") },
      { new CategoryPath("author", "Henry James"),
        new CategoryPath("date", "2010") } };

  String countForbiddenDimension;

  @Test
  public void testCLParamMultiIteratorsByRequest() throws Exception {
    doTestCLParamMultiIteratorsByRequest(false);
  }

  @Test
  public void testCLParamMultiIteratorsByRequestCacheCLI() throws Exception {
    doTestCLParamMultiIteratorsByRequest(true);
  }

  private void doTestCLParamMultiIteratorsByRequest(boolean cacheCLI) throws Exception {
    // Create a CLP which generates different CLIs according to the
    // FacetRequest's dimension
    CategoryListParams clp = new CategoryListParams();
    FacetIndexingParams iParams = new FacetIndexingParams(clp);
    Directory indexDir = newDirectory();
    Directory taxoDir = newDirectory();
    populateIndex(iParams, indexDir, taxoDir);

    TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);
    IndexReader reader = DirectoryReader.open(indexDir);

    CategoryListCache clCache = null;
    if (cacheCLI) {
      // caching the iterator, so:
      // 1: create the cached iterator, using original params
      clCache = new CategoryListCache();
      clCache.loadAndRegister(clp, reader, taxo, iParams);
    }

    ScoredDocIDs allDocs = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader);

    // Search index with 'author' should filter ONLY ordinals whose parent
    // is 'author'
    countForbiddenDimension = "date";
    validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "author", 5, 5);

    // Search index with 'date' should filter ONLY ordinals whose parent is
    // 'date'
    countForbiddenDimension = "author";
    validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "date", 5, 2);

    // Search index with both 'date' and 'author'
    countForbiddenDimension = null;
    validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, new String[] {
        "author", "date" }, new int[] { 5, 5 }, new int[] { 5, 2 });
    taxo.close();
    reader.close();
    indexDir.close();
    taxoDir.close();
  }

  private void validateFacetedSearch(FacetIndexingParams iParams,
      TaxonomyReader taxo, IndexReader reader, CategoryListCache clCache,
      ScoredDocIDs allDocs, String dimension, int expectedValue, int expectedNumDescendants) throws IOException {
    validateFacetedSearch(iParams, taxo, reader, clCache, allDocs,
        new String[] { dimension }, new int[] { expectedValue },
        new int[] { expectedNumDescendants });
  }

  private void validateFacetedSearch(FacetIndexingParams iParams,
      TaxonomyReader taxo, IndexReader reader, final CategoryListCache clCache,
      ScoredDocIDs allDocs, String[] dimension, int[] expectedValue,
      int[] expectedNumDescendants) throws IOException {
    List<FacetRequest> facetRequests = new ArrayList<FacetRequest>();
    for (String dim : dimension) {
      facetRequests.add(new PerDimCountFacetRequest(new CategoryPath(dim), 10));
    }
    FacetSearchParams sParams = new FacetSearchParams(facetRequests, iParams) {
      @Override
      public CategoryListCache getCategoryListCache() {
        return clCache;
      }
    };
    FacetsAccumulator acc = new StandardFacetsAccumulator(sParams, reader, taxo);

    // no use to test this with complement since at that mode all facets are taken
    acc.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT);

    List<FacetResult> results = acc.accumulate(allDocs);
    assertEquals("Wrong #results", dimension.length, results.size());

    for (int i = 0; i < results.size(); i++) {
      FacetResult res = results.get(i);
      assertEquals("wrong num-descendants for dimension " + dimension[i],
          expectedNumDescendants[i], res.getNumValidDescendants());
      FacetResultNode resNode = res.getFacetResultNode();
      assertEquals("wrong value for dimension " + dimension[i],
          expectedValue[i], (int) resNode.getValue());
    }
  }

  private void populateIndex(FacetIndexingParams iParams, Directory indexDir,
      Directory taxoDir) throws Exception {
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexDir,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)));
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);

    FacetFields facetFields = new FacetFields(taxoWriter, iParams);
    for (CategoryPath[] categories : perDocCategories) {
      Document doc = new Document();
      facetFields.addFields(doc, Arrays.asList(categories));
      writer.addDocument(doc);
    }
    taxoWriter.commit();
    writer.commit();
    taxoWriter.close();
    writer.close();
  }

  private class PerDimCountFacetRequest extends CountFacetRequest {

    public PerDimCountFacetRequest(CategoryPath path, int num) {
      super(path, num);
    }

    @Override
    public CategoryListIterator createCategoryListIterator(IndexReader reader,
        TaxonomyReader taxo, FacetSearchParams sParams, int partition) throws IOException {
      // categories of certain dimension only
      return new PerDimensionCLI(taxo, super.createCategoryListIterator(
          reader, taxo, sParams, partition), getCategoryPath());
    }

    @Override
    /** Override this method just for verifying that only specified facets are iterated. */
    public FacetResultsHandler createFacetResultsHandler(
        TaxonomyReader taxonomyReader) {
      return new TopKFacetResultsHandler(taxonomyReader, this) {
        @Override
        public IntermediateFacetResult fetchPartitionResult(
            FacetArrays facetArrays, int offset) throws IOException {
          final IntermediateFacetResult res = super.fetchPartitionResult(facetArrays, offset);
          if (countForbiddenDimension != null) {
            int ord = taxonomyReader.getOrdinal(new CategoryPath(countForbiddenDimension));
            assertEquals("Should not have accumulated for dimension '" + countForbiddenDimension + "'!", 0, facetArrays.getIntArray()[ord]);
          }
          return res;
        }
      };
    }
  }

  /**
   * a CLI which filters another CLI for the dimension of the provided
   * category-path
   */
  private static class PerDimensionCLI implements CategoryListIterator {
    private final CategoryListIterator superCLI;
    private final int[] parentArray;
    private final int parentOrdinal;

    PerDimensionCLI(TaxonomyReader taxo, CategoryListIterator superCLI,
        CategoryPath requestedPath) throws IOException {
      this.superCLI = superCLI;
      if (requestedPath == null) {
        parentOrdinal = 0;
      } else {
        CategoryPath cp = new CategoryPath(requestedPath.components[0]);
        parentOrdinal = taxo.getOrdinal(cp);
      }
      parentArray = taxo.getParallelTaxonomyArrays().parents();
    }

    @Override
    public boolean init() throws IOException {
      return superCLI.init();
    }

    @Override
    public long nextCategory() throws IOException {
      long next;
      while ((next = superCLI.nextCategory()) <= Integer.MAX_VALUE
          && !isInDimension((int) next)) {
      }

      return next;
    }

    /** look for original parent ordinal, meaning same dimension */
    private boolean isInDimension(int ordinal) {
      while (ordinal > 0) {
        if (ordinal == parentOrdinal) {
          return true;
        }
        ordinal = parentArray[ordinal];
      }
      return false;
    }

    @Override
    public boolean skipTo(int docId) throws IOException {
      return superCLI.skipTo(docId);
    }
  }
}
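The removed PerDimensionCLI filters ordinals by walking the taxonomy's parents() array. The check, isolated for clarity (same logic as the deleted isInDimension above, just lifted out of the class):

  // The dimension test from the removed PerDimensionCLI: walk up the taxonomy
  // parent array until we either meet the dimension's root ordinal or fall off
  // at the taxonomy root (ordinal 0).
  static boolean isInDimension(int ordinal, int parentOrdinal, int[] parentArray) {
    while (ordinal > 0) {
      if (ordinal == parentOrdinal) {
        return true;             // some ancestor is the requested dimension
      }
      ordinal = parentArray[ordinal];
    }
    return false;                // reached the root without meeting it
  }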
@@ -1,141 +0,0 @@
package org.apache.lucene.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.junit.Test;

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Vint8;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Tests the {@link Vint8} class.
 */
public class Vint8Test extends LuceneTestCase {

  /**
   * Tests the position wrapper.
   * @throws Exception For any reason.
   */
  @Test
  public void testPosition() throws Exception {
    Vint8.Position pos = new Vint8.Position();
    assertEquals(0, pos.pos);
    pos = new Vint8.Position(12345);
    assertEquals(12345, pos.pos);
  }

  private static int[] testValues = {
    -1000000000,
    -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14,
    (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28
  };
  private static int[] bytesNeededTestValues = {
    5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5
  };

  /**
   * Tests the {@code bytesNeeded} method.
   */
  @Test
  public void testBytesNeeded() {
    assertEquals(5, Vint8.MAXIMUM_BYTES_NEEDED);
    for (int j = 0; j < testValues.length; j++) {
      assertEquals(bytesNeededTestValues[j], Vint8.bytesNeeded(testValues[j]));
    }
  }

  /**
   * Tests encoding and decoding to and from a stream.
   */
  @Test
  public void testStreamEncodingAndDecoding() throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(256);
    int expectedSize = 0;
    for (int j = 0; j < testValues.length; j++) {
      Vint8.encode(testValues[j], baos);
      expectedSize += bytesNeededTestValues[j];
    }
    assertEquals(expectedSize, baos.size());
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    for (int j = 0; j < testValues.length; j++) {
      assertEquals(testValues[j], Vint8.decode(bais));
    }
    assertEquals(0, bais.available());
  }

  /**
   * Tests encoding and decoding to and from an array.
   */
  @Test
  public void testArrayEncodingAndDecoding() throws IOException {
    byte[] byteArray = new byte[256];
    int position = 0, expectedSize = 0;
    for (int j = 0; j < testValues.length; j++) {
      position += Vint8.encode(testValues[j], byteArray, position);
      expectedSize += bytesNeededTestValues[j];
    }
    assertEquals(expectedSize, position);
    Vint8.Position pos = new Vint8.Position();
    for (int j = 0; j < testValues.length; j++) {
      assertEquals(testValues[j], Vint8.decode(byteArray, pos));
    }
    assertEquals(expectedSize, pos.pos);
  }

  /**
   * The result of encoding the test values with the current algorithm. If these
   * values are changed to match an algorithm change, compatibility with legacy
   * data will be broken.
   */
  private static final byte[] encodedTestValues = {
    -4, -93, -108, -20, 0, -1, -1, -1, -1, 127, 0, 127, -127, 0, -1, 127,
    -127, -128, 0, -1, -1, 127, -127, -128, -128, 0, -1, -1, -1, 127, -127,
    -128, -128, -128, 0
  };

  /**
   * Tests algorithm.
   */
  @Test
  public void testLegacyCompatibility() throws IOException {
    /* To generate the encoded test values:
    byte[] byteArray = new byte[256];
    int position = 0, expectedSize = 0;
    for (int j = 0; j < testValues.length; j++) {
      position += Vint8.encode(testValues[j], byteArray, position);
      expectedSize += bytesNeededTestValues[j];
    }
    assertEquals(expectedSize, position);
    Vint8.Position pos = new Vint8.Position();
    for (int j = 0; j < expectedSize; j++) {
      System.out.print(byteArray[j] + ", ");
    }
    System.out.flush();
    pos.pos = 0;
    */
    Vint8.Position pos = new Vint8.Position();
    for (int j = 0; j < testValues.length; j++) {
      assertEquals(testValues[j], Vint8.decode(encodedTestValues, pos));
    }
  }

} // end class Vint8Test
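The deleted test pins down the byte counts in bytesNeededTestValues (1 to 5 bytes per int). A minimal sketch of the arithmetic those counts imply, seven payload bits per byte with any negative int taking the full five bytes; this is my reconstruction of the rule the test asserts, not the removed Vint8 source:

  // Hedged reconstruction: one byte per started 7-bit group for non-negative
  // ints; a negative int has its top bit set and always needs 5 bytes.
  static int bytesNeeded(int value) {
    if (value < 0) {
      return 5;                  // matches the test rows for -1 and -1000000000
    }
    int n = 1;
    while ((value >>>= 7) != 0) {
      n++;                       // one more byte per additional 7-bit group
    }
    return n;                    // e.g. 127 -> 1, 128 -> 2, 1 << 28 -> 5
  }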
@@ -1,21 +1,12 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.text.NumberFormat;
 import java.util.Arrays;
 import java.util.Locale;
 
-import org.apache.lucene.util.encoding.DGapIntEncoder;
-import org.apache.lucene.util.encoding.EightFlagsIntEncoder;
-import org.apache.lucene.util.encoding.FourFlagsIntEncoder;
-import org.apache.lucene.util.encoding.IntDecoder;
-import org.apache.lucene.util.encoding.IntEncoder;
-import org.apache.lucene.util.encoding.NOnesIntEncoder;
-import org.apache.lucene.util.encoding.SortingIntEncoder;
-import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
-import org.apache.lucene.util.encoding.VInt8IntEncoder;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -40,8 +31,8 @@ public class EncodingSpeed {
   private static int[] data9910 = null;
   private static int[] data501871 = null;
   private static int[] data10k = null;
-  private static String resultsFormat = "%-20s %10s %20d %26s %20d %26s";
-  private static String headerFormat = "%-20s %10s %20s %26s %20s %26s";
+  private static String resultsFormat = "%-60s %10s %20d %26s %20d %26s";
+  private static String headerFormat = "%-60s %10s %20s %26s %20s %26s";
   private static int integers = 100000000;
 
   private static NumberFormat nf;
@@ -53,8 +44,14 @@ public class EncodingSpeed {
     testFacetIDs(data501871, 501871);
   }
 
-  private static void testFacetIDs(int[] facetIDs, int docID)
-      throws IOException {
+  private static IntsRef newIntsRef(int[] data) {
+    IntsRef res = new IntsRef(data.length);
+    System.arraycopy(data, 0, res.ints, 0, data.length);
+    res.length = data.length;
+    return res;
+  }
+
+  private static void testFacetIDs(int[] facetIDs, int docID) throws IOException {
     int loopFactor = integers / facetIDs.length;
     System.out
         .println("\nEstimating ~"
@@ -88,68 +85,53 @@ public class EncodingSpeed {
     System.out.println();
   }
 
-  private static void encoderTest(IntEncoder encoder, int[] data,
-      int loopFactor) throws IOException {
-
-    long startTime, endTime;
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+  private static void encoderTest(IntEncoder encoder, int[] values, int loopFactor) throws IOException {
+
+    BytesRef bytes = new BytesRef(values.length); // at least one byte per value
 
     // -- Looping 100 times as a warm up --------------------------
     for (int i = 100; i != 0; --i) {
-      baos.reset();
-      encoder.reInit(baos);
-      for (int value : data) {
-        encoder.encode(value);
-      }
-      encoder.close();
+      IntsRef data = newIntsRef(values);
+      encoder.encode(data, bytes);
     }
     // -----------------------------------------------------------
 
-    startTime = System.currentTimeMillis();
+    long encodeTime = 0;
     for (int factor = loopFactor; factor > 0; --factor) {
-      baos.reset();
-      encoder.reInit(baos);
-      for (int value : data) {
-        encoder.encode(value);
-      }
-      encoder.close();
+      IntsRef data = newIntsRef(values);
+      long start = System.currentTimeMillis();
+      encoder.encode(data, bytes);
+      encodeTime += System.currentTimeMillis() - start;
     }
-    endTime = System.currentTimeMillis();
-
-    long encodeTime = endTime - startTime;
-
-    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
+    IntsRef decoded = new IntsRef(values.length);
+    int encodedSize = bytes.length;
     IntDecoder decoder = encoder.createMatchingDecoder();
-    decoder.reInit(bais);
 
     // -- Looping 100 times as a warm up --------------------------
     for (int i = 100; i != 0; --i) {
-      bais.mark(baos.size());
-      while (decoder.decode() != IntDecoder.EOS) {
-      }
-      bais.reset();
-      decoder.reInit(bais);
+      decoder.decode(bytes, decoded);
    }
    // -----------------------------------------------------------

-    decoder.reInit(bais);
-    startTime = System.currentTimeMillis();
+    long decodeTime = 0;
     for (int i = loopFactor; i > 0; --i) {
-      bais.mark(baos.size());
-      while (decoder.decode() != IntDecoder.EOS) {
-      }
-      bais.reset();
-      decoder.reInit(bais);
+      long start = System.currentTimeMillis();
+      decoder.decode(bytes, decoded);
+      decodeTime += System.currentTimeMillis() - start;
+    }
+
+    if (decoded.length != values.length) {
+      throw new RuntimeException("wrong num values. expected=" + values.length + " actual=" + decoded.length +
+          " decoder=" + decoder);
     }
 
-    endTime = System.currentTimeMillis();
-    long decodeTime = endTime - startTime;
-
-    System.out.println(String.format(Locale.ROOT, resultsFormat, encoder, nf.format(baos
-        .size()
-        * 8.0 / data.length), encodeTime, nf.format(encodeTime
-        * 1000000.0 / (loopFactor * data.length)), decodeTime, nf
-        .format(decodeTime * 1000000.0 / (loopFactor * data.length))));
+    System.out.println(String.format(Locale.ROOT, resultsFormat, encoder,
+        nf.format(encodedSize * 8.0 / values.length),
+        encodeTime,
+        nf.format(encodeTime * 1000000.0 / (loopFactor * values.length)),
+        decodeTime,
+        nf.format(decodeTime * 1000000.0 / (loopFactor * values.length))));
   }
 
   static {
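The rewritten benchmark above reduces each timed iteration to a single bulk call pair, encoder.encode(IntsRef, BytesRef) and decoder.decode(BytesRef, IntsRef), both of which the patch itself introduces. The round-trip at its core, stripped of the warm-up and timing loops; the sample values are placeholders of my choosing:

  // The bulk round-trip the benchmark times, in isolation.
  IntEncoder encoder = new VInt8IntEncoder();
  IntsRef data = newIntsRef(new int[] { 2, 4, 11, 165 });  // helper added above
  BytesRef bytes = new BytesRef(data.length);              // at least one byte per value
  encoder.encode(data, bytes);                             // one bulk encode call

  IntsRef decoded = new IntsRef(data.length);
  IntDecoder decoder = encoder.createMatchingDecoder();
  decoder.decode(bytes, decoded);                          // one bulk decode call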
@@ -1,14 +1,13 @@
 package org.apache.lucene.util.encoding;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.util.HashSet;
-import java.util.TreeSet;
-
-import org.junit.Test;
+import java.util.Arrays;
 
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.BeforeClass;
+import org.junit.Test;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -29,65 +28,33 @@ import org.apache.lucene.util.LuceneTestCase;
 
 public class EncodingTest extends LuceneTestCase {
 
-  static int[] data = null;
-
-  private static TreeSet<Long> dataSet = new TreeSet<Long>();
-  static {
-    setData();
-  }
-
-  @Test
-  public void testVInt8() throws Exception {
-    encoderTest(new VInt8IntEncoder());
-
-    // cover negative numbers;
-    IntEncoder enc = new VInt8IntEncoder();
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    enc.reInit(baos);
-    enc.encode(-1);
-
-    IntDecoder dec = enc.createMatchingDecoder();
-    dec.reInit(new ByteArrayInputStream(baos.toByteArray()));
-    assertEquals(-1, dec.decode());
-  }
-
-  @Test
-  public void testSimpleInt() {
-    encoderTest(new SimpleIntEncoder());
-  }
-
-  @Test
-  public void testSortingUniqueValues() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())));
-  }
-
-  @Test
-  public void testSortingUniqueDGap() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))));
-  }
-
-  @Test
-  public void testSortingUniqueDGapEightFlags() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))));
-  }
-
-  @Test
-  public void testSortingUniqueDGapFourFlags() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))));
-  }
-
-  @Test
-  public void testSortingUniqueDGapNOnes4() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))));
-  }
-
-  @Test
-  public void testSortingUniqueDGapNOnes3() {
-    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))));
-  }
-
-  private static void encoderTest(IntEncoder encoder) {
+  private static IntsRef uniqueSortedData, data;
+
+  @BeforeClass
+  public static void beforeClassEncodingTest() throws Exception {
+    int capacity = atLeast(10000);
+    data = new IntsRef(capacity);
+    for (int i = 0; i < 10; i++) {
+      data.ints[i] = i + 1; // small values
+    }
+    for (int i = 10; i < data.ints.length; i++) {
+      data.ints[i] = random().nextInt(Integer.MAX_VALUE - 1) + 1; // some encoders don't allow 0
+    }
+    data.length = data.ints.length;
+
+    uniqueSortedData = IntsRef.deepCopyOf(data);
+    Arrays.sort(uniqueSortedData.ints);
+    uniqueSortedData.length = 0;
+    int prev = -1;
+    for (int i = 0; i < uniqueSortedData.ints.length; i++) {
+      if (uniqueSortedData.ints[i] != prev) {
+        uniqueSortedData.ints[uniqueSortedData.length++] = uniqueSortedData.ints[i];
+        prev = uniqueSortedData.ints[i];
+      }
+    }
+  }
+
+  private static void encoderTest(IntEncoder encoder, IntsRef data, IntsRef expected) throws IOException {
 
     // ensure toString is implemented
     String toString = encoder.toString();
     assertFalse(toString.startsWith(encoder.getClass().getName() + "@"));
@@ -95,320 +62,90 @@ public class EncodingTest extends LuceneTestCase {
     toString = decoder.toString();
     assertFalse(toString.startsWith(decoder.getClass().getName() + "@"));
 
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    try {
-      encoding(encoder, baos);
-      decoding(baos, encoder.createMatchingDecoder());
-    } catch (Exception e) {
-      e.printStackTrace();
-      fail(e.getMessage());
-    }
-
-    baos.reset();
-
-    try {
-      encoding(encoder, baos);
-      decoding(baos, encoder.createMatchingDecoder());
-    } catch (Exception e) {
-      e.printStackTrace();
-      fail(e.getMessage());
-    }
+    BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[]
+    IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[]
+    encoding(encoder, data, bytes);
+    decoding(bytes, values, encoder.createMatchingDecoder());
+    assertTrue(expected.intsEquals(values));
   }
 
-  private static void encoding(IntEncoder encoder, ByteArrayOutputStream baos) throws IOException {
-    encoder.reInit(baos);
-    for (int value : data) {
-      encoder.encode(value);
-    }
-    encoder.close();
-
-    baos.reset();
-    encoder.reInit(baos);
-    for (int value : data) {
-      encoder.encode(value);
-    }
-    encoder.close();
+  private static void encoding(IntEncoder encoder, IntsRef data, BytesRef bytes) throws IOException {
+    final IntsRef values;
+    if (random().nextBoolean()) { // randomly set the offset
+      values = new IntsRef(data.length + 1);
+      System.arraycopy(data.ints, 0, values.ints, 1, data.length);
+      values.offset = 1; // ints start at index 1
+      values.length = data.length;
+    } else {
+      // need to copy the array because it may be modified by encoders (e.g. sorting)
+      values = IntsRef.deepCopyOf(data);
+    }
+    encoder.encode(values, bytes);
   }
 
-  private static void decoding(ByteArrayOutputStream baos, IntDecoder decoder)
-      throws IOException {
-    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
-    decoder.reInit(bais);
-
-    HashSet<Long> set = new HashSet<Long>();
-    long value = 0;
-    while ((value = decoder.decode()) != IntDecoder.EOS) {
-      set.add(value);
-    }
-    assertEquals(dataSet.size(), set.size());
-    assertTrue(set.equals(dataSet));
-
-    set.clear();
-    bais.reset();
-    decoder.reInit(bais);
-    value = 0;
-    while ((value = decoder.decode()) != IntDecoder.EOS) {
-      set.add(value);
-    }
-    assertEquals(dataSet.size(), set.size());
-    assertTrue(set.equals(dataSet));
+  private static void decoding(BytesRef bytes, IntsRef values, IntDecoder decoder) throws IOException {
+    int offset = 0;
+    if (random().nextBoolean()) { // randomly set the offset and length to other than 0,0
+      bytes.grow(bytes.length + 1); // ensure that we have enough capacity to shift values by 1
+      bytes.offset = 1; // bytes start at index 1 (must do that after grow)
+      System.arraycopy(bytes.bytes, 0, bytes.bytes, 1, bytes.length);
+      offset = 1;
+    }
+    decoder.decode(bytes, values);
+    assertEquals(offset, bytes.offset); // decoders should not mess with offsets
   }
 
-  private static void setData() {
-    data = new int[] { 2, 4, 86133, 11, 16505, 86134, 86135, 86136, 1290,
-        86137, 86138, 32473, 19346, 32474, 4922, 32475, 86139, 16914,
-        86140, 86141, 86142, 86143, 32478, 86144, 86145, 32480, 4884,
-        4887, 32481, 86146, 16572, 86147, 16295, 165, 86148, 3183,
-        21920, 21921, 21922, 555, 4006, 32484, 21925, 21926, 13775,
-        86149, 13777, 85833, 85834, 13779, 13773, 13780, 75266, 17674,
-        13784, 13785, 13786, 13787, 13788, 6258, 86150, 13790, 75267,
-        13793, 13794, 13795, 312, 4914, 4915, 6222, 86151, 4845, 4883,
-        4918, 4894, 4919, 86152, 4921, 6223, 6224, 6225, 6226, 67909,
-        6229, 18170, 6230, 5198, 25625, 6231, 6232, 6233, 1808, 6234,
-        6235, 6236, 41376, 6238, 6239, 67911, 6240, 86153, 6243, 6244,
-        83549, 6246, 6247, 6248, 6249, 782, 444, 6251, 6250, 19863,
-        28963, 310, 2234, 144, 2236, 2309, 69437, 2311, 2325, 2241,
-        69438, 69439, 2244, 2245, 2246, 23504, 2314, 69440, 36603,
-        2250, 2268, 2271, 2251, 2254, 2255, 2257, 2240, 36604, 84726,
-        36605, 84727, 2262, 2263, 18431, 38853, 2317, 2149, 2326, 2327,
-        2329, 3980, 2275, 2277, 2258, 84728, 2260, 84729, 84730, 13766,
-        36607, 2282, 2283, 84731, 2284, 2286, 2287, 2337, 7424, 2288,
-        2338, 3522, 2290, 84733, 32902, 371, 37708, 2096, 3065, 3066,
-        375, 377, 374, 378, 2100, 86154, 381, 382, 58795, 379, 383,
-        384, 385, 4449, 387, 388, 389, 390, 9052, 391, 18358, 2107,
-        394, 2111, 2108, 393, 2109, 395, 86155, 86156, 397, 2113, 398,
-        399, 400, 273, 274, 275, 40980, 276, 277, 31716, 279, 280,
-        31717, 281, 282, 1628, 1623, 1624, 1625, 2052, 1626, 725, 727,
-        728, 729, 730, 731, 1633, 733, 734, 735, 86157, 737, 738, 739,
-        1634, 3563, 3564, 3565, 1667, 12461, 76276, 3567, 5413, 77622,
-        5415, 5416, 5417, 5418, 107, 86158, 7784, 15363, 153, 3723,
-        2713, 7786, 3835, 7787, 86159, 7789, 7791, 7792, 7794, 86160,
-        7796, 86161, 6708, 7798, 7799, 7800, 7801, 7802, 7803, 1665,
-        43150, 15365, 1581, 5656, 43152, 80258, 7450, 39922, 86162,
-        51587, 9059, 4606, 396, 86163, 86164, 7250, 401, 403, 2860,
-        33281, 2964, 408, 9119, 409, 86165, 7669, 2861, 410, 413,
-        86166, 414, 415, 33282, 405, 33283, 7498, 2865, 7230, 33284,
-        2866, 86167, 2867, 47518, 2868, 86168, 2869, 2870, 4712, 7096,
-        28484, 6913, 6914, 6915, 6916, 37169, 37170, 7103, 28269, 6919,
-        86169, 45431, 6922, 7104, 6923, 7108, 6924, 6925, 6926, 6927,
-        6928, 86170, 86171, 86172, 6930, 6931, 6932, 6934, 6935, 6936,
-        451, 6937, 6938, 4756, 3554, 5309, 8145, 3586, 16417, 9767,
-        14126, 25854, 6580, 10174, 86173, 5519, 21309, 8561, 20938,
-        10386, 86174, 781, 2030, 16419, 30323, 16420, 16421, 16424,
-        86175, 86176, 86177, 28871, 86178, 28872, 63980, 6329, 49561,
-        4271, 38778, 86179, 86180, 20126, 16245, 193, 195, 196, 197,
-        56973, 199, 200, 201, 202, 203, 204, 56974, 56975, 205, 206,
-        4662, 207, 208, 209, 210, 211, 212, 47901, 641, 642, 643, 1380,
-        1079, 47902, 1381, 1081, 1082, 1083, 47903, 1382, 47904, 1087,
-        47905, 965, 966, 1298, 968, 1387, 1300, 50288, 971, 972, 973,
-        974, 23974, 22183, 1390, 23313, 1389, 1391, 902, 23029, 296,
-        1304, 1395, 1303, 1309, 1308, 50289, 1312, 50290, 50291, 1315,
-        1317, 9270, 19796, 3605, 1320, 1321, 44946, 1322, 1323, 50292,
-        967, 1587, 1326, 1331, 17482, 633, 29115, 53858, 29118, 29119,
-        62624, 44494, 6965, 6966, 6959, 6967, 71562, 6969, 23459,
-        23460, 17464, 4225, 23461, 23462, 23463, 5893, 23464, 17467,
-        17468, 23465, 12562, 1405, 1406, 1407, 960, 961, 962, 687, 963,
-        86181, 86182, 5997, 10812, 11976, 11977, 1850, 577, 13393,
-        10810, 13394, 65040, 86183, 3935, 3936, 3937, 710, 86184, 5785,
-        5786, 29949, 5787, 5788, 283, 284, 2687, 285, 286, 287, 2689,
-        288, 289, 8880, 290, 2690, 13899, 991, 292, 295, 42007, 35616,
-        63103, 298, 299, 3520, 297, 9024, 303, 301, 302, 300, 31345,
-        3719, 304, 305, 306, 307, 308, 368, 364, 85002, 9026, 63105,
-        367, 39596, 25835, 19746, 293, 294, 26505, 85003, 18377, 56785,
-        10122, 10123, 10124, 86185, 39863, 86186, 10125, 39865, 4066,
-        4067, 24257, 4068, 4070, 86187, 4073, 4074, 86188, 4076, 7538,
-        4077, 86189, 4078, 4079, 7540, 7541, 4084, 4085, 7542, 86190,
-        4086, 86191, 4087, 4088, 86192, 7545, 44874, 7821, 44875,
-        86193, 4286, 86194, 51470, 17609, 1408, 47486, 1411, 1412,
-        47487, 1413, 1414, 1417, 1415, 47488, 1416, 1418, 1420, 470,
-        1422, 1423, 1424, 5001, 5002, 47489, 1427, 1429, 1430, 31811,
-        1432, 1433, 47490, 1435, 3753, 1437, 1439, 1440, 47491, 1443,
-        47492, 1446, 5004, 5005, 1450, 47493, 353, 1452, 42145, 3103,
-        3402, 3104, 3105, 4780, 3106, 3107, 3108, 12157, 3111, 42146,
-        42147, 3114, 4782, 42148, 3116, 3117, 42149, 42150, 3407, 3121,
-        3122, 18154, 3126, 3127, 3128, 3410, 3130, 3411, 3412, 3415,
-        24241, 3417, 3418, 3449, 42151, 3421, 3422, 7587, 42152, 3424,
-        3427, 3428, 3448, 3430, 3432, 42153, 42154, 41648, 1991, 407,
-        57234, 411, 2862, 57235, 2863, 18368, 57236, 2874, 7350, 4115,
-        2876, 2877, 17975, 86195, 4116, 2881, 2882, 2883, 2886, 463,
-        870, 872, 873, 874, 875, 8783, 8784, 877, 1480, 1481, 459,
-        2778, 881, 8785, 2779, 8786, 8787, 8788, 886, 887, 8789, 889,
-        8790, 86196, 6920, 86197, 5080, 5081, 7395, 7396, 9395, 9396,
-        1528, 42737, 805, 86198, 1209, 13595, 4126, 9680, 34368, 9682,
-        86199, 86200, 174, 175, 176, 177, 178, 179, 180, 182, 183,
-        1477, 31138, 186, 172, 187, 188, 189, 190, 191, 458, 871,
-        31294, 31295, 27604, 31296, 31297, 882, 883, 884, 31298, 890,
-        1089, 1488, 1489, 1092, 1093, 1094, 1095, 1096, 1097, 1490,
-        1098, 1495, 1502, 1099, 1100, 1101, 1493, 2997, 12223, 1103,
-        2654, 1498, 1499, 1500, 80615, 80616, 80617, 33359, 86201,
-        9294, 1501, 86202, 1506, 1507, 23454, 38802, 38803, 1014,
-        86203, 5583, 5584, 651, 74717, 5586, 5587, 5588, 5589, 74720,
-        5590, 38808, 33527, 78330, 10930, 5119, 10931, 1000, 10928,
-        10932, 10933, 10934, 10935, 5863, 10936, 86204, 10938, 10939,
-        86205, 192, 194, 38754, 38755, 198, 38756, 38757, 38758, 2842,
-        640, 22780, 22781, 1080, 86206, 86207, 1084, 1086, 1088, 63916,
-        9412, 970, 9413, 9414, 9415, 9416, 9417, 1310, 7168, 7169,
-        1318, 9418, 1324, 39159, 1804, 1557, 24850, 41499, 1560, 41500,
-        1562, 1563, 1565, 1927, 1928, 1566, 1569, 1570, 1571, 1572,
-        1573, 1574, 1575, 1576, 2674, 2677, 2678, 2679, 2946, 2682,
-        2676, 2683, 2947, 1156, 1157, 1158, 1467, 1160, 1468, 1469,
-        1161, 1162, 1163, 4369, 1165, 1166, 1167, 12923, 2917, 1169,
-        1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 18153, 8359,
-        1178, 1164, 1191, 1180, 12924, 86208, 86209, 54817, 66962,
-        2476, 86210, 86211, 41820, 41821, 41822, 41824, 1130, 1131,
-        1132, 32692, 1134, 34848, 1136, 1133, 1137, 1138, 1139, 1140,
-        1141, 1143, 1144, 1145, 34849, 2639, 34850, 1146, 1147, 1148,
-        34851, 1150, 1151, 1152, 1153, 1154, 1155, 1678, 1679, 1680,
-        1681, 40870, 2059, 1685, 1686, 32686, 14970, 1688, 1689, 86212,
-        1692, 1682, 1693, 1695, 1696, 1698, 12955, 8909, 41690, 1700,
-        41691, 86213, 30949, 41692, 1703, 1704, 1705, 41693, 14976,
-        1708, 2071, 1709, 1710, 1711, 1712, 1727, 86214, 86215, 86216,
-        1715, 86217, 1714, 1717, 1690, 41697, 86218, 1720, 86219, 2073,
-        41699, 1724, 2075, 1726, 1729, 1730, 1732, 2078, 2223, 1735,
-        1713, 41700, 1737, 14977, 1739, 1740, 1741, 2080, 1743, 1744,
-        1745, 1746, 1747, 1748, 1749, 1750, 1751, 41701, 1752, 1753,
-        1909, 86220, 2085, 1754, 19548, 86221, 19551, 5733, 3856, 5190,
-        4581, 25145, 86222, 86223, 4846, 86224, 4861, 86225, 86226,
-        86227, 25150, 86228, 86229, 13820, 2027, 4898, 4899, 4901,
-        2135, 4902, 4868, 4904, 86230, 4905, 25155, 4907, 86231, 4909,
-        4910, 4911, 4912, 86232, 6220, 81357, 86233, 2589, 73877,
-        29706, 6227, 6228, 86234, 6237, 86235, 6241, 6242, 1812, 13808,
-        13809, 70908, 2293, 2294, 86236, 2295, 2296, 2297, 22947,
-        16511, 2299, 2300, 2301, 13097, 73079, 86237, 13099, 50121,
-        86238, 86239, 13101, 86240, 2424, 4725, 4726, 4727, 4728, 4729,
-        4730, 86241, 26881, 10944, 4734, 4735, 4736, 26239, 26240,
-        71408, 86242, 57401, 71410, 26244, 5344, 26245, 86243, 4102,
-        71414, 11091, 6736, 86244, 6737, 6738, 38152, 6740, 6741, 6742,
-        6298, 6743, 6745, 6746, 20867, 6749, 20616, 86245, 9801, 65297,
-        20617, 65298, 20619, 5629, 65299, 20621, 20622, 8385, 20623,
-        20624, 5191, 20625, 20626, 442, 443, 445, 27837, 77681, 86246,
-        27839, 86247, 86248, 41435, 66511, 2478, 2479, 2480, 2481,
-        2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2494,
-        2493, 33025, 12084, 2542, 2497, 2499, 2501, 2503, 2504, 2505,
-        33026, 2506, 2507, 2508, 2509, 2511, 1787, 12080, 2513, 2514,
-        3988, 3176, 3989, 2518, 2521, 9285, 2522, 2524, 2525, 3990,
-        2527, 2528, 27499, 2529, 2530, 3991, 2532, 2534, 2535, 18038,
-        2536, 2538, 2495, 46077, 61493, 61494, 1006, 713, 4971, 4972,
-        4973, 4975, 4976, 650, 170, 7549, 7550, 7551, 7552, 7553,
-        86249, 7936, 956, 11169, 11170, 1249, 1244, 1245, 1247, 2544,
-        1250, 2545, 1252, 2547, 1253, 1254, 2549, 39636, 1259, 1257,
-        1258, 39637, 1260, 1261, 2551, 1262, 1263, 848, 86250, 86251,
-        854, 74596, 856, 1957, 86252, 855, 1959, 1961, 857, 86253, 851,
-        859, 860, 862, 1964, 864, 865, 866, 867, 1965, 1966, 1967,
-        1968, 1969, 86254, 1971, 1972, 1973, 1974, 1975, 1976, 1977,
-        841, 1954, 842, 2978, 846, 847, 849, 850, 852, 1956, 17452,
-        71941, 86255, 86256, 73665, 1471, 13690, 185, 503, 504, 2342,
-        505, 506, 4378, 508, 4379, 17313, 510, 511, 512, 520, 513,
-        4384, 17314, 514, 515, 46158, 17317, 518, 34269, 519, 4386,
-        523, 524, 525, 46159, 528, 529, 17319, 531, 532, 533, 534, 535,
-        7482, 537, 538, 5267, 536, 539, 541, 540, 19858, 17320, 17321,
-        906, 907, 908, 17322, 910, 17323, 912, 15850, 913, 4398, 17324,
-        86257, 278, 2948, 2949, 2950, 3007, 2951, 2952, 2953, 2954,
-        2955, 3013, 35352, 3014, 3015, 2962, 3016, 33505, 39118, 3017,
-        3018, 20492, 4000, 3021, 3022, 35353, 39293, 3024, 18443, 3029,
-        9467, 20529, 39119, 8380, 2965, 3030, 3043, 22714, 39120, 2956,
-        3035, 39121, 3037, 3038, 2688, 86258, 36675, 30894, 24505,
-        8888, 13541, 49728, 27660, 9082, 27661, 365, 366, 2232, 76098,
-        7233, 1494, 17391, 606, 607, 611, 610, 612, 614, 615, 613, 616,
-        9117, 617, 618, 21155, 1789, 619, 620, 7636, 12019, 621, 622,
-        1793, 623, 625, 624, 631, 626, 627, 21578, 21103, 628, 21579,
-        629, 9122, 9123, 12189, 9289, 3168, 3169, 630, 632, 634, 21580,
-        9121, 635, 636, 637, 21581, 12781, 1801, 638, 639, 1559, 24343,
-        9419, 9420, 795, 796, 1611, 86259, 1612, 21551, 21552, 3741,
-        1617, 3742, 1615, 1619, 1620, 6301, 3744, 1622, 67685, 8521,
-        55937, 9025, 27663, 8881, 13581, 86260, 11592, 44720, 86261,
-        63231, 50873, 42925, 52332, 86262, 72706, 17705, 17707, 17708,
-        3401, 40217, 1248, 40218, 86263, 7098, 86264, 86265, 1264,
-        86266, 1266, 1267, 1268, 1269, 86267, 1271, 1272, 1273, 1274,
-        2556, 1275, 1276, 1277, 1278, 1279, 1280, 1282, 1283, 22680,
-        11889, 86268, 45662, 7038, 86269, 19315, 45663, 45664, 86270,
-        5855, 34002, 49245, 10447, 5663, 86271, 15429, 53877, 49249,
-        86272, 86273, 86274, 60128, 60453, 60129, 5552, 31923, 43407,
-        4287, 17980, 64977, 86275, 86276, 8234, 86277, 3649, 8240,
-        1330, 11999, 1332, 27618, 1334, 1335, 340, 3651, 25640, 18165,
-        1343, 4618, 1474, 3653, 75921, 1349, 53519, 1779, 45454, 22778,
-        40153, 67677, 63826, 45455, 15128, 67678, 67679, 1792, 67680,
-        3171, 47816, 45457, 9288, 59891, 67681, 25703, 35731, 35732,
-        369, 35713, 35714, 35715, 34652, 35716, 31681, 35717, 12779,
-        35718, 35719, 11992, 806, 807, 808, 43499, 43500, 810, 776,
-        812, 813, 814, 241, 43501, 43502, 816, 755, 43503, 818, 819,
-        820, 43504, 821, 822, 823, 824, 825, 826, 43505, 43506, 43507,
-        828, 829, 20083, 43508, 43509, 832, 833, 834, 835, 86278,
-        19984, 19985, 86279, 24125, 19986, 86280, 19988, 86281, 5414,
-        86282, 85808, 5479, 5420, 5421, 5422, 5423, 63800, 86283,
-        86284, 30965, 86285, 416, 1510, 5740, 5741, 81991, 86286,
-        28938, 50149, 1003, 55512, 14306, 6960, 688, 86287, 14307,
-        5399, 5400, 17783, 24118, 720, 86288, 44913, 24557, 667, 24876,
-        6529, 24877, 24878, 24879, 24880, 31847, 20671, 4011, 171, 580,
-        86289, 3863, 914, 2202, 916, 917, 918, 919, 921, 922, 923,
-        7585, 925, 7586, 926, 927, 928, 7588, 929, 930, 931, 932, 933,
-        934, 1875, 1876, 7589, 7590, 1878, 1879, 7591, 7592, 1882,
-        1883, 1884, 2212, 7593, 1887, 1888, 1889, 1890, 1891, 1892,
-        1893, 1894, 1895, 1896, 1897, 1898, 2217, 1900, 7594, 1902,
-        2219, 7595, 1905, 1906, 1907, 3323, 7596, 1911, 1912, 7597,
-        1914, 1915, 1916, 2226, 1919, 7598, 2227, 1920, 1921, 7599,
-        7600, 4708, 1923, 355, 356, 1549, 358, 32077, 360, 32078,
-        21117, 362, 19043, 71677, 5716, 86290, 49790, 86291, 86292,
-        86293, 49792, 86294, 86295, 49794, 86296, 86297, 86298, 86299,
-        11882, 86300, 49798, 86301, 49800, 49801, 49802, 49803, 453,
-        49804, 8591, 6794, 49806, 18989, 49807, 49808, 16308, 49809,
-        86302, 86303, 10105, 86304, 5285, 10106, 10107, 6557, 86305,
-        23571, 10109, 38883, 10110, 5401, 86306, 67557, 16430, 67558,
-        40171, 16433, 25878, 86307, 21762, 23, 86308, 86309, 21766,
-        86310, 86311, 5149, 3926, 21768, 21769, 47826, 942, 46985,
-        6588, 58867, 6589, 6590, 86312, 6592, 6006, 53855, 9565, 359,
-        86313, 2845, 876, 879, 27556, 27557, 885, 27558, 888, 2847,
-        27559, 2115, 2116, 2117, 53962, 57839, 315, 316, 317, 318, 319,
-        86314, 321, 322, 2122, 323, 2123, 324, 325, 328, 326, 327,
-        40542, 329, 330, 18079, 18080, 331, 1790, 7382, 332, 7380,
-        7236, 23413, 23414, 18924, 18925, 333, 335, 336, 39750, 337,
-        86315, 339, 341, 342, 343, 16264, 16265, 6615, 86316, 86317,
-        86318, 86319, 16269, 10538, 33226, 86320, 16272, 5824, 16273,
-        16274, 16276, 16277, 16278, 16279, 16280, 14517, 1547, 6463,
-        3394, 49677, 659, 10380, 30013, 10382, 10378, 10379, 10383,
-        10384, 10385, 86321, 4139, 13370, 13371, 86322, 86323, 11878,
-        64509, 15141, 15142, 15143, 32737, 14183, 15144, 39101, 42768,
-        5645, 32738, 801, 803, 804, 86324, 14707, 86325, 6601, 12402,
-        712, 12403, 2936, 1447, 15477, 1410, 44872, 1550, 8614, 15478,
-        15479, 15480, 15481, 4811, 3752, 1442, 15482, 8818, 1445, 5006,
-        16304, 32277, 16305, 16306, 86326, 16307, 53691, 69305, 809,
-        86327, 815, 26724, 69307, 43484, 63904, 86328, 13498, 827,
-        86329, 831, 2857, 836, 86330, 86331, 837, 838, 839, 840, 228,
-        229, 43722, 230, 231, 43723, 234, 235, 236, 237, 238, 239,
-        2745, 2746, 240, 242, 243, 244, 43724, 19788, 246, 247, 21134,
-        248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 43725, 43726,
-        41, 43727, 262, 43728, 2751, 264, 265, 266, 267, 268, 269, 270,
-        271, 272, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032,
-        1033, 1034, 43729, 1035, 43730, 1037, 21821, 2926, 14388,
-        10432, 14389, 14390, 14391, 14392, 86332, 14394, 14395, 2035,
-        2169, 86333, 14397, 14398, 14399, 14400, 52, 14401, 14402,
-        7077, 21822, 14405, 14406, 14396, 86334, 17356, 17357, 84679,
-        84680, 76383, 17360, 17361, 86335, 38801, 2060, 30850, 12963,
-        1684, 1687, 2061, 14978, 1694, 43387, 1697, 1699, 2067, 1701,
-        1702, 1706, 43388, 43389, 76325, 1716, 1718, 26832, 1719, 1723,
-        2081, 2063, 1728, 39059, 76326, 1731, 86336, 1736, 76327, 1738,
-        19657, 6579, 6581, 6582, 6583, 6584, 6585, 29979, 1818, 28239,
-        68, 69, 3391, 86337, 10266, 63528, 86338, 10269, 10270, 10271,
-        10272, 86339, 86340, 63530, 63531, 63532, 63533, 10273, 63534,
-        86341, 10681, 10682, 86342, 9673, 86343, 10683, 460, 461, 462,
-        467, 4464, 4466, 3729, 471, 472, 468, 81634, 474, 81635, 475,
-        476, 477, 479, 480, 81636, 81637, 482, 17442, 81638, 81639,
-        484, 485, 486, 4473, 488, 489, 490, 493, 466, 494, 495, 496,
-        497, 499, 500, 501, 502, 34376, 86344, 63836, 56281, 1707,
-        20416, 61452, 56282, 1755, 56283, 56284, 18508, 53650, 63444,
-        86345, 3579, 63445, 3677, 1979, 1980, 1981, 3132, 3147, 34090,
-        1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, 15880,
-        1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, 1984,
-        3145, 86347, 78064, 55453, 2656, 2657, 35634, 35635, 2167,
-        43479,
-        // ensure there is a representative number for any # of int bytes
-        1, 1 << 8 + 1, 1 << 16 + 1, 1 << 24 + 1 };
-    // data = new int[]{1, 2, 3, 4};
-    for (int value : data) {
-      dataSet.add(new Long(value));
-    }
-  }
+  @Test
+  public void testVInt8() throws Exception {
+    encoderTest(new VInt8IntEncoder(), data, data);
+
+    // cover negative numbers;
+    BytesRef bytes = new BytesRef(5);
+    IntEncoder enc = new VInt8IntEncoder();
+    IntsRef values = new IntsRef(1);
+    values.ints[values.length++] = -1;
+    enc.encode(values, bytes);
+
+    IntDecoder dec = enc.createMatchingDecoder();
+    values.length = 0;
+    dec.decode(bytes, values);
+    assertEquals(1, values.length);
+    assertEquals(-1, values.ints[0]);
+  }
+
+  @Test
+  public void testSimpleInt() throws Exception {
+    encoderTest(new SimpleIntEncoder(), data, data);
+  }
+
+  @Test
+  public void testSortingUniqueValues() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())), data, uniqueSortedData);
+  }
+
+  @Test
+  public void testSortingUniqueDGap() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))), data, uniqueSortedData);
+  }
+
+  @Test
+  public void testSortingUniqueDGapEightFlags() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))), data, uniqueSortedData);
+  }
+
+  @Test
+  public void testSortingUniqueDGapFourFlags() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))), data, uniqueSortedData);
+  }
+
+  @Test
+  public void testSortingUniqueDGapNOnes4() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))), data, uniqueSortedData);
+  }
+
+  @Test
+  public void testSortingUniqueDGapNOnes3() throws Exception {
+    encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), data, uniqueSortedData);
+  }
 
 }
@@ -0,0 +1,54 @@
+package org.apache.lucene.util.encoding;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests the {@link VInt8} class.
+ */
+public class Vint8Test extends LuceneTestCase {
+
+  private static final int[] TEST_VALUES = {
+    -1000000000,
+    -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14,
+    (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28
+  };
+  private static int[] BYTES_NEEDED_TEST_VALUES = {
+    5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5
+  };
+
+  @Test
+  public void testBytesRef() throws Exception {
+    BytesRef bytes = new BytesRef(256);
+    int expectedSize = 0;
+    for (int j = 0; j < TEST_VALUES.length; j++) {
+      VInt8.encode(TEST_VALUES[j], bytes);
+      expectedSize += BYTES_NEEDED_TEST_VALUES[j];
+    }
+    assertEquals(expectedSize, bytes.length);
+
+    for (int j = 0; j < TEST_VALUES.length; j++) {
+      assertEquals(TEST_VALUES[j], VInt8.decode(bytes));
+    }
+    assertEquals(bytes.offset, bytes.length);
+  }
+
+}
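For readers unfamiliar with the class under test: VInt8 stores an int in one to five bytes, seven payload bits per byte, which is exactly what the BYTES_NEEDED_TEST_VALUES table above encodes (values below 2^7 take one byte, below 2^14 two, and negative ints always take five). The following is an illustrative sketch only, assuming a most-significant-group-first layout with the high bit as a continuation flag; Lucene's actual VInt8 byte layout may differ, but the byte counts agree with the table.

// Illustrative varint sketch (not Lucene's VInt8 source): 7 data bits per
// byte, continuation bit set on every byte except the last.
public final class VarIntSketch {

  /** Appends {@code value} to {@code dest} starting at {@code pos}; returns the new position. */
  static int encode(int value, byte[] dest, int pos) {
    int shift = 28;                         // 28 = 4 * 7: at most five 7-bit groups in an int
    while (shift > 0 && (value >>> shift) == 0) {
      shift -= 7;                           // skip leading all-zero groups
    }
    while (shift > 0) {
      dest[pos++] = (byte) (0x80 | ((value >>> shift) & 0x7F)); // continuation bit set
      shift -= 7;
    }
    dest[pos++] = (byte) (value & 0x7F);    // final byte: continuation bit clear
    return pos;
  }

  /** Decodes one value at {@code pos[0]}, advancing the cursor past it. */
  static int decode(byte[] src, int[] pos) {
    int value = 0;
    byte b;
    do {
      b = src[pos[0]++];
      value = (value << 7) | (b & 0x7F);
    } while ((b & 0x80) != 0);
    return value;
  }

  public static void main(String[] args) {
    byte[] buf = new byte[16];
    int end = encode(-1, buf, 0);           // negative: five bytes
    end = encode(1 << 7, buf, end);         // 128: two bytes
    int[] cursor = {0};
    System.out.println(decode(buf, cursor)); // -1
    System.out.println(decode(buf, cursor)); // 128
  }
}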
@@ -73,7 +73,7 @@ public class NativeUnixDirectory extends FSDirectory {
   private final static long ALIGN = 512;
   private final static long ALIGN_NOT_MASK = ~(ALIGN-1);
 
-  /** Default buffer size before writing to disk (256 MB);
+  /** Default buffer size before writing to disk (256 KB);
    *  larger means less IO load but more RAM and direct
    *  buffer storage space consumed during merging. */
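Since this hunk sits next to the direct-IO alignment constants, here is the arithmetic they support, as a sketch: with ALIGN a power of two, ANDing an offset with ~(ALIGN-1) rounds it down to the previous 512-byte boundary, and adding ALIGN-1 first rounds it up. The helper names below are illustrative and not part of NativeUnixDirectory.

// Power-of-two alignment arithmetic, as used around O_DIRECT style IO.
public final class AlignSketch {
  private final static long ALIGN = 512;            // same constants as the hunk above
  private final static long ALIGN_NOT_MASK = ~(ALIGN - 1);

  static long roundDown(long offset) {
    return offset & ALIGN_NOT_MASK;                 // clear the low 9 bits
  }

  static long roundUp(long offset) {
    return (offset + ALIGN - 1) & ALIGN_NOT_MASK;   // bump past the boundary, then clear
  }

  public static void main(String[] args) {
    System.out.println(roundDown(1234));            // 1024
    System.out.println(roundUp(1234));              // 1536
  }
}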
@@ -237,7 +237,7 @@ public class FSTCompletionBuilder {
     final Object empty = outputs.getNoOutput();
     final Builder<Object> builder = new Builder<Object>(
         FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
-        shareMaxTailLength, outputs, null, false);
+        shareMaxTailLength, outputs, null, false, true);
 
     BytesRef scratch = new BytesRef();
     BytesRef entry;
@@ -19,6 +19,7 @@ package org.apache.lucene.codecs.asserting;
 
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41Codec;
 
@@ -29,6 +30,7 @@ public final class AssertingCodec extends FilterCodec {
 
   private final PostingsFormat postings = new AssertingPostingsFormat();
   private final TermVectorsFormat vectors = new AssertingTermVectorsFormat();
+  private final StoredFieldsFormat storedFields = new AssertingStoredFieldsFormat();
 
   public AssertingCodec() {
     super("Asserting", new Lucene41Codec());
@@ -44,4 +46,8 @@ public final class AssertingCodec extends FilterCodec {
     return vectors;
   }
 
+  @Override
+  public StoredFieldsFormat storedFieldsFormat() {
+    return storedFields;
+  }
 }
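With storedFieldsFormat() now overridden, a test that installs this codec gets the stored-fields checks as well. Below is a minimal sketch of opting in, assuming the 4.x APIs (Version.LUCENE_41, WhitespaceAnalyzer from the analyzers-common module); Lucene's own tests would typically do this through LuceneTestCase and MockAnalyzer instead. Note the checks are plain Java asserts, so they only fire when assertions are enabled (-ea).

import org.apache.lucene.analysis.core.WhitespaceAnalyzer; // illustrative analyzer choice
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class AssertingCodecUsage {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriterConfig iwc =
        new IndexWriterConfig(Version.LUCENE_41, new WhitespaceAnalyzer(Version.LUCENE_41));
    iwc.setCodec(new AssertingCodec()); // every flush now runs through the asserting formats
    IndexWriter writer = new IndexWriter(dir, iwc);
    Document doc = new Document();
    doc.add(new StoredField("id", "1"));
    writer.addDocument(doc);
    writer.close();                     // run with -ea so the asserts actually fire
    dir.close();
  }
}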
@@ -0,0 +1,136 @@
+package org.apache.lucene.codecs.asserting;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
+import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+
+/**
+ * Just like {@link Lucene41StoredFieldsFormat} but with additional asserts.
+ */
+public class AssertingStoredFieldsFormat extends StoredFieldsFormat {
+  private final StoredFieldsFormat in = new Lucene41StoredFieldsFormat();
+
+  @Override
+  public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+    return new AssertingStoredFieldsReader(in.fieldsReader(directory, si, fn, context), si.getDocCount());
+  }
+
+  @Override
+  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
+    return new AssertingStoredFieldsWriter(in.fieldsWriter(directory, si, context));
+  }
+
+  static class AssertingStoredFieldsReader extends StoredFieldsReader {
+    private final StoredFieldsReader in;
+    private final int maxDoc;
+
+    AssertingStoredFieldsReader(StoredFieldsReader in, int maxDoc) {
+      this.in = in;
+      this.maxDoc = maxDoc;
+    }
+
+    @Override
+    public void close() throws IOException {
+      in.close();
+    }
+
+    @Override
+    public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
+      assert n >= 0 && n < maxDoc;
+      in.visitDocument(n, visitor);
+    }
+
+    @Override
+    public StoredFieldsReader clone() {
+      return new AssertingStoredFieldsReader(in.clone(), maxDoc);
+    }
+  }
+
+  enum Status {
+    UNDEFINED, STARTED, FINISHED;
+  }
+
+  static class AssertingStoredFieldsWriter extends StoredFieldsWriter {
+    private final StoredFieldsWriter in;
+    private int numWritten;
+    private int fieldCount;
+    private Status docStatus;
+
+    AssertingStoredFieldsWriter(StoredFieldsWriter in) {
+      this.in = in;
+      this.docStatus = Status.UNDEFINED;
+    }
+
+    @Override
+    public void startDocument(int numStoredFields) throws IOException {
+      assert docStatus != Status.STARTED;
+      in.startDocument(numStoredFields);
+      assert fieldCount == 0;
+      fieldCount = numStoredFields;
+      numWritten++;
+      docStatus = Status.STARTED;
+    }
+
+    @Override
+    public void finishDocument() throws IOException {
+      assert docStatus == Status.STARTED;
+      assert fieldCount == 0;
+      in.finishDocument();
+      docStatus = Status.FINISHED;
+    }
+
+    @Override
+    public void writeField(FieldInfo info, StorableField field) throws IOException {
+      assert docStatus == Status.STARTED;
+      in.writeField(info, field);
+      assert fieldCount > 0;
+      fieldCount--;
+    }
+
+    @Override
+    public void abort() {
+      in.abort();
+    }
+
+    @Override
+    public void finish(FieldInfos fis, int numDocs) throws IOException {
+      assert docStatus == (numDocs > 0 ? Status.FINISHED : Status.UNDEFINED);
+      in.finish(fis, numDocs);
+      assert fieldCount == 0;
+      assert numDocs == numWritten;
+    }
+
+    @Override
+    public void close() throws IOException {
+      in.close();
+      assert docStatus != Status.STARTED;
+    }
+  }
+}
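The asserts above pin down the stored-fields writer lifecycle: startDocument declares how many fields follow, each writeField consumes one, finishDocument is legal only once the count reaches zero, and finish cross-checks the total document count. A sketch of a conforming caller follows; the driver method and its docs input are illustrative, while the writer calls are the StoredFieldsWriter API shown in the file above.

import java.io.IOException;
import java.util.List;

import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.StorableField;

// Hypothetical driver showing the call sequence AssertingStoredFieldsWriter accepts.
public class StoredFieldsWriterUsage {
  static void writeAll(StoredFieldsWriter writer, FieldInfos infos,
                       List<List<StorableField>> docs) throws IOException {
    for (List<StorableField> doc : docs) {
      writer.startDocument(doc.size());              // UNDEFINED/FINISHED -> STARTED
      for (StorableField field : doc) {
        writer.writeField(infos.fieldInfo(field.name()), field); // decrements fieldCount
      }
      writer.finishDocument();                       // requires fieldCount == 0
    }
    writer.finish(infos, docs.size());               // numDocs must equal documents started
    writer.close();                                  // must not be mid-document
  }
}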
@@ -18,17 +18,20 @@ package org.apache.lucene.codecs.asserting;
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
 import org.apache.lucene.index.AssertingAtomicReader;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Just like {@link Lucene40TermVectorsFormat} but with additional asserts.
@@ -43,16 +46,16 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat {
 
   @Override
   public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {
-    return in.vectorsWriter(directory, segmentInfo, context);
+    return new AssertingTermVectorsWriter(in.vectorsWriter(directory, segmentInfo, context));
   }
 
   static class AssertingTermVectorsReader extends TermVectorsReader {
     private final TermVectorsReader in;
 
     AssertingTermVectorsReader(TermVectorsReader in) {
       this.in = in;
     }
 
     @Override
     public void close() throws IOException {
       in.close();
@@ -68,5 +71,120 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat {
     public TermVectorsReader clone() {
       return new AssertingTermVectorsReader(in.clone());
     }
   }
 
+  enum Status {
+    UNDEFINED, STARTED, FINISHED;
+  }
+
+  static class AssertingTermVectorsWriter extends TermVectorsWriter {
+    private final TermVectorsWriter in;
+    private Status docStatus, fieldStatus, termStatus;
+    private int fieldCount, termCount, positionCount;
+    boolean hasPositions;
+
+    AssertingTermVectorsWriter(TermVectorsWriter in) {
+      this.in = in;
+      docStatus = Status.UNDEFINED;
+      fieldStatus = Status.UNDEFINED;
+      termStatus = Status.UNDEFINED;
+      fieldCount = termCount = positionCount = 0;
+    }
+
+    @Override
+    public void startDocument(int numVectorFields) throws IOException {
+      assert fieldCount == 0;
+      assert docStatus != Status.STARTED;
+      in.startDocument(numVectorFields);
+      docStatus = Status.STARTED;
+      fieldCount = numVectorFields;
+    }
+
+    @Override
+    public void finishDocument() throws IOException {
+      assert fieldCount == 0;
+      assert docStatus == Status.STARTED;
+      in.finishDocument();
+      docStatus = Status.FINISHED;
+    }
+
+    @Override
+    public void startField(FieldInfo info, int numTerms, boolean positions,
+        boolean offsets, boolean payloads) throws IOException {
+      assert termCount == 0;
+      assert docStatus == Status.STARTED;
+      assert fieldStatus != Status.STARTED;
+      in.startField(info, numTerms, positions, offsets, payloads);
+      fieldStatus = Status.STARTED;
+      termCount = numTerms;
+      hasPositions = positions || offsets || payloads;
+    }
+
+    @Override
+    public void finishField() throws IOException {
+      assert termCount == 0;
+      assert fieldStatus == Status.STARTED;
+      in.finishField();
+      fieldStatus = Status.FINISHED;
+      --fieldCount;
+    }
+
+    @Override
+    public void startTerm(BytesRef term, int freq) throws IOException {
+      assert docStatus == Status.STARTED;
+      assert fieldStatus == Status.STARTED;
+      assert termStatus != Status.STARTED;
+      in.startTerm(term, freq);
+      termStatus = Status.STARTED;
+      positionCount = hasPositions ? freq : 0;
+    }
+
+    @Override
+    public void finishTerm() throws IOException {
+      assert positionCount == 0;
+      assert docStatus == Status.STARTED;
+      assert fieldStatus == Status.STARTED;
+      assert termStatus == Status.STARTED;
+      in.finishTerm();
+      termStatus = Status.FINISHED;
+      --termCount;
+    }
+
+    @Override
+    public void addPosition(int position, int startOffset, int endOffset,
+        BytesRef payload) throws IOException {
+      assert docStatus == Status.STARTED;
+      assert fieldStatus == Status.STARTED;
+      assert termStatus == Status.STARTED;
+      in.addPosition(position, startOffset, endOffset, payload);
+      --positionCount;
+    }
+
+    @Override
+    public void abort() {
+      in.abort();
+    }
+
+    @Override
+    public void finish(FieldInfos fis, int numDocs) throws IOException {
+      assert docStatus == (numDocs > 0 ? Status.FINISHED : Status.UNDEFINED);
+      assert fieldStatus != Status.STARTED;
+      assert termStatus != Status.STARTED;
+      in.finish(fis, numDocs);
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() throws IOException {
+      return in.getComparator();
+    }
+
+    @Override
+    public void close() throws IOException {
+      in.close();
+      assert docStatus != Status.STARTED;
+      assert fieldStatus != Status.STARTED;
+      assert termStatus != Status.STARTED;
+    }
+
+  }
 }
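The state machine above is easiest to read as the call order it permits. Below is a sketch for one conforming document, assuming a single field with two terms carrying positions but no offsets or payloads (the -1 offsets and null payload follow the convention for "absent"); the driver method itself is hypothetical, the writer calls are the TermVectorsWriter API asserted above.

import java.io.IOException;

import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.util.BytesRef;

// Hypothetical driver; 'writer' may be an AssertingTermVectorsWriter
// wrapping any real TermVectorsWriter.
public class TermVectorsWriterUsage {
  static void writeOneDoc(TermVectorsWriter writer, FieldInfo field, FieldInfos infos)
      throws IOException {
    writer.startDocument(1);                          // one vector field follows
    writer.startField(field, 2, true, false, false);  // 2 terms; positions only
    writer.startTerm(new BytesRef("apache"), 1);      // freq 1 -> exactly one addPosition
    writer.addPosition(0, -1, -1, null);
    writer.finishTerm();
    writer.startTerm(new BytesRef("lucene"), 1);
    writer.addPosition(1, -1, -1, null);
    writer.finishTerm();
    writer.finishField();                             // termCount back to 0
    writer.finishDocument();                          // fieldCount back to 0
    writer.finish(infos, 1);                          // total docs written
    writer.close();
  }
}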