LUCENE-4820: add optional payload to Analyzing/FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-03-13 19:13:56 +00:00
parent c476a59ff9
commit 234fa47dba
11 changed files with 399 additions and 32 deletions

View File

@ -61,6 +61,9 @@ New Features
subclasses with ctors taking AttributeFactory.
(Renaud Delbru, Uwe Schindler, Steve Rowe)
* LUCENE-4820: Add payloads to Analyzing/FuzzySuggester, to record an
arbitrary byte[] per suggestion (Mike McCandless)
Optimizations
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the

View File

@ -0,0 +1,36 @@
package org.apache.lucene.search.spell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; // javadocs
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; // javadocs
import org.apache.lucene.util.BytesRef;
/**
 * Interface for enumerating term, weight, payload triples consumed by a
 * suggester at build time; currently only {@link AnalyzingSuggester} and
 * {@link FuzzySuggester} support payloads (other suggesters reject an
 * iterator of this type in their {@code build} methods).
 */
public interface TermFreqPayloadIterator extends TermFreqIterator {
/** Returns an arbitrary byte[] to record with the current suggestion,
 * i.e. the term most recently returned by {@code next()}. See
 * {@link LookupResult#payload} to retrieve the payload
 * for each suggestion at lookup time. */
public BytesRef payload();
}

View File

@ -25,6 +25,7 @@ import java.util.List;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.PriorityQueue;
@ -39,17 +40,29 @@ public abstract class Lookup {
public static final class LookupResult implements Comparable<LookupResult> {
/** the key's text */
public final CharSequence key;
/** the key's weight */
public final long value;
/** the key's payload (null if not present) */
public final BytesRef payload;
/**
* Create a new result from a key+weight pair.
*/
public LookupResult(CharSequence key, long value) {
this(key, value, null);
}
/**
* Create a new result from a key+weight+payload triple.
*/
public LookupResult(CharSequence key, long value, BytesRef payload) {
this.key = key;
this.value = value;
this.payload = payload;
}
@Override
public String toString() {
return key + "/" + value;

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.store.ByteArrayDataInput;
@ -180,6 +181,10 @@ public class AnalyzingSuggester extends Lookup {
* graphs this will always be 1. */
private int maxAnalyzedPathsForOneInput;
private boolean hasPayloads;
private static final int PAYLOAD_SEP = '\u001f';
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@ -330,8 +335,15 @@ public class AnalyzingSuggester extends Lookup {
return new TokenStreamToAutomaton();
}
}
private static class AnalyzingComparator implements Comparator<BytesRef> {
private final boolean hasPayloads;
public AnalyzingComparator(boolean hasPayloads) {
this.hasPayloads = hasPayloads;
}
private Comparator<BytesRef> sortComparator = new Comparator<BytesRef>() {
private final ByteArrayDataInput readerA = new ByteArrayDataInput();
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
private final BytesRef scratchA = new BytesRef();
@ -367,10 +379,19 @@ public class AnalyzingSuggester extends Lookup {
}
// Finally by surface form:
scratchA.offset = readerA.getPosition();
scratchA.length = a.length - scratchA.offset;
scratchB.offset = readerB.getPosition();
scratchB.length = b.length - scratchB.offset;
if (hasPayloads) {
readerA.setPosition(readerA.getPosition() + scratchA.length);
scratchA.length = readerA.readShort();
scratchA.offset = readerA.getPosition();
readerB.setPosition(readerB.getPosition() + scratchB.length);
scratchB.length = readerB.readShort();
scratchB.offset = readerB.getPosition();
} else {
scratchA.offset = readerA.getPosition();
scratchA.length = a.length - scratchA.offset;
scratchB.offset = readerB.getPosition();
scratchB.length = b.length - scratchB.offset;
}
cmp = scratchA.compareTo(scratchB);
if (cmp != 0) {
@ -380,21 +401,28 @@ public class AnalyzingSuggester extends Lookup {
return 0;
}
};
@Override
public void build(TermFreqIterator iterator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
File tempInput = File.createTempFile(prefix, ".input", directory);
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
TermFreqPayloadIterator payloads;
if (iterator instanceof TermFreqPayloadIterator) {
payloads = (TermFreqPayloadIterator) iterator;
} else {
payloads = null;
}
hasPayloads = payloads != null;
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
boolean success = false;
byte buffer[] = new byte[8];
try {
@ -419,6 +447,19 @@ public class AnalyzingSuggester extends Lookup {
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE-2)) {
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
}
payload = payloads.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.grow(buffer, requiredLength);
@ -430,7 +471,18 @@ public class AnalyzingSuggester extends Lookup {
output.writeInt(encodeWeight(iterator.weight()));
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
if (hasPayloads) {
for(int i=0;i<surfaceForm.length;i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
}
assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
@ -440,7 +492,7 @@ public class AnalyzingSuggester extends Lookup {
writer.close();
// Sort all input/output pairs (required by FST.Builder):
new Sort(sortComparator).sort(tempInput, tempSorted);
new Sort(new AnalyzingComparator(payloads != null)).sort(tempInput, tempSorted);
// Free disk space:
tempInput.delete();
@ -474,8 +526,13 @@ public class AnalyzingSuggester extends Lookup {
long cost = input.readInt();
surface.bytes = scratch.bytes;
surface.offset = input.getPosition();
surface.length = scratch.length - surface.offset;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = scratch.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRef();
@ -513,7 +570,18 @@ public class AnalyzingSuggester extends Lookup {
Util.toIntsRef(analyzed, scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
if (!hasPayloads) {
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = scratch.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts, outputs.newPair(cost, br));
}
}
fst = builder.finish();
@ -542,6 +610,7 @@ public class AnalyzingSuggester extends Lookup {
fst.save(dataOut);
dataOut.writeVInt(maxAnalyzedPathsForOneInput);
dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
} finally {
IOUtils.close(output);
}
@ -554,12 +623,58 @@ public class AnalyzingSuggester extends Lookup {
try {
this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
maxAnalyzedPathsForOneInput = dataIn.readVInt();
hasPayloads = dataIn.readByte() == 1;
} finally {
IOUtils.close(input);
}
return true;
}
/**
 * Decodes one FST output pair into a {@link LookupResult}.
 *
 * <p>When payloads are enabled, {@code output2} holds
 * {@code surfaceForm + PAYLOAD_SEP + payload}; we split on the first
 * PAYLOAD_SEP byte.  Otherwise {@code output2} is the surface form alone.
 *
 * @param output1 encoded weight from the FST (decoded via {@code decodeWeight})
 * @param output2 surface form bytes, optionally followed by PAYLOAD_SEP and
 *        the payload.  NOTE: when payloads are present this ref's length is
 *        truncated in place to the surface form (intentional side effect,
 *        preserved from the original code).
 * @param spare scratch buffer used for the UTF-8 to UTF-16 conversion
 */
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
  LookupResult result;
  if (hasPayloads) {
    // Locate the separator between surface form and payload:
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    // Build-time encoding always writes PAYLOAD_SEP when hasPayloads:
    assert sepIndex != -1;
    spare.grow(sepIndex);
    int payloadLen = output2.length - sepIndex - 1;
    // Truncate so UTF8toUTF16 converts only the surface form:
    output2.length = sepIndex;
    UnicodeUtil.UTF8toUTF16(output2, spare);
    BytesRef payload = new BytesRef(payloadLen);
    // BUGFIX: honor output2.offset here, as the separator scan above already
    // does; without it a ref with non-zero offset copies the wrong bytes:
    System.arraycopy(output2.bytes, output2.offset + sepIndex + 1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    UnicodeUtil.UTF8toUTF16(output2, spare);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }
  return result;
}
/**
 * Returns true when {@code output2} encodes exactly the surface form held in
 * {@code key}.  With payloads enabled, output2 is the surface form followed
 * by PAYLOAD_SEP plus the payload bytes, so we compare the prefix and then
 * require the separator immediately after it; without payloads a plain byte
 * comparison suffices.
 */
private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
  if (!hasPayloads) {
    return key.bytesEquals(output2);
  }
  // output2 must be strictly longer: it carries at least the PAYLOAD_SEP byte.
  if (output2.length <= key.length) {
    return false;
  }
  for (int pos = 0; pos < key.length; pos++) {
    if (output2.bytes[output2.offset + pos] != key.bytes[key.offset + pos]) {
      return false;
    }
  }
  // The surface form must end exactly where the separator begins:
  return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
}
@Override
public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
assert num > 0;
@ -639,10 +754,9 @@ public class AnalyzingSuggester extends Lookup {
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
if (utf8Key.bytesEquals(completion.output.output2)) {
spare.grow(completion.output.output2.length);
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
@ -676,7 +790,7 @@ public class AnalyzingSuggester extends Lookup {
// In exactFirst mode, don't accept any paths
// matching the surface form since that will
// create duplicate results:
if (utf8Key.bytesEquals(output.output2)) {
if (sameSurfaceForm(utf8Key, output.output2)) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
@ -697,9 +811,8 @@ public class AnalyzingSuggester extends Lookup {
MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
spare.grow(completion.output.output2.length);
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required

View File

@ -25,9 +25,10 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.search.suggest.Sort.SortInfo;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
@ -141,6 +142,9 @@ public class FSTCompletionLookup extends Lookup {
@Override
public void build(TermFreqIterator tfit) throws IOException {
if (tfit instanceof TermFreqPayloadIterator) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
File tempInput = File.createTempFile(
FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
File tempSorted = File.createTempFile(

View File

@ -26,9 +26,10 @@ import java.util.Comparator;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;
@ -40,12 +41,12 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.MinResult;
import org.apache.lucene.util.fst.Util;
/**
* Suggester based on a weighted FST: it first traverses the prefix,
@ -93,6 +94,9 @@ public class WFSTCompletionLookup extends Lookup {
@Override
public void build(TermFreqIterator iterator) throws IOException {
if (iterator instanceof TermFreqPayloadIterator) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
BytesRef scratch = new BytesRef();
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator);
IntsRef scratchInts = new IntsRef();

View File

@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
@ -53,6 +54,9 @@ public class JaspellLookup extends Lookup {
@Override
public void build(TermFreqIterator tfit) throws IOException {
if (tfit instanceof TermFreqPayloadIterator) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
if (tfit.getComparator() != null) {
// make sure it's unsorted
// WTF - this could result in yet another sorted iteration....

View File

@ -25,9 +25,10 @@ import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
@ -51,6 +52,9 @@ public class TSTLookup extends Lookup {
@Override
public void build(TermFreqIterator tfit) throws IOException {
if (tfit instanceof TermFreqPayloadIterator) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
root = new TernaryTreeNode();
// buffer first
if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {

View File

@ -0,0 +1,36 @@
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;

/**
 * One suggester input entry: a term, its weight, and an arbitrary
 * per-suggestion payload.  Instances are shallowly immutable (the
 * {@link BytesRef}s themselves are not copied).
 */
public final class TermFreqPayload {
  /** Term text. */
  public final BytesRef term;
  /** Weight of the term. */
  public final long v;
  /** Arbitrary payload bytes recorded alongside the term. */
  public final BytesRef payload;

  /** Creates an entry from an already-encoded term. */
  public TermFreqPayload(BytesRef term, long v, BytesRef payload) {
    this.term = term;
    this.v = v;
    this.payload = payload;
  }

  /** Creates an entry from a String term, encoding it as UTF-8. */
  public TermFreqPayload(String term, long v, BytesRef payload) {
    this(new BytesRef(term), v, payload);
  }
}

View File

@ -0,0 +1,72 @@
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
import org.apache.lucene.util.BytesRef;

/**
 * A {@link TermFreqPayloadIterator} over a sequence of
 * {@link TermFreqPayload}s.  The ref returned by {@code next()} is a shared
 * scratch buffer, re-filled on every call.
 */
public final class TermFreqPayloadArrayIterator implements TermFreqPayloadIterator {
  private final Iterator<TermFreqPayload> source;
  private TermFreqPayload current;
  private final BytesRef spare = new BytesRef();

  /** Iterates over the given array of entries. */
  public TermFreqPayloadArrayIterator(TermFreqPayload[] entries) {
    this(Arrays.asList(entries));
  }

  /** Iterates over the given iterable of entries. */
  public TermFreqPayloadArrayIterator(Iterable<TermFreqPayload> entries) {
    this(entries.iterator());
  }

  /** Iterates over the given iterator of entries. */
  public TermFreqPayloadArrayIterator(Iterator<TermFreqPayload> entries) {
    this.source = entries;
  }

  @Override
  public BytesRef next() {
    if (!source.hasNext()) {
      return null;
    }
    current = source.next();
    spare.copyBytes(current.term);
    return spare;
  }

  @Override
  public long weight() {
    return current.v;
  }

  @Override
  public BytesRef payload() {
    return current.payload;
  }

  @Override
  public Comparator<BytesRef> getComparator() {
    // Entries are presented in input order; no sort order is promised.
    return null;
  }
}

View File

@ -53,6 +53,8 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.TermFreqPayload;
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@ -103,6 +105,56 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
assertEquals(6, results.get(2).value, 0.01F);
}
/** Verifies payloads survive build + lookup with a keyword analyzer:
 *  every suggestion must come back with its weight and original payload. */
public void testKeywordWithPayloads() throws Exception {
  TermFreqPayload inputs[] = new TermFreqPayload[] {
    new TermFreqPayload("foo", 50, new BytesRef("hello")),
    new TermFreqPayload("bar", 10, new BytesRef("goodbye")),
    new TermFreqPayload("barbar", 12, new BytesRef("thank you")),
    new TermFreqPayload("barbara", 6, new BytesRef("for all the fish"))
  };

  AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
  suggester.build(new TermFreqPayloadArrayIterator(inputs));

  // Top 2 for "f": only foo matches.
  List<LookupResult> hits = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
  assertEquals(1, hits.size());
  assertEquals("foo", hits.get(0).key.toString());
  assertEquals(50, hits.get(0).value, 0.01F);
  assertEquals(new BytesRef("hello"), hits.get(0).payload);

  // Top 1 for "bar": exactFirst promotes the exact match even though
  // barbar has a higher weight.
  hits = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1);
  assertEquals(1, hits.size());
  assertEquals("bar", hits.get(0).key.toString());
  assertEquals(10, hits.get(0).value, 0.01F);
  assertEquals(new BytesRef("goodbye"), hits.get(0).payload);

  // Top 2 for "b": results ordered by descending weight.
  hits = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2);
  assertEquals(2, hits.size());
  assertEquals("barbar", hits.get(0).key.toString());
  assertEquals(12, hits.get(0).value, 0.01F);
  assertEquals(new BytesRef("thank you"), hits.get(0).payload);
  assertEquals("bar", hits.get(1).key.toString());
  assertEquals(10, hits.get(1).value, 0.01F);
  assertEquals(new BytesRef("goodbye"), hits.get(1).payload);

  // Top 3 for "ba": all three bar* entries, by descending weight.
  hits = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3);
  assertEquals(3, hits.size());
  assertEquals("barbar", hits.get(0).key.toString());
  assertEquals(12, hits.get(0).value, 0.01F);
  assertEquals(new BytesRef("thank you"), hits.get(0).payload);
  assertEquals("bar", hits.get(1).key.toString());
  assertEquals(10, hits.get(1).value, 0.01F);
  assertEquals(new BytesRef("goodbye"), hits.get(1).payload);
  assertEquals("barbara", hits.get(2).key.toString());
  assertEquals(6, hits.get(2).value, 0.01F);
  assertEquals(new BytesRef("for all the fish"), hits.get(2).payload);
}
// TODO: more tests
/**
* basic "standardanalyzer" test with stopword removal
@ -435,11 +487,13 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
public final String surfaceForm;
public final String analyzedForm;
public final long weight;
public final BytesRef payload;
public TermFreq2(String surfaceForm, String analyzedForm, long weight) {
public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) {
this.surfaceForm = surfaceForm;
this.analyzedForm = analyzedForm;
this.weight = weight;
this.payload = payload;
}
@Override
@ -549,7 +603,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
final TreeSet<String> allPrefixes = new TreeSet<String>();
final Set<String> seen = new HashSet<String>();
TermFreq[] keys = new TermFreq[numQueries];
boolean doPayloads = random().nextBoolean();
TermFreq[] keys = null;
TermFreqPayload[] payloadKeys = null;
if (doPayloads) {
payloadKeys = new TermFreqPayload[numQueries];
} else {
keys = new TermFreq[numQueries];
}
boolean preserveSep = random().nextBoolean();
@ -614,9 +676,18 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
}
// we can probably do Integer.MAX_VALUE here, but why worry.
int weight = random().nextInt(1<<24);
keys[i] = new TermFreq(key, weight);
BytesRef payload;
if (doPayloads) {
byte[] bytes = new byte[random().nextInt(10)];
random().nextBytes(bytes);
payload = new BytesRef(bytes);
payloadKeys[i] = new TermFreqPayload(key, weight, payload);
} else {
keys[i] = new TermFreq(key, weight);
payload = null;
}
slowCompletor.add(new TermFreq2(key, analyzedKey, weight));
slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));
}
if (VERBOSE) {
@ -632,7 +703,11 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
suggester.build(new TermFreqArrayIterator(keys));
if (doPayloads) {
suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));
} else {
suggester.build(new TermFreqArrayIterator(keys));
}
for (String prefix : allPrefixes) {
@ -739,6 +814,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
//System.out.println(" check hit " + hit);
assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString());
assertEquals(matches.get(hit).weight, r.get(hit).value, 0f);
if (doPayloads) {
assertEquals(matches.get(hit).payload, r.get(hit).payload);
}
}
}
}