mirror of https://github.com/apache/lucene.git
LUCENE-4820: add optional payload to Analyzing/FuzzySuggester
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c476a59ff9
commit
234fa47dba
|
@ -61,6 +61,9 @@ New Features
|
|||
subclasses with ctors taking AttributeFactory.
|
||||
(Renaud Delbru, Uwe Schindler, Steve Rowe)
|
||||
|
||||
* LUCENE-4820: Add payloads to Analyzing/FuzzySuggester, to record an
|
||||
arbitrary byte[] per suggestion (Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
|
||||
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; // javadocs
|
||||
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; // javadocs
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Interface for enumerating term,weight,payload triples;
|
||||
* currently only {@link AnalyzingSuggester} and {@link
|
||||
* FuzzySuggester} support payloads.
|
||||
*/
|
||||
public interface TermFreqPayloadIterator extends TermFreqIterator {
|
||||
|
||||
/** An arbitrary byte[] to record per suggestion. See
|
||||
* {@link LookupResult#payload} to retrieve the payload
|
||||
* for each suggestion. */
|
||||
public BytesRef payload();
|
||||
}
|
|
@ -25,6 +25,7 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
|
@ -39,17 +40,29 @@ public abstract class Lookup {
|
|||
public static final class LookupResult implements Comparable<LookupResult> {
|
||||
/** the key's text */
|
||||
public final CharSequence key;
|
||||
|
||||
/** the key's weight */
|
||||
public final long value;
|
||||
|
||||
/** the key's payload (null if not present) */
|
||||
public final BytesRef payload;
|
||||
|
||||
/**
|
||||
* Create a new result from a key+weight pair.
|
||||
*/
|
||||
public LookupResult(CharSequence key, long value) {
|
||||
this(key, value, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new result from a key+weight+payload triple.
|
||||
*/
|
||||
public LookupResult(CharSequence key, long value, BytesRef payload) {
|
||||
this.key = key;
|
||||
this.value = value;
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return key + "/" + value;
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.TokenStreamToAutomaton;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.Sort;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
|
@ -180,6 +181,10 @@ public class AnalyzingSuggester extends Lookup {
|
|||
* graphs this will always be 1. */
|
||||
private int maxAnalyzedPathsForOneInput;
|
||||
|
||||
private boolean hasPayloads;
|
||||
|
||||
private static final int PAYLOAD_SEP = '\u001f';
|
||||
|
||||
/**
|
||||
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
|
||||
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
|
||||
|
@ -330,8 +335,15 @@ public class AnalyzingSuggester extends Lookup {
|
|||
return new TokenStreamToAutomaton();
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzingComparator implements Comparator<BytesRef> {
|
||||
|
||||
private final boolean hasPayloads;
|
||||
|
||||
public AnalyzingComparator(boolean hasPayloads) {
|
||||
this.hasPayloads = hasPayloads;
|
||||
}
|
||||
|
||||
private Comparator<BytesRef> sortComparator = new Comparator<BytesRef>() {
|
||||
private final ByteArrayDataInput readerA = new ByteArrayDataInput();
|
||||
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
|
||||
private final BytesRef scratchA = new BytesRef();
|
||||
|
@ -367,10 +379,19 @@ public class AnalyzingSuggester extends Lookup {
|
|||
}
|
||||
|
||||
// Finally by surface form:
|
||||
scratchA.offset = readerA.getPosition();
|
||||
scratchA.length = a.length - scratchA.offset;
|
||||
scratchB.offset = readerB.getPosition();
|
||||
scratchB.length = b.length - scratchB.offset;
|
||||
if (hasPayloads) {
|
||||
readerA.setPosition(readerA.getPosition() + scratchA.length);
|
||||
scratchA.length = readerA.readShort();
|
||||
scratchA.offset = readerA.getPosition();
|
||||
readerB.setPosition(readerB.getPosition() + scratchB.length);
|
||||
scratchB.length = readerB.readShort();
|
||||
scratchB.offset = readerB.getPosition();
|
||||
} else {
|
||||
scratchA.offset = readerA.getPosition();
|
||||
scratchA.length = a.length - scratchA.offset;
|
||||
scratchB.offset = readerB.getPosition();
|
||||
scratchB.length = b.length - scratchB.offset;
|
||||
}
|
||||
|
||||
cmp = scratchA.compareTo(scratchB);
|
||||
if (cmp != 0) {
|
||||
|
@ -380,21 +401,28 @@ public class AnalyzingSuggester extends Lookup {
|
|||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@Override
|
||||
public void build(TermFreqIterator iterator) throws IOException {
|
||||
String prefix = getClass().getSimpleName();
|
||||
File directory = Sort.defaultTempDir();
|
||||
File tempInput = File.createTempFile(prefix, ".input", directory);
|
||||
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
|
||||
|
||||
|
||||
TermFreqPayloadIterator payloads;
|
||||
if (iterator instanceof TermFreqPayloadIterator) {
|
||||
payloads = (TermFreqPayloadIterator) iterator;
|
||||
} else {
|
||||
payloads = null;
|
||||
}
|
||||
hasPayloads = payloads != null;
|
||||
|
||||
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
|
||||
Sort.ByteSequencesReader reader = null;
|
||||
BytesRef scratch = new BytesRef();
|
||||
|
||||
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
|
||||
|
||||
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
|
||||
boolean success = false;
|
||||
byte buffer[] = new byte[8];
|
||||
try {
|
||||
|
@ -419,6 +447,19 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// compute the required length:
|
||||
// analyzed sequence + weight (4) + surface + analyzedLength (short)
|
||||
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
|
||||
|
||||
BytesRef payload;
|
||||
|
||||
if (hasPayloads) {
|
||||
if (surfaceForm.length > (Short.MAX_VALUE-2)) {
|
||||
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
|
||||
}
|
||||
payload = payloads.payload();
|
||||
// payload + surfaceLength (short)
|
||||
requiredLength += payload.length + 2;
|
||||
} else {
|
||||
payload = null;
|
||||
}
|
||||
|
||||
buffer = ArrayUtil.grow(buffer, requiredLength);
|
||||
|
||||
|
@ -430,7 +471,18 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
output.writeInt(encodeWeight(iterator.weight()));
|
||||
|
||||
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
|
||||
if (hasPayloads) {
|
||||
for(int i=0;i<surfaceForm.length;i++) {
|
||||
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
|
||||
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
|
||||
}
|
||||
}
|
||||
output.writeShort((short) surfaceForm.length);
|
||||
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
|
||||
output.writeBytes(payload.bytes, payload.offset, payload.length);
|
||||
} else {
|
||||
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
|
||||
}
|
||||
|
||||
assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
|
||||
|
||||
|
@ -440,7 +492,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
writer.close();
|
||||
|
||||
// Sort all input/output pairs (required by FST.Builder):
|
||||
new Sort(sortComparator).sort(tempInput, tempSorted);
|
||||
new Sort(new AnalyzingComparator(payloads != null)).sort(tempInput, tempSorted);
|
||||
|
||||
// Free disk space:
|
||||
tempInput.delete();
|
||||
|
@ -474,8 +526,13 @@ public class AnalyzingSuggester extends Lookup {
|
|||
long cost = input.readInt();
|
||||
|
||||
surface.bytes = scratch.bytes;
|
||||
surface.offset = input.getPosition();
|
||||
surface.length = scratch.length - surface.offset;
|
||||
if (hasPayloads) {
|
||||
surface.length = input.readShort();
|
||||
surface.offset = input.getPosition();
|
||||
} else {
|
||||
surface.offset = input.getPosition();
|
||||
surface.length = scratch.length - surface.offset;
|
||||
}
|
||||
|
||||
if (previousAnalyzed == null) {
|
||||
previousAnalyzed = new BytesRef();
|
||||
|
@ -513,7 +570,18 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
Util.toIntsRef(analyzed, scratchInts);
|
||||
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
|
||||
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
|
||||
if (!hasPayloads) {
|
||||
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
|
||||
} else {
|
||||
int payloadOffset = input.getPosition() + surface.length;
|
||||
int payloadLength = scratch.length - payloadOffset;
|
||||
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
|
||||
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
|
||||
br.bytes[surface.length] = PAYLOAD_SEP;
|
||||
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
|
||||
br.length = br.bytes.length;
|
||||
builder.add(scratchInts, outputs.newPair(cost, br));
|
||||
}
|
||||
}
|
||||
fst = builder.finish();
|
||||
|
||||
|
@ -542,6 +610,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
fst.save(dataOut);
|
||||
dataOut.writeVInt(maxAnalyzedPathsForOneInput);
|
||||
dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
|
||||
} finally {
|
||||
IOUtils.close(output);
|
||||
}
|
||||
|
@ -554,12 +623,58 @@ public class AnalyzingSuggester extends Lookup {
|
|||
try {
|
||||
this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
|
||||
maxAnalyzedPathsForOneInput = dataIn.readVInt();
|
||||
hasPayloads = dataIn.readByte() == 1;
|
||||
} finally {
|
||||
IOUtils.close(input);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
|
||||
LookupResult result;
|
||||
if (hasPayloads) {
|
||||
int sepIndex = -1;
|
||||
for(int i=0;i<output2.length;i++) {
|
||||
if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
|
||||
sepIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert sepIndex != -1;
|
||||
spare.grow(sepIndex);
|
||||
int payloadLen = output2.length - sepIndex - 1;
|
||||
output2.length = sepIndex;
|
||||
UnicodeUtil.UTF8toUTF16(output2, spare);
|
||||
BytesRef payload = new BytesRef(payloadLen);
|
||||
System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
|
||||
payload.length = payloadLen;
|
||||
result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
|
||||
} else {
|
||||
spare.grow(output2.length);
|
||||
UnicodeUtil.UTF8toUTF16(output2, spare);
|
||||
result = new LookupResult(spare.toString(), decodeWeight(output1));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
|
||||
if (hasPayloads) {
|
||||
// output2 has at least PAYLOAD_SEP byte:
|
||||
if (key.length >= output2.length) {
|
||||
return false;
|
||||
}
|
||||
for(int i=0;i<key.length;i++) {
|
||||
if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
|
||||
} else {
|
||||
return key.bytesEquals(output2);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
|
||||
assert num > 0;
|
||||
|
@ -639,10 +754,9 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// nodes we have and the
|
||||
// maxSurfaceFormsPerAnalyzedForm:
|
||||
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
|
||||
if (utf8Key.bytesEquals(completion.output.output2)) {
|
||||
spare.grow(completion.output.output2.length);
|
||||
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
||||
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
|
||||
BytesRef output2 = completion.output.output2;
|
||||
if (sameSurfaceForm(utf8Key, output2)) {
|
||||
results.add(getLookupResult(completion.output.output1, output2, spare));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -676,7 +790,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// In exactFirst mode, don't accept any paths
|
||||
// matching the surface form since that will
|
||||
// create duplicate results:
|
||||
if (utf8Key.bytesEquals(output.output2)) {
|
||||
if (sameSurfaceForm(utf8Key, output.output2)) {
|
||||
// We found exact match, which means we should
|
||||
// have already found it in the first search:
|
||||
assert results.size() == 1;
|
||||
|
@ -697,9 +811,8 @@ public class AnalyzingSuggester extends Lookup {
|
|||
MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
|
||||
|
||||
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
|
||||
spare.grow(completion.output.output2.length);
|
||||
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
||||
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
|
||||
|
||||
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
|
||||
|
||||
// TODO: for fuzzy case would be nice to return
|
||||
// how many edits were required
|
||||
|
|
|
@ -25,9 +25,10 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.Sort;
|
||||
import org.apache.lucene.search.suggest.Sort.SortInfo;
|
||||
import org.apache.lucene.search.suggest.Sort;
|
||||
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
|
@ -141,6 +142,9 @@ public class FSTCompletionLookup extends Lookup {
|
|||
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
if (tfit instanceof TermFreqPayloadIterator) {
|
||||
throw new IllegalArgumentException("this suggester doesn't support payloads");
|
||||
}
|
||||
File tempInput = File.createTempFile(
|
||||
FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
|
||||
File tempSorted = File.createTempFile(
|
||||
|
|
|
@ -26,9 +26,10 @@ import java.util.Comparator;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
|
||||
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
|
@ -40,12 +41,12 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FST.Arc;
|
||||
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.fst.Util.MinResult;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/**
|
||||
* Suggester based on a weighted FST: it first traverses the prefix,
|
||||
|
@ -93,6 +94,9 @@ public class WFSTCompletionLookup extends Lookup {
|
|||
|
||||
@Override
|
||||
public void build(TermFreqIterator iterator) throws IOException {
|
||||
if (iterator instanceof TermFreqPayloadIterator) {
|
||||
throw new IllegalArgumentException("this suggester doesn't support payloads");
|
||||
}
|
||||
BytesRef scratch = new BytesRef();
|
||||
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator);
|
||||
IntsRef scratchInts = new IntsRef();
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
|
||||
|
@ -53,6 +54,9 @@ public class JaspellLookup extends Lookup {
|
|||
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
if (tfit instanceof TermFreqPayloadIterator) {
|
||||
throw new IllegalArgumentException("this suggester doesn't support payloads");
|
||||
}
|
||||
if (tfit.getComparator() != null) {
|
||||
// make sure it's unsorted
|
||||
// WTF - this could result in yet another sorted iteration....
|
||||
|
|
|
@ -25,9 +25,10 @@ import java.io.OutputStream;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -51,6 +52,9 @@ public class TSTLookup extends Lookup {
|
|||
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
if (tfit instanceof TermFreqPayloadIterator) {
|
||||
throw new IllegalArgumentException("this suggester doesn't support payloads");
|
||||
}
|
||||
root = new TernaryTreeNode();
|
||||
// buffer first
|
||||
if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
public final class TermFreqPayload {
|
||||
public final BytesRef term;
|
||||
public final long v;
|
||||
public final BytesRef payload;
|
||||
|
||||
public TermFreqPayload(String term, long v, BytesRef payload) {
|
||||
this(new BytesRef(term), v, payload);
|
||||
}
|
||||
|
||||
public TermFreqPayload(BytesRef term, long v, BytesRef payload) {
|
||||
this.term = term;
|
||||
this.v = v;
|
||||
this.payload = payload;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package org.apache.lucene.search.suggest;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
|
||||
*/
|
||||
public final class TermFreqPayloadArrayIterator implements TermFreqPayloadIterator {
|
||||
private final Iterator<TermFreqPayload> i;
|
||||
private TermFreqPayload current;
|
||||
private final BytesRef spare = new BytesRef();
|
||||
|
||||
public TermFreqPayloadArrayIterator(Iterator<TermFreqPayload> i) {
|
||||
this.i = i;
|
||||
}
|
||||
|
||||
public TermFreqPayloadArrayIterator(TermFreqPayload[] i) {
|
||||
this(Arrays.asList(i));
|
||||
}
|
||||
|
||||
public TermFreqPayloadArrayIterator(Iterable<TermFreqPayload> i) {
|
||||
this(i.iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
public long weight() {
|
||||
return current.v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() {
|
||||
if (i.hasNext()) {
|
||||
current = i.next();
|
||||
spare.copyBytes(current.term);
|
||||
return spare;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef payload() {
|
||||
return current.payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -53,6 +53,8 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
|||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||
import org.apache.lucene.search.suggest.TermFreq;
|
||||
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
|
||||
import org.apache.lucene.search.suggest.TermFreqPayload;
|
||||
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
@ -103,6 +105,56 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
assertEquals(6, results.get(2).value, 0.01F);
|
||||
}
|
||||
|
||||
public void testKeywordWithPayloads() throws Exception {
|
||||
TermFreqPayload keys[] = new TermFreqPayload[] {
|
||||
new TermFreqPayload("foo", 50, new BytesRef("hello")),
|
||||
new TermFreqPayload("bar", 10, new BytesRef("goodbye")),
|
||||
new TermFreqPayload("barbar", 12, new BytesRef("thank you")),
|
||||
new TermFreqPayload("barbara", 6, new BytesRef("for all the fish"))
|
||||
};
|
||||
|
||||
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||
|
||||
// top N of 2, but only foo is available
|
||||
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals("foo", results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
assertEquals(new BytesRef("hello"), results.get(0).payload);
|
||||
|
||||
// top N of 1 for 'bar': we return this even though
|
||||
// barbar is higher because exactFirst is enabled:
|
||||
results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals("bar", results.get(0).key.toString());
|
||||
assertEquals(10, results.get(0).value, 0.01F);
|
||||
assertEquals(new BytesRef("goodbye"), results.get(0).payload);
|
||||
|
||||
// top N Of 2 for 'b'
|
||||
results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2);
|
||||
assertEquals(2, results.size());
|
||||
assertEquals("barbar", results.get(0).key.toString());
|
||||
assertEquals(12, results.get(0).value, 0.01F);
|
||||
assertEquals(new BytesRef("thank you"), results.get(0).payload);
|
||||
assertEquals("bar", results.get(1).key.toString());
|
||||
assertEquals(10, results.get(1).value, 0.01F);
|
||||
assertEquals(new BytesRef("goodbye"), results.get(1).payload);
|
||||
|
||||
// top N of 3 for 'ba'
|
||||
results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3);
|
||||
assertEquals(3, results.size());
|
||||
assertEquals("barbar", results.get(0).key.toString());
|
||||
assertEquals(12, results.get(0).value, 0.01F);
|
||||
assertEquals(new BytesRef("thank you"), results.get(0).payload);
|
||||
assertEquals("bar", results.get(1).key.toString());
|
||||
assertEquals(10, results.get(1).value, 0.01F);
|
||||
assertEquals(new BytesRef("goodbye"), results.get(1).payload);
|
||||
assertEquals("barbara", results.get(2).key.toString());
|
||||
assertEquals(6, results.get(2).value, 0.01F);
|
||||
assertEquals(new BytesRef("for all the fish"), results.get(2).payload);
|
||||
}
|
||||
|
||||
// TODO: more tests
|
||||
/**
|
||||
* basic "standardanalyzer" test with stopword removal
|
||||
|
@ -435,11 +487,13 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
public final String surfaceForm;
|
||||
public final String analyzedForm;
|
||||
public final long weight;
|
||||
public final BytesRef payload;
|
||||
|
||||
public TermFreq2(String surfaceForm, String analyzedForm, long weight) {
|
||||
public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) {
|
||||
this.surfaceForm = surfaceForm;
|
||||
this.analyzedForm = analyzedForm;
|
||||
this.weight = weight;
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -549,7 +603,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
final TreeSet<String> allPrefixes = new TreeSet<String>();
|
||||
final Set<String> seen = new HashSet<String>();
|
||||
|
||||
TermFreq[] keys = new TermFreq[numQueries];
|
||||
boolean doPayloads = random().nextBoolean();
|
||||
|
||||
TermFreq[] keys = null;
|
||||
TermFreqPayload[] payloadKeys = null;
|
||||
if (doPayloads) {
|
||||
payloadKeys = new TermFreqPayload[numQueries];
|
||||
} else {
|
||||
keys = new TermFreq[numQueries];
|
||||
}
|
||||
|
||||
boolean preserveSep = random().nextBoolean();
|
||||
|
||||
|
@ -614,9 +676,18 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
}
|
||||
// we can probably do Integer.MAX_VALUE here, but why worry.
|
||||
int weight = random().nextInt(1<<24);
|
||||
keys[i] = new TermFreq(key, weight);
|
||||
BytesRef payload;
|
||||
if (doPayloads) {
|
||||
byte[] bytes = new byte[random().nextInt(10)];
|
||||
random().nextBytes(bytes);
|
||||
payload = new BytesRef(bytes);
|
||||
payloadKeys[i] = new TermFreqPayload(key, weight, payload);
|
||||
} else {
|
||||
keys[i] = new TermFreq(key, weight);
|
||||
payload = null;
|
||||
}
|
||||
|
||||
slowCompletor.add(new TermFreq2(key, analyzedKey, weight));
|
||||
slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
|
@ -632,7 +703,11 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
|
||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
|
||||
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
|
||||
suggester.build(new TermFreqArrayIterator(keys));
|
||||
if (doPayloads) {
|
||||
suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));
|
||||
} else {
|
||||
suggester.build(new TermFreqArrayIterator(keys));
|
||||
}
|
||||
|
||||
for (String prefix : allPrefixes) {
|
||||
|
||||
|
@ -739,6 +814,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
//System.out.println(" check hit " + hit);
|
||||
assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString());
|
||||
assertEquals(matches.get(hit).weight, r.get(hit).value, 0f);
|
||||
if (doPayloads) {
|
||||
assertEquals(matches.get(hit).payload, r.get(hit).payload);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue