From 4f4f3a2b103b707e75d3dcedc6debd7698e11a17 Mon Sep 17 00:00:00 2001
From: Alexander Reelsen
Date: Thu, 1 Aug 2013 08:44:09 +0200
Subject: [PATCH] Added prefix suggestions based on AnalyzingSuggester

This commit introduces near-realtime suggestions. For more information
about their usage, refer to GitHub issue #3376.

From the implementation point of view, a custom AnalyzingSuggester is
used in combination with a custom postings format (which is not exposed
anywhere for the user to configure).

Closes #3376
---
 .../analyzing/XAnalyzingSuggester.java        | 1033 +++++++++++++++++
 .../index/mapper/DocumentMapperParser.java    |    1 +
 .../mapper/core/CompletionFieldMapper.java    |  317 +++++
 .../elasticsearch/search/suggest/Suggest.java |   12 +-
 .../search/suggest/SuggestModule.java         |    2 +
 .../search/suggest/SuggestUtils.java          |    1 +
 .../AnalyzingCompletionLookupProvider.java    |  260 +++++
 .../Completion090PostingsFormat.java          |  332 ++++++
 .../CompletionPostingsFormatProvider.java     |   41 +
 .../completion/CompletionSuggestParser.java   |   60 +
 .../completion/CompletionSuggester.java       |  112 ++
 .../completion/CompletionSuggestion.java      |  117 ++
 .../CompletionSuggestionBuilder.java          |   41 +
 .../CompletionSuggestionContext.java          |   43 +
 .../completion/CompletionTokenStream.java     |  145 +++
 .../suggest/completion/PayloadProcessor.java  |   35 +
 .../org.apache.lucene.codecs.PostingsFormat   |    1 +
 .../suggest/CompletionPostingsFormatTest.java |  251 ++++
 .../suggest/CompletionSuggestSearchTests.java |  454 ++++++++
 .../suggest/CompletionTokenStreamTest.java    |  169 +++
 .../CompletionFieldMapperTests.java           |   88 ++
 21 files changed, 3512 insertions(+), 3 deletions(-)
 create mode 100644 src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
 create mode 100644 src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/AnalyzingCompletionLookupProvider.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/Completion090PostingsFormat.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatProvider.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestParser.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java
 create mode 100644 src/main/java/org/elasticsearch/search/suggest/completion/PayloadProcessor.java
 create mode 100644 src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java
 create mode 100644 src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java
 create mode 100644 src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionTokenStreamTest.java
 create mode 100644 src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java

diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
new file mode 100644 index 00000000000..51249984a2f --- /dev/null +++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java @@ -0,0 +1,1033 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.search.suggest.analyzing; + +import gnu.trove.map.hash.TObjectIntHashMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Sort; +import org.apache.lucene.store.*; +import org.apache.lucene.util.*; +import org.apache.lucene.util.automaton.*; +import org.apache.lucene.util.fst.*; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.Util.MinResult; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.*; + +/** + * Suggester that first analyzes the surface form, adds the + * analyzed form to a weighted FST, and then does the same + * thing at lookup time. This means lookup is based on the + * analyzed form while suggestions are still the surface + * form(s). + * + *
+ * <p>
+ * This can result in powerful suggester functionality. For + * example, if you use an analyzer removing stop words, + * then the partial text "ghost chr..." could see the + * suggestion "The Ghost of Christmas Past". Note that + * position increments MUST NOT be preserved for this example + * to work, so you should call + * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}. + * + *
+ * <p>
+ * If SynonymFilter is used to map wifi and wireless network to + * hotspot then the partial text "wirele..." could suggest + * "wifi router". Token normalization like stemmers, accent + * removal, etc., would allow suggestions to ignore such + * variations. + * + *
+ * <p>
+ * When two matching suggestions have the same weight, they + * are tie-broken by the analyzed form. If their analyzed + * form is the same then the order is undefined. + * + *
+ * <p>
+ * There are some limitations: + *

+ * + * @lucene.experimental + */ +public class XAnalyzingSuggester extends Lookup { + + /** + * FST: + * input is the analyzed form, with a null byte between terms + * weights are encoded as costs: (Integer.MAX_VALUE-weight) + * surface is the original, unanalyzed form. + */ + private FST> fst = null; + + /** + * Analyzer that will be used for analyzing suggestions at + * index time. + */ + private final Analyzer indexAnalyzer; + + /** + * Analyzer that will be used for analyzing suggestions at + * query time. + */ + private final Analyzer queryAnalyzer; + + /** + * True if exact match suggestions should always be returned first. + */ + private final boolean exactFirst; + + /** + * True if separator between tokens should be preserved. + */ + private final boolean preserveSep; + + /** Include this flag in the options parameter to {@link + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to always + * return the exact match first, regardless of score. This + * has no performance impact but could result in + * low-quality suggestions. */ + public static final int EXACT_FIRST = 1; + + /** Include this flag in the options parameter to {@link + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to preserve + * token separators when matching. */ + public static final int PRESERVE_SEP = 2; + + /** Represents the separation between tokens, if + * PRESERVE_SEP was specified */ + private static final int SEP_LABEL = 0xFF; + + /** Marks end of the analyzed input and start of dedup + * byte. */ + private static final int END_BYTE = 0x0; + + /** Maximum number of dup surface forms (different surface + * forms for the same analyzed form). */ + private final int maxSurfaceFormsPerAnalyzedForm; + + /** Maximum graph paths to index for a single analyzed + * surface form. This only matters if your analyzer + * makes lots of alternate paths (e.g. contains + * SynonymFilter). */ + private final int maxGraphExpansions; + + /** Highest number of analyzed paths we saw for any single + * input surface form. For analyzers that never create + * graphs this will always be 1. */ + private int maxAnalyzedPathsForOneInput; + + private boolean hasPayloads; + + private static final int PAYLOAD_SEP = '\u001f'; + + /** Whether position holes should appear in the automaton. */ + private boolean preservePositionIncrements; + + /** + * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int) + * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + */ + public XAnalyzingSuggester(Analyzer analyzer) { + this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0); + } + + /** + * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int) + * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0); + } + + /** + * Creates a new suggester. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. 
+ * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. + */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions + , FST> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) { + // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput + this.indexAnalyzer = indexAnalyzer; + this.queryAnalyzer = queryAnalyzer; + this.fst = fst; + this.hasPayloads = hasPayloads; + if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { + throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); + } + this.exactFirst = (options & EXACT_FIRST) != 0; + this.preserveSep = (options & PRESERVE_SEP) != 0; + + // NOTE: this is just an implementation limitation; if + // somehow this is a problem we could fix it by using + // more than one byte to disambiguate ... but 256 seems + // like it should be way more then enough. + if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) { + throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")"); + } + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + + if (maxGraphExpansions < 1 && maxGraphExpansions != -1) { + throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); + } + this.maxGraphExpansions = maxGraphExpansions; + this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; + this.preservePositionIncrements = true; + } + + /** Whether to take position holes (position increment > 1) into account when + * building the automaton, true by default. */ + public void setPreservePositionIncrements(boolean preservePositionIncrements) { + this.preservePositionIncrements = preservePositionIncrements; + } + + /** Returns byte size of the underlying FST. */ + public long sizeInBytes() { + return fst == null ? 0 : fst.sizeInBytes(); + } + + private static void copyDestTransitions(State from, State to, List transitions) { + if (to.isAccept()) { + from.setAccept(true); + } + for(Transition t : to.getTransitions()) { + transitions.add(t); + } + } + + // Replaces SEP with epsilon or remaps them if + // we were asked to preserve them: + private static void replaceSep(Automaton a, boolean preserveSep) { + + State[] states = a.getNumberedStates(); + + // Go in reverse topo sort so we know we only have to + // make one pass: + for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) { + final State state = states[stateNumber]; + List newTransitions = new ArrayList(); + for(Transition t : state.getTransitions()) { + assert t.getMin() == t.getMax(); + if (t.getMin() == TokenStreamToAutomaton.POS_SEP) { + if (preserveSep) { + // Remap to SEP_LABEL: + newTransitions.add(new Transition(SEP_LABEL, t.getDest())); + } else { + copyDestTransitions(state, t.getDest(), newTransitions); + a.setDeterministic(false); + } + } else if (t.getMin() == TokenStreamToAutomaton.HOLE) { + + // Just remove the hole: there will then be two + // SEP tokens next to each other, which will only + // match another hole at search time. Note that + // it will also match an empty-string token ... 
if + // that's somehow a problem we can always map HOLE + // to a dedicated byte (and escape it in the + // input). + copyDestTransitions(state, t.getDest(), newTransitions); + a.setDeterministic(false); + } else { + newTransitions.add(t); + } + } + state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()])); + } + } + + /** Just escapes the 0xff byte (which we still for SEP). */ + private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + + final BytesRef spare = new BytesRef(); + + @Override + protected BytesRef changeToken(BytesRef in) { + int upto = 0; + for(int i=0;i { + + private final boolean hasPayloads; + + public AnalyzingComparator(boolean hasPayloads) { + this.hasPayloads = hasPayloads; + } + + private final ByteArrayDataInput readerA = new ByteArrayDataInput(); + private final ByteArrayDataInput readerB = new ByteArrayDataInput(); + private final BytesRef scratchA = new BytesRef(); + private final BytesRef scratchB = new BytesRef(); + + @Override + public int compare(BytesRef a, BytesRef b) { + + // First by analyzed form: + readerA.reset(a.bytes, a.offset, a.length); + scratchA.length = readerA.readShort(); + scratchA.bytes = a.bytes; + scratchA.offset = readerA.getPosition(); + + readerB.reset(b.bytes, b.offset, b.length); + scratchB.bytes = b.bytes; + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + + int cmp = scratchA.compareTo(scratchB); + if (cmp != 0) { + return cmp; + } + readerA.skipBytes(scratchA.length); + readerB.skipBytes(scratchB.length); + // Next by cost: + long aCost = readerA.readInt(); + long bCost = readerB.readInt(); + if (aCost < bCost) { + return -1; + } else if (aCost > bCost) { + return 1; + } + + // Finally by surface form: + if (hasPayloads) { + scratchA.length = readerA.readShort(); + scratchA.offset = readerA.getPosition(); + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + } else { + scratchA.offset = readerA.getPosition(); + scratchA.length = a.length - scratchA.offset; + scratchB.offset = readerB.getPosition(); + scratchB.length = b.length - scratchB.offset; + } + return scratchA.compareTo(scratchB); + } + }; + + @Override + public void build(TermFreqIterator iterator) throws IOException { + String prefix = getClass().getSimpleName(); + File directory = Sort.defaultTempDir(); + File tempInput = File.createTempFile(prefix, ".input", directory); + File tempSorted = File.createTempFile(prefix, ".sorted", directory); + + TermFreqPayloadIterator payloads; + if (iterator instanceof TermFreqPayloadIterator) { + payloads = (TermFreqPayloadIterator) iterator; + } else { + payloads = null; + } + hasPayloads = payloads != null; + + Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + Sort.ByteSequencesReader reader = null; + BytesRef scratch = new BytesRef(); + + TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + + boolean success = false; + byte buffer[] = new byte[8]; + try { + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + BytesRef surfaceForm; + + while ((surfaceForm = iterator.next()) != null) { + Set paths = toFiniteStrings(surfaceForm, ts2a); + + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); + + for (IntsRef path : paths) { + + Util.toBytesRef(path, scratch); + + // length of the analyzed text (FST input) + if (scratch.length > Short.MAX_VALUE-2) { + throw new IllegalArgumentException("cannot handle analyzed forms > " + 
(Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")"); + } + short analyzedLength = (short) scratch.length; + + // compute the required length: + // analyzed sequence + weight (4) + surface + analyzedLength (short) + int requiredLength = analyzedLength + 4 + surfaceForm.length + 2; + + BytesRef payload; + + if (hasPayloads) { + if (surfaceForm.length > (Short.MAX_VALUE-2)) { + throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")"); + } + payload = payloads.payload(); + // payload + surfaceLength (short) + requiredLength += payload.length + 2; + } else { + payload = null; + } + + buffer = ArrayUtil.grow(buffer, requiredLength); + + output.reset(buffer); + + output.writeShort(analyzedLength); + + output.writeBytes(scratch.bytes, scratch.offset, scratch.length); + + output.writeInt(encodeWeight(iterator.weight())); + + if (hasPayloads) { + for(int i=0;i outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); + + // Build FST: + BytesRef previousAnalyzed = null; + BytesRef analyzed = new BytesRef(); + BytesRef surface = new BytesRef(); + IntsRef scratchInts = new IntsRef(); + ByteArrayDataInput input = new ByteArrayDataInput(); + + // Used to remove duplicate surface forms (but we + // still index the hightest-weight one). We clear + // this when we see a new analyzed form, so it cannot + // grow unbounded (at most 256 entries): + Set seenSurfaceForms = new HashSet(); + + int dedup = 0; + while (reader.read(scratch)) { + input.reset(scratch.bytes, scratch.offset, scratch.length); + short analyzedLength = input.readShort(); + analyzed.grow(analyzedLength+2); + input.readBytes(analyzed.bytes, 0, analyzedLength); + analyzed.length = analyzedLength; + + long cost = input.readInt(); + + surface.bytes = scratch.bytes; + if (hasPayloads) { + surface.length = input.readShort(); + surface.offset = input.getPosition(); + } else { + surface.offset = input.getPosition(); + surface.length = scratch.length - surface.offset; + } + + if (previousAnalyzed == null) { + previousAnalyzed = new BytesRef(); + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else if (analyzed.equals(previousAnalyzed)) { + dedup++; + if (dedup >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + continue; + } + if (seenSurfaceForms.contains(surface)) { + continue; + } + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else { + dedup = 0; + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.clear(); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } + + // TODO: I think we can avoid the extra 2 bytes when + // there is no dup (dedup==0), but we'd have to fix + // the exactFirst logic ... which would be sort of + // hairy because we'd need to special case the two + // (dup/not dup)... 
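+        // Key layout appended below: the analyzed-form bytes, then a 0 byte
+        // (END_BYTE), then a single dedup byte, which is why at most 256
+        // surface forms per analyzed form can be kept as distinct FST inputs.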
+ + // NOTE: must be byte 0 so we sort before whatever + // is next + analyzed.bytes[analyzed.offset+analyzed.length] = 0; + analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup; + analyzed.length += 2; + + Util.toIntsRef(analyzed, scratchInts); + //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); + if (!hasPayloads) { + builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface))); + } else { + int payloadOffset = input.getPosition() + surface.length; + int payloadLength = scratch.length - payloadOffset; + BytesRef br = new BytesRef(surface.length + 1 + payloadLength); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = PAYLOAD_SEP; + System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); + br.length = br.bytes.length; + builder.add(scratchInts, outputs.newPair(cost, br)); + } + } + fst = builder.finish(); + + //Util.dotToFile(fst, "/tmp/suggest.dot"); + + success = true; + } finally { + if (success) { + IOUtils.close(reader, writer); + } else { + IOUtils.closeWhileHandlingException(reader, writer); + } + + tempInput.delete(); + tempSorted.delete(); + } + } + + @Override + public boolean store(OutputStream output) throws IOException { + DataOutput dataOut = new OutputStreamDataOutput(output); + try { + if (fst == null) { + return false; + } + + fst.save(dataOut); + dataOut.writeVInt(maxAnalyzedPathsForOneInput); + dataOut.writeByte((byte) (hasPayloads ? 1 : 0)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + DataInput dataIn = new InputStreamDataInput(input); + try { + this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + maxAnalyzedPathsForOneInput = dataIn.readVInt(); + hasPayloads = dataIn.readByte() == 1; + } finally { + IOUtils.close(input); + } + return true; + } + + private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) { + LookupResult result; + if (hasPayloads) { + int sepIndex = -1; + for(int i=0;i= output2.length) { + return false; + } + for(int i=0;i lookup(final CharSequence key, boolean onlyMorePopular, int num) { + assert num > 0; + + if (onlyMorePopular) { + throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false"); + } + if (fst == null) { + return Collections.emptyList(); + } + + //System.out.println("lookup key=" + key + " num=" + num); + final BytesRef utf8Key = new BytesRef(key); + try { + + Automaton lookupAutomaton = toLookupAutomaton(key); + + final CharsRef spare = new CharsRef(); + + //System.out.println(" now intersect exactFirst=" + exactFirst); + + // Intersect automaton w/ suggest wFST and get all + // prefix starting nodes & their outputs: + //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); + + //System.out.println(" prefixPaths: " + prefixPaths.size()); + + BytesReader bytesReader = fst.getBytesReader(); + + FST.Arc> scratchArc = new FST.Arc>(); + + final List results = new ArrayList(); + + List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst); + + if (exactFirst) { + + int count = 0; + for (FSTUtil.Path> path : prefixPaths) { + if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + count++; + } + } + + // Searcher just to find 
the single exact only + // match, if present: + Util.TopNSearcher> searcher; + searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + + // NOTE: we could almost get away with only using + // the first start node. The only catch is if + // maxSurfaceFormsPerAnalyzedForm had kicked in and + // pruned our exact match from one of these nodes + // ...: + for (FSTUtil.Path> path : prefixPaths) { + if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input); + } + } + + MinResult> completions[] = searcher.search(); + + // NOTE: this is rather inefficient: we enumerate + // every matching "exactly the same analyzed form" + // path, and then do linear scan to see if one of + // these exactly matches the input. It should be + // possible (though hairy) to do something similar + // to getByOutput, since the surface form is encoded + // into the FST output, so we more efficiently hone + // in on the exact surface-form match. Still, I + // suspect very little time is spent in this linear + // seach: it's bounded by how many prefix start + // nodes we have and the + // maxSurfaceFormsPerAnalyzedForm: + for(MinResult> completion : completions) { + BytesRef output2 = completion.output.output2; + if (sameSurfaceForm(utf8Key, output2)) { + results.add(getLookupResult(completion.output.output1, output2, spare)); + break; + } + } + + if (results.size() == num) { + // That was quick: + return results; + } + } + + Util.TopNSearcher> searcher; + searcher = new Util.TopNSearcher>(fst, + num - results.size(), + num * maxAnalyzedPathsForOneInput, + weightComparator) { + private final Set seen = new HashSet(); + + @Override + protected boolean acceptResult(IntsRef input, Pair output) { + + // Dedup: when the input analyzes to a graph we + // can get duplicate surface forms: + if (seen.contains(output.output2)) { + return false; + } + seen.add(output.output2); + + if (!exactFirst) { + return true; + } else { + // In exactFirst mode, don't accept any paths + // matching the surface form since that will + // create duplicate results: + if (sameSurfaceForm(utf8Key, output.output2)) { + // We found exact match, which means we should + // have already found it in the first search: + assert results.size() == 1; + return false; + } else { + return true; + } + } + } + }; + + prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); + + for (FSTUtil.Path> path : prefixPaths) { + searcher.addStartPaths(path.fstNode, path.output, true, path.input); + } + + MinResult> completions[] = searcher.search(); + + for(MinResult> completion : completions) { + + LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare); + + // TODO: for fuzzy case would be nice to return + // how many edits were required + + //System.out.println(" result=" + result); + results.add(result); + + if (results.size() == num) { + // In the exactFirst=true case the search may + // produce one extra path + break; + } + } + + return results; + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + /** Returns all completion paths to initialize the search. 
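+   * The default implementation returns the given prefix paths unchanged;
+   * subclasses (for example a fuzzy variant) can override this to expand
+   * the set of paths used to seed the search.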
*/ + protected List>> getFullPrefixPaths(List>> prefixPaths, + Automaton lookupAutomaton, + FST> fst) + throws IOException { + return prefixPaths; + } + + final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + // Analyze surface form: + TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString()); + return toFiniteStrings(ts2a, ts); + } + + public final Set toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException { + // Analyze surface form: + + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = ts2a.toAutomaton(ts); + ts.close(); + + replaceSep(automaton, preserveSep); + + assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + // TODO: we could walk & add simultaneously, so we + // don't have to alloc [possibly biggish] + // intermediate HashSet in RAM: + return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + } + + final Automaton toLookupAutomaton(final CharSequence key) throws IOException { + // Turn tokenstream into automaton: + TokenStream ts = queryAnalyzer.tokenStream("", key.toString()); + Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + ts.close(); + + // TODO: we could use the end offset to "guess" + // whether the final token was a partial token; this + // would only be a heuristic ... but maybe an OK one. + // This way we could eg differentiate "net" from "net ", + // which we can't today... + + replaceSep(automaton, preserveSep); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + BasicOperations.determinize(automaton); + return automaton; + } + + + + /** + * Returns the weight associated with an input string, + * or null if it does not exist. 
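+   * This suggester does not support this operation; the implementation
+   * below always throws {@link UnsupportedOperationException}.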
+ */ + public Object get(CharSequence key) { + throw new UnsupportedOperationException(); + } + + /** cost -> weight */ + public static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); + } + + /** weight -> cost */ + public static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return Integer.MAX_VALUE - (int)value; + } + + static final Comparator> weightComparator = new Comparator> () { + @Override + public int compare(Pair left, Pair right) { + return left.output1.compareTo(right.output1); + } + }; + + + public static class XBuilder { + private Builder> builder; + BytesRef previousAnalyzed = null; + private int maxSurfaceFormsPerAnalyzedForm; + private IntsRef scratchInts = new IntsRef(); + private final PairOutputs outputs; + private boolean hasPayloads; + private BytesRef analyzed = new BytesRef(); + private final SurfaceFormAndPayload[] surfaceFormsAndPayload; + private int count; + private TObjectIntHashMap seenSurfaceForms = new TObjectIntHashMap(256, 0.75f, -1); + + public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads) { + this.outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + this.builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + this.hasPayloads = hasPayloads; + surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm]; + + } + public void startTerm(BytesRef analyzed) { + this.analyzed.copyBytes(analyzed); + this.analyzed.grow(analyzed.length+2); + } + + private final static class SurfaceFormAndPayload implements Comparable { + BytesRef payload; + long weight; + + public SurfaceFormAndPayload(BytesRef payload, long cost) { + super(); + this.payload = payload; + this.weight = cost; + } + + @Override + public int compareTo(SurfaceFormAndPayload o) { + int res = compare(weight, o.weight); + if (res == 0 ){ + return payload.compareTo(o.payload); + } + return res; + } + public static int compare(long x, long y) { + return (x < y) ? -1 : ((x == y) ? 0 : 1); + } + } + + public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException { + int surfaceIndex = -1; + long encodedWeight = cost == -1 ? cost : encodeWeight(cost); + /* + * we need to check if we have seen this surface form, if so only use the + * the surface form with the highest weight and drop the rest no matter if + * the payload differs. 
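+         * A cost of -1 means "no explicit weight"; finishTerm(long) then
+         * falls back to the supplied default weight for that surface form.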
+ */ + if (count >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + return; + } + BytesRef surfaceCopy; + if (count > 0 && (surfaceIndex = seenSurfaceForms.get(surface)) >= 0) { + SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex]; + if (encodedWeight >= surfaceFormAndPayload.weight) { + return; + } + surfaceCopy = BytesRef.deepCopyOf(surface); + } else { + surfaceIndex = count++; + surfaceCopy = BytesRef.deepCopyOf(surface); + seenSurfaceForms.put(surfaceCopy, surfaceIndex); + } + + BytesRef payloadRef; + if (!hasPayloads) { + payloadRef = surfaceCopy; + } else { + int len = surface.length + 1 + payload.length; + final BytesRef br = new BytesRef(len); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = PAYLOAD_SEP; + System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length); + br.length = len; + payloadRef = br; + } + if (surfaceFormsAndPayload[surfaceIndex] == null) { + surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight); + } else { + surfaceFormsAndPayload[surfaceIndex].payload = payloadRef; + surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight; + } + } + + public void finishTerm(long defaultWeight) throws IOException { + ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); + int deduplicator = 0; + analyzed.bytes[analyzed.offset + analyzed.length] = 0; + analyzed.length += 2; + for (int i = 0; i < count; i++) { + analyzed.bytes[analyzed.offset + analyzed.length - 1 ] = (byte) deduplicator++; + Util.toIntsRef(analyzed, scratchInts); + SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; + long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; + builder.add(scratchInts, outputs.newPair(cost, candiate.payload)); + } + seenSurfaceForms.clear(); + count = 0; + } + + public FST> build() throws IOException { + return builder.finish(); + } + + public boolean hasPayloads() { + return hasPayloads; + } + + public int maxSurfaceFormsPerAnalyzedForm() { + return maxSurfaceFormsPerAnalyzedForm; + } + + } +} diff --git a/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java b/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java index 464b9a917fe..b51f1a555aa 100644 --- a/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java +++ b/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java @@ -92,6 +92,7 @@ public class DocumentMapperParser extends AbstractIndexComponent { .put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser()) .put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser()) .put(MultiFieldMapper.CONTENT_TYPE, new MultiFieldMapper.TypeParser()) + .put(CompletionFieldMapper.CONTENT_TYPE, new CompletionFieldMapper.TypeParser()) .put(GeoPointFieldMapper.CONTENT_TYPE, new GeoPointFieldMapper.TypeParser()); if (ShapesAvailability.JTS_AVAILABLE) { diff --git a/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java new file mode 100644 index 00000000000..45205102b7e --- /dev/null +++ b/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java @@ -0,0 +1,317 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper.core; + +import com.google.common.collect.Lists; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; +import org.elasticsearch.index.fielddata.FieldDataType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperException; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.similarity.SimilarityProvider; +import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider; +import org.elasticsearch.search.suggest.completion.CompletionPostingsFormatProvider; +import org.elasticsearch.search.suggest.completion.CompletionTokenStream; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * + */ +public class CompletionFieldMapper extends AbstractFieldMapper { + + public static final String CONTENT_TYPE = "completion"; + + public static class Defaults extends AbstractFieldMapper.Defaults { + public static final FieldType FIELD_TYPE = new FieldType(AbstractFieldMapper.Defaults.FIELD_TYPE); + + static { + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + + public static final boolean DEFAULT_PRESERVE_SEPARATORS = true; + public static final boolean DEFAULT_POSITION_INCREMENTS = true; + public static final boolean DEFAULT_HAS_PAYLOADS = false; + } + + public static class Fields { + public static final String INDEX_ANALYZER = "index_analyzer"; + public static final String SEARCH_ANALYZER = "search_analyzer"; + public static final String PRESERVE_SEPARATORS = "preserve_separators"; + public static final String PRESERVE_POSITION_INCREMENTS = "preserve_position_increments"; + public static final String PAYLOADS = "payloads"; + public static final String TYPE = "type"; + } + + public static class Builder extends AbstractFieldMapper.OpenBuilder { + + private NamedAnalyzer searchAnalyzer; + private NamedAnalyzer indexAnalyzer; + private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS; + private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS; + private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS; + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE); + } + + public Builder 
searchAnalyzer(NamedAnalyzer searchAnalyzer) { + this.searchAnalyzer = searchAnalyzer; + return this; + } + + public Builder indexAnalyzer(NamedAnalyzer indexAnalyzer) { + this.indexAnalyzer = indexAnalyzer; + return this; + } + + public Builder payloads(boolean payloads) { + this.payloads = payloads; + return this; + } + + public Builder preserveSeparators(boolean preserveSeparators) { + this.preserveSeparators = preserveSeparators; + return this; + } + + public Builder preservePositionIncrements(boolean preservePositionIncrements) { + this.preservePositionIncrements = preservePositionIncrements; + return this; + } + + @Override + public CompletionFieldMapper build(Mapper.BuilderContext context) { + return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, preserveSeparators, preservePositionIncrements); + } + } + + public static class TypeParser implements Mapper.TypeParser { + + @Override + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { + CompletionFieldMapper.Builder builder = new CompletionFieldMapper.Builder(name); + for (Map.Entry entry : node.entrySet()) { + String fieldName = entry.getKey(); + Object fieldNode = entry.getValue(); + if (fieldName.equals("type")) { + continue; + } + if (fieldName.equals(Fields.INDEX_ANALYZER) || fieldName.equals("indexAnalyzer")) { + builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); + } else if (fieldName.equals(Fields.SEARCH_ANALYZER) || fieldName.equals("searchAnalyzer")) { + builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); + } else if (fieldName.equals(Fields.PAYLOADS)) { + builder.payloads(Boolean.parseBoolean(fieldNode.toString())); + } else if (fieldName.equals(Fields.PRESERVE_SEPARATORS) || fieldName.equals("preserveSeparators")) { + builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString())); + } else if (fieldName.equals(Fields.PRESERVE_POSITION_INCREMENTS) || fieldName.equals("preservePositionIncrements")) { + builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString())); + } + } + + if (builder.searchAnalyzer == null) { + builder.searchAnalyzer(parserContext.analysisService().analyzer("simple")); + } + + if (builder.indexAnalyzer == null) { + builder.indexAnalyzer(parserContext.analysisService().analyzer("simple")); + } + // we are just using this as the default to be wrapped by the CompletionPostingsFormatProvider in the SuggesteFieldMapper ctor + builder.postingsFormat(parserContext.postingFormatService().get("default")); + return builder; + } + } + + private static final BytesRef EMPTY = new BytesRef(); + + private final CompletionPostingsFormatProvider completionPostingsFormatProvider; + private final AnalyzingCompletionLookupProvider analyzingSuggestLookupProvider; + private final boolean payloads; + private final boolean preservePositionIncrements; + private final boolean preserveSeparators; + + public CompletionFieldMapper(Names names, NamedAnalyzer indexAnalyzer, NamedAnalyzer searchAnalyzer, PostingsFormatProvider provider, SimilarityProvider similarity, boolean payloads, + boolean preserveSeparators, boolean preservePositionIncrements) { + super(names, 1.0f, Defaults.FIELD_TYPE, indexAnalyzer, searchAnalyzer, provider, similarity, null); + analyzingSuggestLookupProvider = new AnalyzingCompletionLookupProvider(preserveSeparators, false, preservePositionIncrements, payloads); + 
this.completionPostingsFormatProvider = new CompletionPostingsFormatProvider("completion", provider, analyzingSuggestLookupProvider); + this.preserveSeparators = preserveSeparators; + this.payloads = payloads; + this.preservePositionIncrements = preservePositionIncrements; + } + + + @Override + public PostingsFormatProvider postingsFormatProvider() { + return this.completionPostingsFormatProvider; + } + + @Override + public void parse(ParseContext context) throws IOException { + XContentParser parser = context.parser(); + XContentParser.Token token = parser.currentToken(); + + String surfaceForm = null; + BytesRef payload = null; + long weight = -1; + List inputs = Lists.newArrayListWithExpectedSize(4); + + if (token == XContentParser.Token.VALUE_STRING) { + inputs.add(parser.text()); + } else { + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if ("payload".equals(currentFieldName)) { + if (!isStoringPayloads()) { + throw new MapperException("Payloads disabled in mapping"); + } + if (token == XContentParser.Token.START_OBJECT) { + XContentBuilder payloadBuilder = XContentFactory.contentBuilder(parser.contentType()).copyCurrentStructure(parser); + payload = payloadBuilder.bytes().toBytesRef(); + payloadBuilder.close(); + } + } else if (token == XContentParser.Token.VALUE_STRING) { + if ("output".equals(currentFieldName)) { + surfaceForm = parser.text(); + } + if ("input".equals(currentFieldName)) { + inputs.add(parser.text()); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("weight".equals(currentFieldName)) { + weight = parser.longValue(); // always parse a long to make sure we don't get the overflow value + if (weight < 0 || weight > Integer.MAX_VALUE) { + throw new ElasticSearchIllegalArgumentException("Weight must be in the interval [0..2147483647] but was " + weight); + } + } + } else if (token == XContentParser.Token.START_ARRAY) { + if ("input".equals(currentFieldName)) { + while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { + inputs.add(parser.text()); + } + } + } + } + } + payload = payload == null ? 
EMPTY: payload; + if (surfaceForm == null) { // no surface form use the input + for (String input : inputs) { + BytesRef suggestPayload = analyzingSuggestLookupProvider.buildPayload(new BytesRef( + input), weight, payload); + context.doc().add(getCompletionField(input, suggestPayload)); + } + } else { + BytesRef suggestPayload = analyzingSuggestLookupProvider.buildPayload(new BytesRef( + surfaceForm), weight, payload); + for (String input : inputs) { + context.doc().add(getCompletionField(input, suggestPayload)); + } + } + } + + public Field getCompletionField(String input, BytesRef payload) { + return new SuggestField(names().fullName(), input, this.fieldType, payload, analyzingSuggestLookupProvider); + } + + public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException { + return analyzingSuggestLookupProvider.buildPayload( + surfaceForm, weight, payload); + } + + private static final class SuggestField extends Field { + private final BytesRef payload; + private final CompletionTokenStream.ToFiniteStrings toFiniteStrings; + + public SuggestField(String name, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) { + super(name, value, type); + this.payload = payload; + this.toFiniteStrings = toFiniteStrings; + } + + @Override + public TokenStream tokenStream(Analyzer analyzer) throws IOException { + TokenStream ts = super.tokenStream(analyzer); + return new CompletionTokenStream(ts, payload, toFiniteStrings); + } + } + + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + return builder.startObject(name()) + .field(Fields.TYPE, CONTENT_TYPE) + .field(Fields.INDEX_ANALYZER, indexAnalyzer.name()) + .field(Fields.SEARCH_ANALYZER, searchAnalyzer.name()) + .field(Fields.PAYLOADS, this.payloads) + .field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators) + .field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements) + .endObject(); + } + + @Override + protected Field parseCreateField(ParseContext context) throws IOException { + return null; + } + + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + + @Override + public FieldType defaultFieldType() { + return Defaults.FIELD_TYPE; + } + + @Override + public FieldDataType defaultFieldDataType() { + return null; + } + + @Override + public String value(Object value) { + if (value == null) { + return null; + } + return value.toString(); + } + + public boolean isStoringPayloads() { + return payloads; + } +} diff --git a/src/main/java/org/elasticsearch/search/suggest/Suggest.java b/src/main/java/org/elasticsearch/search/suggest/Suggest.java index a244a872697..c93fadbe390 100644 --- a/src/main/java/org/elasticsearch/search/suggest/Suggest.java +++ b/src/main/java/org/elasticsearch/search/suggest/Suggest.java @@ -29,6 +29,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilderString; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; +import org.elasticsearch.search.suggest.completion.CompletionSuggestion; import org.elasticsearch.search.suggest.term.TermSuggestion; import java.io.IOException; @@ -114,6 +115,9 @@ public class Suggest implements Iterable>(); break; @@ -521,6 +525,10 @@ public class Suggest implements Iterable suggester) { diff --git a/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java 
b/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java index bdd95e19a16..641d40fe412 100644 --- a/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java +++ b/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java @@ -159,6 +159,7 @@ public final class SuggestUtils { numTokens++; } consumer.end(); + stream.close(); return numTokens; } diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/AnalyzingCompletionLookupProvider.java b/src/main/java/org/elasticsearch/search/suggest/completion/AnalyzingCompletionLookupProvider.java new file mode 100644 index 00000000000..0ae04bf4cb5 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/AnalyzingCompletionLookupProvider.java @@ -0,0 +1,260 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.suggest.completion; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.codecs.*; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.*; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider; +import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory; + +import java.io.IOException; +import java.util.*; + +public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider { + + // for serialization + public static final int SERIALIZE_PRESERVE_SEPERATORS = 1; + public static final int SERIALIZE_HAS_PAYLOADS = 2; + public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4; + + private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256; + private static final int MAX_GRAPH_EXPANSIONS = -1; + + public static final String CODEC_NAME = "analyzing"; + public static final int CODEC_VERSION = 1; + + private boolean preserveSep; + private boolean preservePositionIncrements; + private int maxSurfaceFormsPerAnalyzedForm; + private int maxGraphExpansions; + private boolean hasPayloads; + private final XAnalyzingSuggester prototype; + + public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) { + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.hasPayloads = hasPayloads; + 
this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM; + this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS; + int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0; + // needs to fixed in the suggester first before it can be supported + //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0; + prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, null, false, 1); + prototype.setPreservePositionIncrements(preservePositionIncrements); + } + + @Override + public String getName() { + return "analyzing"; + } + + @Override + public FieldsConsumer consumer(final IndexOutput output) throws IOException { + CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION); + return new FieldsConsumer() { + private Map fieldOffsets = new HashMap(); + @Override + public void close() throws IOException { + try { /* + * write the offsets per field such that we know where + * we need to load the FSTs from + */ + long pointer = output.getFilePointer(); + output.writeVInt(fieldOffsets.size()); + for (Map.Entry entry : fieldOffsets.entrySet()) { + output.writeString(entry.getKey().name); + output.writeVLong(entry.getValue()); + } + output.writeLong(pointer); + output.flush(); + } finally { + IOUtils.close(output); + } + } + + @Override + public TermsConsumer addField(final FieldInfo field) throws IOException { + + return new TermsConsumer() { + final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads); + final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProvider.this, builder); + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + builder.startTerm(text); + return postingsConsumer; + } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + builder.finishTerm(stats.docFreq); // use doc freq as a fallback + } + + @Override + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + /* + * Here we are done processing the field and we can + * buid the FST and write it to disk. + */ + FST> build = builder.build(); + fieldOffsets.put(field, output.getFilePointer()); + build.save(output); + /* write some more meta-info */ + output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput()); + output.writeVInt(maxSurfaceFormsPerAnalyzedForm); + output.writeInt(maxGraphExpansions); // can be negative + int options = 0; + options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0; + options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; + options |= preservePositionIncrements ? 
SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; + output.writeVInt(options); + } + }; + } + }; + } + + private static final class CompletionPostingsConsumer extends PostingsConsumer { + private final SuggestPayload spare = new SuggestPayload(); + private AnalyzingCompletionLookupProvider analyzingSuggestLookupProvider; + private XAnalyzingSuggester.XBuilder builder; + private int maxAnalyzedPathsForOneInput = 0; + + public CompletionPostingsConsumer(AnalyzingCompletionLookupProvider analyzingSuggestLookupProvider, XAnalyzingSuggester.XBuilder builder) { + this.analyzingSuggestLookupProvider = analyzingSuggestLookupProvider; + this.builder = builder; + } + + @Override + public void startDoc(int docID, int freq) throws IOException { + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + analyzingSuggestLookupProvider.parsePayload(payload, spare); + builder.addSurface(spare.surfaceForm, spare.payload, spare.weight); + // multi fields have the same surface form so we sum up here + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position+1); + } + + @Override + public void finishDoc() throws IOException { + } + + public int getMaxAnalyzedPathsForOneInput() { + return maxAnalyzedPathsForOneInput; + } + }; + + + @Override + public LookupFactory load(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION); + final Map lookupMap = new HashMap(); + input.seek(input.length() - 8); + long metaPointer = input.readLong(); + input.seek(metaPointer); + int numFields = input.readVInt(); + + Map meta = new TreeMap(); + for (int i = 0; i < numFields; i++) { + String name = input.readString(); + long offset = input.readVLong(); + meta.put(offset, name); + } + + for (Map.Entry entry : meta.entrySet()) { + input.seek(entry.getKey()); + FST> fst = new FST>(input, new PairOutputs( + PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + int maxAnalyzedPathsForOneInput = input.readVInt(); + int maxSurfaceFormsPerAnalyzedForm = input.readVInt(); + int maxGraphExpansions = input.readInt(); + int options = input.readVInt(); + boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0; + boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0; + boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0; + lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, + hasPayloads, maxAnalyzedPathsForOneInput, fst)); + } + return new LookupFactory() { + @Override + public Lookup getLookup(FieldMapper mapper, boolean exactFirst) { + AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(mapper.names().fullName()); + if (analyzingSuggestHolder == null) { + return null; + } + int flags = exactFirst ? 
XAnalyzingSuggester.EXACT_FIRST : 0; + if (analyzingSuggestHolder.preserveSep) { + flags |= XAnalyzingSuggester.PRESERVE_SEP; + } + XAnalyzingSuggester suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags, + analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions, + analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads, + analyzingSuggestHolder.maxAnalyzedPathsForOneInput); + suggester.setPreservePositionIncrements(analyzingSuggestHolder.preservePositionIncrements); + return suggester; + } + }; + } + + static class AnalyzingSuggestHolder { + final boolean preserveSep; + final boolean preservePositionIncrements; + final int maxSurfaceFormsPerAnalyzedForm; + final int maxGraphExpansions; + final boolean hasPayloads; + final int maxAnalyzedPathsForOneInput; + final FST<Pair<Long, BytesRef>> fst; + + public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, + int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) { + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + this.maxGraphExpansions = maxGraphExpansions; + this.hasPayloads = hasPayloads; + this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; + this.fst = fst; + } + + } + + @Override + public Set toFiniteStrings(TokenStream stream) throws IOException { + return prototype.toFiniteStrings(prototype.getTokenStreamToAutomaton(), stream); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/Completion090PostingsFormat.java b/src/main/java/org/elasticsearch/search/suggest/completion/Completion090PostingsFormat.java new file mode 100644 index 00000000000..fa79be07f6f --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/Completion090PostingsFormat.java @@ -0,0 +1,332 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.elasticsearch.search.suggest.completion; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMap.Builder; +import org.apache.lucene.codecs.*; +import org.apache.lucene.index.*; +import org.apache.lucene.index.FilterAtomicReader.FilterTerms; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.store.IOContext.Context; +import org.apache.lucene.store.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.ElasticSearchIllegalStateException; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.suggest.completion.CompletionTokenStream.ToFiniteStrings; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; + +/** + * This {@link PostingsFormat} is basically a T-sink: it wraps a default postings + * format that stores the postings on disk and fits the Lucene APIs, while it + * builds a suggest FST as an auxiliary data structure next to the actual + * postings. For simplicity it lets the delegate postings format handle all + * merge operations. The auxiliary suggest FST is only loaded when a + * FieldsProducer is requested for reading; for merging, the low-memory + * delegate postings format is used. + * + */ +public class Completion090PostingsFormat extends PostingsFormat { + + public static final String CODEC_NAME = "completion090"; + public static final int SUGGEST_CODEC_VERSION = 1; + public static final String EXTENSION = "cmp"; + + private PostingsFormat delegatePostingsFormat; + private final static Map providers; + private CompletionLookupProvider writeProvider; + + static { + final CompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, false); + final Builder builder = ImmutableMap.builder(); + providers = builder.put(provider.getName(), provider).build(); + } + + public Completion090PostingsFormat(PostingsFormat delegatePostingsFormat, CompletionLookupProvider provider) { + super(CODEC_NAME); + this.delegatePostingsFormat = delegatePostingsFormat; + this.writeProvider = provider; + assert delegatePostingsFormat != null && writeProvider != null; + } + + /* + * Used only by core Lucene at read-time via Service Provider instantiation; + * do not use at write-time in application code.
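+ * Lucene creates that instance through the SPI registration this commit adds in + * META-INF/services/org.apache.lucene.codecs.PostingsFormat; for illustration, + * PostingsFormat.forName("completion090") returns it at read-time. Such an + * instance has no delegate postings format, which is why fieldsConsumer() + * below rejects it for writing.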
+ */ + public Completion090PostingsFormat() { + super(CODEC_NAME); + } + + @Override + public SuggestFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + if (delegatePostingsFormat == null) { + throw new UnsupportedOperationException("Error - " + getClass().getName() + + " has been constructed without a choice of PostingsFormat"); + } + assert writeProvider != null; + return new SuggestFieldsConsumer(state); + } + + @Override + public CompletionFieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + return new CompletionFieldsProducer(state); + } + + private class SuggestFieldsConsumer extends FieldsConsumer { + + private FieldsConsumer delegatesFieldsConsumer; + private FieldsConsumer suggestFieldsConsumer; + + public SuggestFieldsConsumer(SegmentWriteState state) throws IOException { + this.delegatesFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state); + String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION); + IndexOutput output = null; + boolean success = false; + try { + output = state.directory.createOutput(suggestFSTFile, state.context); + CodecUtil.writeHeader(output, CODEC_NAME, SUGGEST_CODEC_VERSION); + /* + * we write the delegate postings format name so we can load it + * without getting an instance in the ctor + */ + output.writeString(delegatePostingsFormat.getName()); + output.writeString(writeProvider.getName()); + this.suggestFieldsConsumer = writeProvider.consumer(output); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(output); + } + } + } + + @Override + public TermsConsumer addField(final FieldInfo field) throws IOException { + final TermsConsumer delegateConsumer = delegatesFieldsConsumer.addField(field); + final TermsConsumer suggestTermConsumer = suggestFieldsConsumer.addField(field); + final GroupedPostingsConsumer groupedPostingsConsumer = new GroupedPostingsConsumer(delegateConsumer, suggestTermConsumer); + + return new TermsConsumer() { + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + groupedPostingsConsumer.startTerm(text); + return groupedPostingsConsumer; + } + + @Override + public Comparator getComparator() throws IOException { + return delegateConsumer.getComparator(); + } + + @Override + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + suggestTermConsumer.finishTerm(text, stats); + delegateConsumer.finishTerm(text, stats); + } + + @Override + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + suggestTermConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount); + delegateConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount); + } + }; + } + + @Override + public void close() throws IOException { + IOUtils.close(delegatesFieldsConsumer, suggestFieldsConsumer); + } + } + + private class GroupedPostingsConsumer extends PostingsConsumer { + + private TermsConsumer[] termsConsumers; + private PostingsConsumer[] postingsConsumers; + + public GroupedPostingsConsumer(TermsConsumer... 
termsConsumersArgs) { + termsConsumers = termsConsumersArgs; + postingsConsumers = new PostingsConsumer[termsConsumersArgs.length]; + } + + @Override + public void startDoc(int docID, int freq) throws IOException { + for (PostingsConsumer postingsConsumer : postingsConsumers) { + postingsConsumer.startDoc(docID, freq); + } + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + for (PostingsConsumer postingsConsumer : postingsConsumers) { + postingsConsumer.addPosition(position, payload, startOffset, endOffset); + } + } + + @Override + public void finishDoc() throws IOException { + for (PostingsConsumer postingsConsumer : postingsConsumers) { + postingsConsumer.finishDoc(); + } + } + + public void startTerm(BytesRef text) throws IOException { + for (int i = 0; i < termsConsumers.length; i++) { + postingsConsumers[i] = termsConsumers[i].startTerm(text); + } + } + } + + private class CompletionFieldsProducer extends FieldsProducer { + + private FieldsProducer delegateProducer; + private LookupFactory lookupFactory; + + public CompletionFieldsProducer(SegmentReadState state) throws IOException { + String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION); + IndexInput input = state.directory.openInput(suggestFSTFile, state.context); + CodecUtil.checkHeader(input, CODEC_NAME, SUGGEST_CODEC_VERSION, SUGGEST_CODEC_VERSION); + boolean success = false; + try { + PostingsFormat delegatePostingsFormat = PostingsFormat.forName(input.readString()); + String providerName = input.readString(); + CompletionLookupProvider completionLookupProvider = providers.get(providerName); + if (completionLookupProvider == null) { + throw new ElasticSearchIllegalStateException("no provider with name [" + providerName + "] registered"); + } + // TODO: we could clone the ReadState and make it always forward IOContext.MERGE to prevent unnecessary heap usage?
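+ // Beyond the header and the two format names read above, the rest of the .cmp + // file is owned by the provider: load(input) below seeks to the trailing 8 bytes + // to find the per-field offset table and then reads one FST per field.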
+ this.delegateProducer = delegatePostingsFormat.fieldsProducer(state); + /* + * If we are merging we don't load the FSTs at all such that we + * don't consume so much memory during merge + */ + if (state.context.context != Context.MERGE) { + // TODO: maybe we can do this in a fully lazy fashion based on some configuration + // eventually we should have some kind of circuit breaker that prevents us from going OOM here + // with some configuration + this.lookupFactory = completionLookupProvider.load(input); + } + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(delegateProducer, input); + } else { + IOUtils.close(input); + } + } + } + + @Override + public void close() throws IOException { + IOUtils.close(delegateProducer); + } + + @Override + public Iterator iterator() { + return delegateProducer.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + Terms terms = delegateProducer.terms(field); + if (terms == null) { + return terms; + } + return new CompletionTerms(terms, this.lookupFactory); + } + + @Override + public int size() { + return delegateProducer.size(); + } + } + + public static final class CompletionTerms extends FilterTerms { + private final LookupFactory lookup; + + public CompletionTerms(Terms delegate, LookupFactory lookup) { + super(delegate); + this.lookup = lookup; + } + + public Lookup getLookup(FieldMapper mapper, boolean exactFirst) { + return lookup.getLookup(mapper, exactFirst); + } + } + + public static abstract class CompletionLookupProvider implements PayloadProcessor, ToFiniteStrings { + + public static final char UNIT_SEPARATOR = '\u001f'; + + public abstract FieldsConsumer consumer(IndexOutput output) throws IOException; + + public abstract String getName(); + + public abstract LookupFactory load(IndexInput input) throws IOException; + + @Override + public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException { + if (weight < -1 || weight > Integer.MAX_VALUE) { + throw new IllegalArgumentException("weight must be >= -1 && <= Integer.MAX_VALUE"); + } + for (int i = 0; i < surfaceForm.length; i++) { + if (surfaceForm.bytes[i] == UNIT_SEPARATOR) { + throw new IllegalArgumentException( + "surface form cannot contain unit separator character U+001F; this character is reserved"); + } + } + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream); + output.writeVLong(weight + 1); + output.writeVInt(surfaceForm.length); + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + output.writeVInt(payload.length); + output.writeBytes(payload.bytes, 0, payload.length); + + output.close(); + return new BytesRef(byteArrayOutputStream.toByteArray()); + } + + @Override + public void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException { + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length); + InputStreamDataInput input = new InputStreamDataInput(byteArrayInputStream); + ref.weight = input.readVLong() - 1; + int len = input.readVInt(); + ref.surfaceForm.grow(len); + ref.surfaceForm.length = len; + input.readBytes(ref.surfaceForm.bytes, ref.surfaceForm.offset, ref.surfaceForm.length); + len = input.readVInt(); + ref.payload.grow(len); + ref.payload.length = len; + input.readBytes(ref.payload.bytes, ref.payload.offset, ref.payload.length); + input.close(); + } + } + + 
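/* + * Illustrative walk-through of the encoding above (not additional API): + * buildPayload(new BytesRef("Foo"), 9, new BytesRef("id:1")) writes + * VLong(10) | VInt(3) 'F' 'o' 'o' | VInt(4) 'i' 'd' ':' '1' + * because the weight is shifted by +1 so that the sentinel weight -1 is stored + * as unsigned VLong 0; parsePayload() reverses the shift when it fills the + * SuggestPayload. + */ +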
public static abstract class LookupFactory { + public abstract Lookup getLookup(FieldMapper mapper, boolean exactFirst); + } +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatProvider.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatProvider.java new file mode 100644 index 00000000000..ff807345fae --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatProvider.java @@ -0,0 +1,41 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.suggest.completion; + +import org.apache.lucene.codecs.PostingsFormat; +import org.elasticsearch.index.codec.postingsformat.AbstractPostingsFormatProvider; +import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; + +/** + * + */ +public final class CompletionPostingsFormatProvider extends AbstractPostingsFormatProvider { + + private final Completion090PostingsFormat postingsFormat; + + public CompletionPostingsFormatProvider(String name, PostingsFormatProvider delegate, Completion090PostingsFormat.CompletionLookupProvider provider) { + super(name); + this.postingsFormat = new Completion090PostingsFormat(delegate.get(), provider); + } + + @Override + public PostingsFormat get() { + return postingsFormat; + } +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestParser.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestParser.java new file mode 100644 index 00000000000..33c7c7dfe02 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestParser.java @@ -0,0 +1,60 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion; + +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.search.suggest.SuggestContextParser; +import org.elasticsearch.search.suggest.SuggestionSearchContext; + +import java.io.IOException; + +import static org.elasticsearch.search.suggest.SuggestUtils.parseSuggestContext; + +/** + * + */ +public class CompletionSuggestParser implements SuggestContextParser { + + private CompletionSuggester completionSuggester; + + public CompletionSuggestParser(CompletionSuggester completionSuggester) { + this.completionSuggester = completionSuggester; + } + + @Override + public SuggestionSearchContext.SuggestionContext parse(XContentParser parser, MapperService mapperService) throws IOException { + XContentParser.Token token; + String fieldName = null; + CompletionSuggestionContext suggestion = new CompletionSuggestionContext(completionSuggester); + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + fieldName = parser.currentName(); + } else if (token.isValue()) { + parseSuggestContext(parser, mapperService, fieldName, suggestion); + suggestion.mapper(mapperService.smartNameFieldMapper(suggestion.getField())); + } else { + throw new ElasticSearchIllegalArgumentException("suggester[completion] doesn't support field [" + fieldName + "]"); + } + } + return suggestion; + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java new file mode 100644 index 00000000000..d741c1f129f --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -0,0 +1,112 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion; + +import com.google.common.collect.Maps; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.text.StringText; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.search.suggest.Suggest; +import org.elasticsearch.search.suggest.SuggestContextParser; +import org.elasticsearch.search.suggest.Suggester; +import org.elasticsearch.search.suggest.completion.CompletionSuggestion.Entry.Option; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +public class CompletionSuggester implements Suggester<CompletionSuggestionContext> { + + private static final ScoreComparator scoreComparator = new ScoreComparator(); + + @Override + public Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> execute(String name, + CompletionSuggestionContext suggestionContext, IndexReader indexReader, CharsRef spare) throws IOException { + CompletionSuggestion completionSuggestionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize()); + CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry(new StringText(suggestionContext.getText() + .utf8ToString()), 0, suggestionContext.getText().toString().length()); + completionSuggestionSuggestion.addTerm(completionSuggestEntry); + String fieldName = suggestionContext.getField(); + + if (suggestionContext.mapper() == null || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) { + throw new ElasticSearchException("Field [" + suggestionContext.getField() + "] is not a completion suggest field"); + } + String prefix = suggestionContext.getText().utf8ToString(); + + Map results = Maps.newHashMapWithExpectedSize(indexReader.leaves().size() * suggestionContext.getSize()); + for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) { + AtomicReader atomicReader = atomicReaderContext.reader(); + Terms terms = atomicReader.fields().terms(fieldName); + if (terms instanceof Completion090PostingsFormat.CompletionTerms) { + Completion090PostingsFormat.CompletionTerms lookupTerms = (Completion090PostingsFormat.CompletionTerms) terms; + Lookup lookup = lookupTerms.getLookup(suggestionContext.mapper(), false); + List lookupResults = lookup.lookup(prefix, false, suggestionContext.getSize()); + for (Lookup.LookupResult res : lookupResults) { + + final String key = res.key.toString(); + final float score = res.value; + final Option value = results.get(key); + if (value == null) { + final Option option = new CompletionSuggestion.Entry.Option(new StringText(key), score, res.payload == null ? null + : new BytesArray(res.payload)); + results.put(key, option); + } else if (value.getScore() < score) { + value.setScore(score); + value.setPayload(res.payload == null ?
null : new BytesArray(res.payload)); + } + } + } + } + final List options = new ArrayList(results.values()); + CollectionUtil.introSort(options, scoreComparator); + + for (int i = 0 ; i < Math.min(suggestionContext.getSize(), options.size()) ; i++) { + completionSuggestEntry.addOption(options.get(i)); + } + + return completionSuggestionSuggestion; + } + + @Override + public String[] names() { + return new String[] { "completion" }; + } + + @Override + public SuggestContextParser getContextParser() { + return new CompletionSuggestParser(this); + } + + public static class ScoreComparator implements Comparator { + @Override + public int compare(Option o1, Option o2) { + return Float.compare(o2.getScore(), o1.getScore()); + } + } +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java new file mode 100644 index 00000000000..29f518b0244 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java @@ -0,0 +1,117 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion; + +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.suggest.Suggest; + +import java.io.IOException; + +/** + * + */ +public class CompletionSuggestion extends Suggest.Suggestion { + + public static final int TYPE = 2; + + public CompletionSuggestion() { + } + + public CompletionSuggestion(String name, int size) { + super(name, size); + } + + @Override + public int getType() { + return TYPE; + } + + @Override + protected Entry newEntry() { + return new Entry(); + } + + public static class Entry extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry { + + public Entry(Text text, int offset, int length) { + super(text, offset, length); + } + + protected Entry() { + super(); + } + + @Override + protected Option newOption() { + return new Option(); + } + + public static class Option extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option { + private BytesReference payload; + + public Option(Text text, float score,BytesReference payload) { + super(text, score); + this.payload = payload; + } + + + protected Option() { + super(); + } + + public void setPayload(BytesReference payload) { + this.payload = payload; + } + + public BytesReference getPayload() { + return payload; + } + + public void setScore(float score) { + super.setScore(score); + } + + @Override + protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { + super.innerToXContent(builder, params); + if (payload != null && payload.length() > 0) { + builder.rawField("payload", payload); + } + return builder; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + super.readFrom(in); + payload = in.readBytesReference(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeBytesReference(payload); + } + } + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java new file mode 100644 index 00000000000..f44cd3e40a1 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java @@ -0,0 +1,41 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion; + +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.suggest.SuggestBuilder; + +import java.io.IOException; + +/** + * + */ +public class CompletionSuggestionBuilder extends SuggestBuilder.SuggestionBuilder { + + public CompletionSuggestionBuilder(String name) { + super(name, "completion"); + } + + @Override + protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { + return builder; + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java new file mode 100644 index 00000000000..03843e02f49 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java @@ -0,0 +1,43 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.suggest.completion; + +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.suggest.Suggester; +import org.elasticsearch.search.suggest.SuggestionSearchContext; + +/** + * + */ +public class CompletionSuggestionContext extends SuggestionSearchContext.SuggestionContext { + + private FieldMapper mapper; + + public CompletionSuggestionContext(Suggester suggester) { + super(suggester); + } + + public FieldMapper mapper() { + return this.mapper; + } + + public void mapper(FieldMapper mapper) { + this.mapper = mapper; + } +} diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java new file mode 100644 index 00000000000..72f5b48d09e --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java @@ -0,0 +1,145 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.suggest.completion; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Set; + +/** + * + */ +public final class CompletionTokenStream extends TokenStream { + + private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);; + private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); + private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); + + private final TokenStream input; + private BytesRef payload; + private Iterator finiteStrings; + private ToFiniteStrings toFiniteStrings; + private int posInc = -1; + private static final int MAX_PATHS = 256; + private final BytesRef scratch = new BytesRef(); + + public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) throws IOException { + this.input = input; + this.payload = payload; + this.toFiniteStrings = toFiniteStrings; + } + + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (finiteStrings == null) { + Set strings = toFiniteStrings.toFiniteStrings(input); + + if (strings.size() > MAX_PATHS) { + throw new IllegalArgumentException("TokenStream expanded to " + strings.size() + " finite strings. Only <= " + MAX_PATHS + + " finite strings are supported"); + } + posInc = strings.size(); + finiteStrings = strings.iterator(); + } + if (finiteStrings.hasNext()) { + posAttr.setPositionIncrement(posInc); + /* + * this posInc encodes the number of paths that this surface form + * produced. 
Multi Fields have the same surface form and therefore sum up + */ + posInc = 0; + Util.toBytesRef(finiteStrings.next(), scratch); // now we have UTF-8 + bytesAtt.setBytesRef(scratch); + if (payload != null) { + payloadAttr.setPayload(this.payload); + } + return true; + } + + return false; + } + + @Override + public void end() throws IOException { + if (posInc == -1) { + input.end(); + } + } + + @Override + public void close() throws IOException { + if (posInc == -1) { + input.close(); + } + } + + public static interface ToFiniteStrings { + public Set toFiniteStrings(TokenStream stream) throws IOException; + } + + @Override + public void reset() throws IOException { + super.reset(); + finiteStrings = null; + posInc = -1; + } + + public interface ByteTermAttribute extends TermToBytesRefAttribute { + public void setBytesRef(BytesRef bytes); + } + + public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute { + private BytesRef bytes; + + @Override + public int fillBytesRef() { + return bytes.hashCode(); + } + + @Override + public BytesRef getBytesRef() { + return bytes; + } + + @Override + public void setBytesRef(BytesRef bytes) { + this.bytes = bytes; + } + + @Override + public void clear() { + } + + @Override + public void copyTo(AttributeImpl target) { + ByteTermAttributeImpl other = (ByteTermAttributeImpl) target; + other.bytes = bytes; + } + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/PayloadProcessor.java b/src/main/java/org/elasticsearch/search/suggest/completion/PayloadProcessor.java new file mode 100644 index 00000000000..ae4e0d1f8a2 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/completion/PayloadProcessor.java @@ -0,0 +1,35 @@ +package org.elasticsearch.search.suggest.completion; +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +interface PayloadProcessor { + + BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException; + + void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException; + + static class SuggestPayload { + final BytesRef payload = new BytesRef(); + long weight = 0; + final BytesRef surfaceForm = new BytesRef(); + } +} diff --git a/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 68ac25d0065..9b0bcfa3c24 100644 --- a/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -1,2 +1,3 @@ org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat +org.elasticsearch.search.suggest.completion.Completion090PostingsFormat diff --git a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java new file mode 100644 index 00000000000..ed6da7f12a1 --- /dev/null +++ b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java @@ -0,0 +1,251 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.test.integration.search.suggest; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.codecs.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.*; +import org.apache.lucene.index.FieldInfo.DocValuesType; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Lookup.LookupResult; +import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; +import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LineFileDocs; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat; +import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; +import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider; +import org.elasticsearch.index.mapper.FieldMapper.Names; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.search.suggest.SuggestUtils; +import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider; +import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat; +import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory; +import org.elasticsearch.test.integration.ElasticsearchTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; + +import static org.hamcrest.Matchers.equalTo; + +public class CompletionPostingsFormatTest extends ElasticsearchTestCase { + + @Test + public void testCompletionPostingsFormat() throws IOException { + AnalyzingCompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, true); + RAMDirectory dir = new RAMDirectory(); + IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT); + FieldsConsumer consumer = provider.consumer(output); + FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, + DocValuesType.SORTED, DocValuesType.BINARY, new HashMap()); + TermsConsumer addField = consumer.addField(fieldInfo); + + PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator")); + postingsConsumer.startDoc(0, 1); + postingsConsumer.addPosition(256 - 2, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0, + 1); + postingsConsumer.finishDoc(); + addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1)); + addField.startTerm(new BytesRef("generator")); + postingsConsumer.startDoc(0, 1); + postingsConsumer.addPosition(256 - 1, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0, + 1); + postingsConsumer.finishDoc(); + addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1)); + addField.finish(1, 1, 1); + consumer.close(); + output.close(); + + IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT); + 
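// provider.load(...) locates the per-field offset table via the pointer stored in + // the trailing 8 bytes written by the consumer above, then loads one FST per field. +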
LookupFactory load = provider.load(input); + PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat()); + NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT)); + Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), false); + List result = lookup.lookup("ge", false, 10); + assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters")); + assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10")); + dir.close(); + } + + @Test + public void testDuellCompletions() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException, + IllegalAccessException { + final boolean preserveSeparators = getRandom().nextBoolean(); + final boolean preservePositionIncrements = getRandom().nextBoolean(); + final boolean usePayloads = getRandom().nextBoolean(); + final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0; + + XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), new StandardAnalyzer( + TEST_VERSION_CURRENT), options, 256, -1, null, false, 1); + reference.setPreservePositionIncrements(preservePositionIncrements); + LineFileDocs docs = new LineFileDocs(getRandom()); + int num = atLeast(150); + final String[] titles = new String[num]; + final long[] weights = new long[num]; + for (int i = 0; i < titles.length; i++) { + Document nextDoc = docs.nextDoc(); + IndexableField field = nextDoc.getField("title"); + titles[i] = field.stringValue(); + weights[i] = between(0, 100); + + } + docs.close(); + final TermFreqIterator primaryIter = new TermFreqIterator() { + int index = 0; + long currentWeight = -1; + + @Override + public Comparator getComparator() { + return null; + } + + @Override + public BytesRef next() throws IOException { + if (index < titles.length) { + currentWeight = weights[index]; + return new BytesRef(titles[index++]); + } + return null; + } + + @Override + public long weight() { + return currentWeight; + } + + }; + TermFreqIterator iter; + if (usePayloads) { + iter = new TermFreqPayloadIterator() { + @Override + public long weight() { + return primaryIter.weight(); + } + + @Override + public Comparator getComparator() { + return primaryIter.getComparator(); + } + + @Override + public BytesRef next() throws IOException { + return primaryIter.next(); + } + + @Override + public BytesRef payload() { + return new BytesRef(Long.toString(weight())); + } + }; + } else { + iter = primaryIter; + } + reference.build(iter); + PostingsFormatProvider provider = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat()); + + NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT)); + final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer, provider, null, usePayloads, + preserveSeparators, preservePositionIncrements); + Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights); + Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput"); + field.setAccessible(true); + Field refField = reference.getClass().getDeclaredField("maxAnalyzedPathsForOneInput"); + refField.setAccessible(true); + assertThat(refField.get(reference), equalTo(field.get(buildAnalyzingLookup))); + + for (int i = 0; i < titles.length; i++) { + int res = between(1, 10); + final StringBuilder builder 
= new StringBuilder(); + SuggestUtils.analyze(namedAnalzyer.tokenStream("foo", titles[i]), new SuggestUtils.TokenConsumer() { + @Override + public void nextToken() throws IOException { + if (builder.length() == 0) { + builder.append(this.charTermAttr.toString()); + } + } + }); + String firstTerm = builder.toString(); + String prefix = firstTerm.isEmpty() ? "" : firstTerm.substring(0, between(1, firstTerm.length())); + List refLookup = reference.lookup(prefix, false, res); + List lookup = buildAnalyzingLookup.lookup(prefix, false, res); + assertThat(refLookup.toString(),lookup.size(), equalTo(refLookup.size())); + for (int j = 0; j < refLookup.size(); j++) { + assertThat(lookup.get(j).key, equalTo(refLookup.get(j).key)); + assertThat("prefix: " + prefix + " " + j + " -- missmatch cost: " + lookup.get(j).key + " - " + lookup.get(j).value + " | " + refLookup.get(j).key + " - " + refLookup.get(j).value , + lookup.get(j).value, equalTo(refLookup.get(j).value)); + assertThat(lookup.get(j).payload, equalTo(refLookup.get(j).payload)); + if (usePayloads) { + assertThat(lookup.get(j).payload.utf8ToString(), equalTo(Long.toString(lookup.get(j).value))); + } + } + } + } + + public Lookup buildAnalyzingLookup(final CompletionFieldMapper mapper, String[] terms, String[] surfaces, long[] weights) + throws IOException { + RAMDirectory dir = new RAMDirectory(); + FilterCodec filterCodec = new FilterCodec("filtered", Codec.getDefault()) { + public PostingsFormat postingsFormat() { + return mapper.postingsFormatProvider().get(); + } + }; + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(TEST_VERSION_CURRENT, mapper.indexAnalyzer()); + + indexWriterConfig.setCodec(filterCodec); + IndexWriter writer = new IndexWriter(dir, indexWriterConfig); + for (int i = 0; i < weights.length; i++) { + Document doc = new Document(); + BytesRef payload = mapper.buildPayload(new BytesRef(surfaces[i]), weights[i], new BytesRef(Long.toString(weights[i]))); + doc.add(mapper.getCompletionField(terms[i], payload)); + if (randomBoolean()) { + writer.commit(); + } + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + writer.commit(); + DirectoryReader reader = DirectoryReader.open(writer, true); + assertThat(reader.leaves().size(), equalTo(1)); + assertThat(reader.leaves().get(0).reader().numDocs(), equalTo(weights.length)); + AtomicReaderContext atomicReaderContext = reader.leaves().get(0); + Terms luceneTerms = atomicReaderContext.reader().terms(mapper.name()); + Lookup lookup = ((Completion090PostingsFormat.CompletionTerms) luceneTerms).getLookup(mapper, false); + reader.close(); + writer.close(); + dir.close(); + return lookup; + } + + // TODO ADD more unittests +} diff --git a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java new file mode 100644 index 00000000000..61a5bcdf89f --- /dev/null +++ b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java @@ -0,0 +1,454 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.test.integration.search.suggest; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import com.google.common.collect.Lists; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus; +import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.suggest.SuggestResponse; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.index.mapper.MapperException; +import org.elasticsearch.search.suggest.Suggest; +import org.elasticsearch.search.suggest.completion.CompletionSuggestion; +import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder; +import org.elasticsearch.test.integration.AbstractSharedClusterTest; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ExecutionException; + +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.*; + +public class CompletionSuggestSearchTests extends AbstractSharedClusterTest { + + private static final String INDEX = "test"; + private static final String TYPE = "testType"; + private static final String FIELD = "testField"; + + @Test + public void testSimple() throws Exception{ + createIndexAndMapping(); + String[][] input = {{"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, + {"Generator", "Foo Fighters Generator"}, {"Learn to Fly", "Foo Fighters Learn to Fly" }, + {"The Prodigy"}, {"The Prodigy"}, {"The Prodigy"}, {"Firestarter", "The Prodigy Firestarter"}, + {"Turbonegro"}, {"Turbonegro"}, {"Get it on", "Turbonegro Get it on"}}; // work with frequencies + for (int i = 0; i < input.length; i++) { + client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value(input[i]).endArray() + .endObject() + .endObject() + ) + .execute().actionGet(); + } + + refresh(); + + assertSuggestionsNotInOrder("f", "Foo Fighters", "Firestarter", "Foo Fighters Generator", "Foo Fighters Learn to Fly"); + assertSuggestionsNotInOrder("t", "The Prodigy", "Turbonegro", "Turbonegro Get it on", "The Prodigy Firestarter"); + } + + @Test + public void testBasicPrefixSuggestion() throws Exception { + createIndexAndMapping(); + for (int i = 0; i < 2; i++) { + createData(i==0); + assertSuggestions("f", "Firestarter - The Prodigy", "Foo Fighters", "Generator - Foo Fighters", "Learn to Fly - Foo Fighters"); + assertSuggestions("ge", "Generator - Foo Fighters", "Get it on - Turbonegro"); + assertSuggestions("ge", "Generator - Foo Fighters", "Get it on - 
Turbonegro"); + assertSuggestions("t", "The Prodigy", "Firestarter - The Prodigy", "Get it on - Turbonegro", "Turbonegro"); + } + } + + @Test + public void testThatWeightsAreWorking() throws Exception { + createIndexAndMapping(); + + List similarNames = Lists.newArrayList("the", "The Prodigy", "The Verve", "The the"); + // the weight is 1000 divided by string length, so the results are easy to to check + for (String similarName : similarNames) { + client().prepareIndex(INDEX, TYPE, similarName).setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value(similarName).endArray() + .field("weight", 1000 / similarName.length()) + .endObject().endObject() + ).get(); + } + + refresh(); + + assertSuggestions("the", "the", "The the", "The Verve", "The Prodigy"); + } + + @Test + public void testThatInputCanBeAStringInsteadOfAnArray() throws Exception { + createIndexAndMapping(); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .field("input", "Foo Fighters") + .field("output", "Boo Fighters") + .endObject().endObject() + ).get(); + + refresh(); + + assertSuggestions("f", "Boo Fighters"); + } + + @Test + public void testThatPayloadsAreArbitraryJsonObjects() throws Exception { + createIndexAndMapping(); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foo Fighters").endArray() + .field("output", "Boo Fighters") + .startObject("payload").field("foo", "bar").startArray("test").value("spam").value("eggs").endArray().endObject() + .endObject().endObject() + ).get(); + + refresh(); + + SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion( + new CompletionSuggestionBuilder("testSuggestions").field(FIELD).text("foo").size(10) + ).execute().actionGet(); + + assertSuggestions(suggestResponse, "testSuggestions", "Boo Fighters"); + Suggest.Suggestion.Entry.Option option = suggestResponse.getSuggest().getSuggestion("testSuggestions").getEntries().get(0).getOptions().get(0); + assertThat(option, is(instanceOf(CompletionSuggestion.Entry.Option.class))); + CompletionSuggestion.Entry.Option prefixOption = (CompletionSuggestion.Entry.Option) option; + assertThat(prefixOption.getPayload(), is(notNullValue())); + + // parse JSON + Map jsonMap = JsonXContent.jsonXContent.createParser(prefixOption.getPayload()).mapAndClose(); + assertThat(jsonMap.size(), is(2)); + assertThat(jsonMap.get("foo").toString(), is("bar")); + assertThat(jsonMap.get("test"), is(instanceOf(List.class))); + List listValues = (List) jsonMap.get("test"); + assertThat(listValues, hasItems("spam", "eggs")); + } + + @Test(expected = MapperException.class) + public void testThatExceptionIsThrownWhenPayloadsAreDisabledButInIndexRequest() throws Exception { + createIndexAndMapping("simple", "simple", false, false, true); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foo Fighters").endArray() + .field("output", "Boo Fighters") + .startArray("payload").value("spam").value("eggs").endArray() + .endObject().endObject() + ).get(); + } + + @Test + public void testDisabledPreserveSeperators() throws Exception { + createIndexAndMapping("simple", "simple", true, false, true); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foo Fighters").endArray() + .field("weight", 10) + .endObject().endObject() + 
).get(); + + client().prepareIndex(INDEX, TYPE, "2").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foof").endArray() + .field("weight", 20) + .endObject().endObject() + ).get(); + + refresh(); + + assertSuggestions("foof", "Foof", "Foo Fighters"); + } + + @Test + public void testEnabledPreserveSeperators() throws Exception { + createIndexAndMapping("simple", "simple", true, true, true); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foo Fighters").endArray() + .endObject().endObject() + ).get(); + + client().prepareIndex(INDEX, TYPE, "2").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foof").endArray() + .endObject().endObject() + ).get(); + + refresh(); + + assertSuggestions("foof", "Foof"); + } + + @Test + public void testThatMultipleInputsAreSuppored() throws Exception { + createIndexAndMapping("simple", "simple", false, false, true); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value("Foo Fighters").value("Fu Fighters").endArray() + .field("output", "The incredible Foo Fighters") + .endObject().endObject() + ).get(); + + refresh(); + + assertSuggestions("foo", "The incredible Foo Fighters"); + assertSuggestions("fu", "The incredible Foo Fighters"); + } + + @Test + public void testThatShortSyntaxIsWorking() throws Exception { + createIndexAndMapping(); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startArray(FIELD) + .value("The Prodigy Firestarter").value("Firestarter") + .endArray().endObject() + ).get(); + + refresh(); + + assertSuggestions("t", "The Prodigy Firestarter"); + assertSuggestions("f", "Firestarter"); + } + + @Test + public void testThatDisablingPositionIncrementsWorkForStopwords() throws Exception { + // analyzer which removes stopwords... 
+    @Test
+    public void testThatDisablingPositionIncrementsWorkForStopwords() throws Exception {
+        // use an analyzer that removes stopwords, so the simple analyzer will not do here
+        createIndexAndMapping("standard", "standard", false, false, false);
+
+        client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
+                .startObject().startObject(FIELD)
+                .startArray("input").value("The Beatles").endArray()
+                .endObject().endObject()
+        ).get();
+
+        refresh();
+
+        assertSuggestions("b", "The Beatles");
+    }
+
+    @Test
+    public void testThatSynonymsWork() throws Exception {
+        Settings.Builder settingsBuilder = settingsBuilder()
+                .put("analysis.analyzer.suggest_analyzer_synonyms.type", "custom")
+                .put("analysis.analyzer.suggest_analyzer_synonyms.tokenizer", "standard")
+                .putArray("analysis.analyzer.suggest_analyzer_synonyms.filter", "standard", "lowercase", "my_synonyms")
+                .put("analysis.filter.my_synonyms.type", "synonym")
+                .putArray("analysis.filter.my_synonyms.synonyms", "foo,renamed");
+        createIndexAndMappingAndSettings(settingsBuilder, "suggest_analyzer_synonyms", "suggest_analyzer_synonyms", false, false, true);
+
+        client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
+                .startObject().startObject(FIELD)
+                .startArray("input").value("Foo Fighters").endArray()
+                .endObject().endObject()
+        ).get();
+
+        refresh();
+
+        // get suggestions via the synonym "renamed"
+        assertSuggestions("r", "Foo Fighters");
+    }
+
+    @Test
+    public void testThatUpgradeToMultiFieldWorks() throws Exception {
+        client().admin().indices().prepareDelete().get();
+        int randomShardNumber = between(1, 5);
+        int randomReplicaNumber = between(0, 2);
+        Settings.Builder settingsBuilder = settingsBuilder().put(SETTING_NUMBER_OF_SHARDS, randomShardNumber).put(SETTING_NUMBER_OF_REPLICAS, randomReplicaNumber);
+
+        client().admin().indices().prepareCreate(INDEX).setSettings(settingsBuilder).get();
+        ensureYellow();
+        client().prepareIndex(INDEX, TYPE, "1").setRefresh(true).setSource(jsonBuilder().startObject().field(FIELD, "Foo Fighters").endObject()).get();
+
+        PutMappingResponse putMappingResponse = client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
+                .startObject(TYPE).startObject("properties")
+                .startObject(FIELD)
+                .field("type", "multi_field")
+                .startObject("fields")
+                .startObject(FIELD).field("type", "string").endObject()
+                .startObject("suggest").field("type", "completion").field("index_analyzer", "simple").field("search_analyzer", "simple").endObject()
+                .endObject()
+                .endObject()
+                .endObject().endObject()
+                .endObject())
+                .get();
+        assertThat(putMappingResponse.isAcknowledged(), is(true));
+
+        SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion(
+                new CompletionSuggestionBuilder("suggs").field(FIELD + ".suggest").text("f").size(10)
+        ).execute().actionGet();
+        assertSuggestions(suggestResponse, "suggs");
+
+        client().prepareIndex(INDEX, TYPE, "1").setRefresh(true).setSource(jsonBuilder().startObject().field(FIELD, "Foo Fighters").endObject()).get();
+        waitForRelocation(ClusterHealthStatus.GREEN);
+
+        SuggestResponse afterReindexingResponse = client().prepareSuggest(INDEX).addSuggestion(
+                new CompletionSuggestionBuilder("suggs").field(FIELD + ".suggest").text("f").size(10)
+        ).execute().actionGet();
+        assertSuggestions(afterReindexingResponse, "suggs", "Foo Fighters");
+    }
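Because analysis happens at index time, the synonym expansion above is baked into the FST rather than applied per query: a prefix of either the original token or its synonym reaches the stored surface form. A sketch of the lookup side under the same "foo,renamed" synonym filter (the suggestion name is arbitrary):

    SuggestResponse response = client().prepareSuggest(INDEX).addSuggestion(
            new CompletionSuggestionBuilder("synonym-suggest").field(FIELD).text("renamed").size(10)
    ).execute().actionGet();
    // returns "Foo Fighters", exactly as the single-letter prefix "r" does in the test above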
+    public void assertSuggestions(String suggestion, String... suggestions) {
+        String suggestionName = RandomStrings.randomAsciiOfLength(new Random(), 10);
+        SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion(
+                new CompletionSuggestionBuilder(suggestionName).field(FIELD).text(suggestion).size(10)
+        ).execute().actionGet();
+
+        assertSuggestions(suggestResponse, suggestionName, suggestions);
+    }
+
+    public void assertSuggestionsNotInOrder(String suggestString, String... suggestions) {
+        String suggestionName = RandomStrings.randomAsciiOfLength(new Random(), 10);
+        SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion(
+                new CompletionSuggestionBuilder(suggestionName).field(FIELD).text(suggestString).size(10)
+        ).execute().actionGet();
+
+        assertSuggestions(suggestResponse, false, suggestionName, suggestions);
+    }
+
+    private void assertSuggestions(SuggestResponse suggestResponse, String name, String... suggestions) {
+        assertSuggestions(suggestResponse, true, name, suggestions);
+    }
+
+    private void assertSuggestions(SuggestResponse suggestResponse, boolean suggestionOrderStrict, String name, String... suggestions) {
+        assertThat(suggestResponse.getFailedShards(), is(0));
+
+        assertThat(suggestResponse.getSuggest().getSuggestion(name), is(notNullValue()));
+        Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> suggestion = suggestResponse.getSuggest().getSuggestion(name);
+
+        List<String> suggestionList = getNames(suggestion.getEntries().get(0));
+        List<? extends Suggest.Suggestion.Entry.Option> options = suggestion.getEntries().get(0).getOptions();
+
+        String assertMsg = String.format(Locale.ROOT, "Expected options %s length to be %s, but was %s", suggestionList, suggestions.length, options.size());
+        assertThat(assertMsg, options.size(), is(suggestions.length));
+        if (suggestionOrderStrict) {
+            for (int i = 0; i < suggestions.length; i++) {
+                String errMsg = String.format(Locale.ROOT, "Expected elem %s in list %s to be [%s] score: %s", i, suggestionList, suggestions[i], options.get(i).getScore());
+                assertThat(errMsg, options.get(i).getText().toString(), is(suggestions[i]));
+            }
+        } else {
+            for (String expectedSuggestion : suggestions) {
+                String errMsg = String.format(Locale.ROOT, "Expected elem %s to be in list %s", expectedSuggestion, suggestionList);
+                assertThat(errMsg, suggestionList, hasItem(expectedSuggestion));
+            }
+        }
+    }
+
+    private List<String> getNames(Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option> suggestEntry) {
+        List<String> names = Lists.newArrayList();
+        for (Suggest.Suggestion.Entry.Option entry : suggestEntry.getOptions()) {
+            names.add(entry.getText().string());
+        }
+        return names;
+    }
+
+    private void createIndexAndMapping() throws IOException {
+        createIndexAndMapping("simple", "simple", true, false, true);
+    }
+
+    private void createIndexAndMappingAndSettings(Settings.Builder settingsBuilder, String indexAnalyzer, String searchAnalyzer, boolean payloads, boolean preserveSeparators, boolean preservePositionIncrements) throws IOException {
+        client().admin().indices().prepareDelete().get();
+        client().admin().indices().prepareCreate(INDEX)
+                .setSettings(settingsBuilder)
+                .get();
+        PutMappingResponse putMappingResponse = client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
+                .startObject(TYPE).startObject("properties")
+                .startObject(FIELD)
+                .field("type", "completion")
+                .field("index_analyzer", indexAnalyzer)
+                .field("search_analyzer", searchAnalyzer)
+                .field("payloads", payloads)
+                .field("preserve_separators", preserveSeparators)
+                .field("preserve_position_increments", preservePositionIncrements)
+                .endObject()
+                .endObject().endObject()
+                .endObject())
+                .get();
+        assertThat(putMappingResponse.isAcknowledged(), is(true));
+        ensureYellow();
+    }
+    private void createIndexAndMapping(String indexAnalyzer, String searchAnalyzer, boolean payloads, boolean preserveSeparators, boolean preservePositionIncrements) throws IOException {
+        int randomShardNumber = between(1, 5);
+        int randomReplicaNumber = between(0, 2);
+        Settings.Builder settingsBuilder = settingsBuilder().put(SETTING_NUMBER_OF_SHARDS, randomShardNumber).put(SETTING_NUMBER_OF_REPLICAS, randomReplicaNumber);
+        createIndexAndMappingAndSettings(settingsBuilder, indexAnalyzer, searchAnalyzer, payloads, preserveSeparators, preservePositionIncrements);
+    }
+
+    private void createData(boolean optimize) throws IOException, InterruptedException, ExecutionException {
+        String[][] input = {{"Foo Fighters"}, {"Generator", "Foo Fighters Generator"}, {"Learn to Fly", "Foo Fighters Learn to Fly"}, {"The Prodigy"}, {"Firestarter", "The Prodigy Firestarter"}, {"Turbonegro"}, {"Get it on", "Turbonegro Get it on"}};
+        String[] surface = {"Foo Fighters", "Generator - Foo Fighters", "Learn to Fly - Foo Fighters", "The Prodigy", "Firestarter - The Prodigy", "Turbonegro", "Get it on - Turbonegro"};
+        int[] weight = {10, 9, 8, 12, 11, 6, 7};
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[input.length];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex(INDEX, TYPE, "" + i)
+                    .setSource(jsonBuilder()
+                            .startObject().startObject(FIELD)
+                            .startArray("input").value(input[i]).endArray()
+                            .field("output", surface[i])
+                            .field("payload", "id: " + i)
+                            .field("weight", 1) // we forcefully index a bogus weight here
+                            .endObject()
+                            .endObject()
+                    );
+        }
+        indexRandom(INDEX, false, builders);
+
+        for (int i = 0; i < builders.length; i++) { // add them again to make sure we deduplicate on the surface form
+            builders[i] = client().prepareIndex(INDEX, TYPE, "n" + i)
+                    .setSource(jsonBuilder()
+                            .startObject().startObject(FIELD)
+                            .startArray("input").value(input[i]).endArray()
+                            .field("output", surface[i])
+                            .field("payload", "id: " + i)
+                            .field("weight", weight[i])
+                            .endObject()
+                            .endObject()
+                    );
+        }
+        indexRandom(INDEX, false, builders);
+
+        client().admin().indices().prepareRefresh(INDEX).execute().actionGet();
+        if (optimize) {
+            // make sure merging works just fine
+            client().admin().indices().prepareFlush(INDEX).execute().actionGet();
+            client().admin().indices().prepareOptimize(INDEX).execute().actionGet();
+        }
+    }
+}
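createData indexes every entry twice, first with a bogus weight of 1 and then under a second id with the real weight, because duplicate surface forms are collapsed at lookup time and, apparently, only the highest-weighted entry survives; the basic-prefix assertions above only hold if that is so. A minimal sketch of that behavior in isolation, assuming the same test harness:

    client().prepareIndex(INDEX, TYPE, "dup-1").setSource(jsonBuilder().startObject()
            .startObject(FIELD).startArray("input").value("Generator").endArray()
            .field("weight", 1).endObject().endObject()).get();
    client().prepareIndex(INDEX, TYPE, "dup-2").setSource(jsonBuilder().startObject()
            .startObject(FIELD).startArray("input").value("Generator").endArray()
            .field("weight", 9).endObject().endObject()).get();
    refresh();
    // only one "Generator" option comes back, ranked as if its weight were 9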
diff --git a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionTokenStreamTest.java b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionTokenStreamTest.java
new file mode 100644
index 00000000000..cf7349674a6
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionTokenStreamTest.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.integration.search.suggest;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.SynonymMap.Builder;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+import org.elasticsearch.search.suggest.completion.CompletionTokenStream;
+import org.elasticsearch.search.suggest.completion.CompletionTokenStream.ByteTermAttribute;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Set;
+
+public class CompletionTokenStreamTest extends BaseTokenStreamTestCase {
+
+    final XAnalyzingSuggester suggester = new XAnalyzingSuggester(new SimpleAnalyzer(TEST_VERSION_CURRENT));
+
+    @Test
+    public void testSuggestTokenFilter() throws Exception {
+        TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
+        BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
+        TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
+            @Override
+            public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
+                return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
+            }
+        }));
+        assertTokenStreamContents(suggestTokenStream, new String[] {"mykeyword"}, null, null, new String[] {"Surface keyword|friggin payload|10"}, new int[] { 1 }, null, null);
+    }
+
+    @Test
+    public void testSuggestTokenFilterWithSynonym() throws Exception {
+        Builder builder = new SynonymMap.Builder(true);
+        builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
+
+        MockTokenizer tokenizer = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
+        SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
+
+        BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
+        TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(new CompletionTokenStream(filter, payload, new CompletionTokenStream.ToFiniteStrings() {
+            @Override
+            public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
+                return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
+            }
+        }));
+        assertTokenStreamContents(suggestTokenStream, new String[] {"mysynonym", "mykeyword"}, null, null, new String[] {"Surface keyword|friggin payload|10", "Surface keyword|friggin payload|10"}, new int[] { 2, 0 }, null, null);
+    }
String[] {"Surface keyword|friggin payload|10", "Surface keyword|friggin payload|10"}, new int[] { 2, 0 }, null, null); + } + + @Test + public void testValidNumberOfExpansions() throws IOException { + Builder builder = new SynonymMap.Builder(true); + for (int i = 0; i < 256; i++) { + builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true); + } + StringBuilder valueBuilder = new StringBuilder(); + for (int i = 0 ; i < 8 ; i++) { + valueBuilder.append(i+1); + valueBuilder.append(" "); + } + MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()), MockTokenizer.WHITESPACE, true); + SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); + + TokenStream suggestTokenStream = new CompletionTokenStream(filter, new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() { + @Override + public Set toFiniteStrings(TokenStream stream) throws IOException { + Set finiteStrings = suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream); + return finiteStrings; + } + }); + + suggestTokenStream.reset(); + ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class); + PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class); + int maxPos = 0; + int count = 0; + while(suggestTokenStream.incrementToken()) { + count++; + assertNotNull(attr.getBytesRef()); + assertTrue(attr.getBytesRef().length > 0); + maxPos += posAttr.getPositionIncrement(); + } + suggestTokenStream.close(); + assertEquals(count, 256); + assertEquals(count, maxPos); + + } + + @Test(expected = IllegalArgumentException.class) + public void testInValidNumberOfExpansions() throws IOException { + Builder builder = new SynonymMap.Builder(true); + for (int i = 0; i < 256; i++) { + builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true); + } + StringBuilder valueBuilder = new StringBuilder(); + for (int i = 0 ; i < 9 ; i++) { // 9 -> expands to 512 + valueBuilder.append(i+1); + valueBuilder.append(" "); + } + MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()), MockTokenizer.WHITESPACE, true); + SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); + + TokenStream suggestTokenStream = new CompletionTokenStream(filter, new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() { + @Override + public Set toFiniteStrings(TokenStream stream) throws IOException { + Set finiteStrings = suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream); + return finiteStrings; + } + }); + + suggestTokenStream.reset(); + suggestTokenStream.incrementToken(); + suggestTokenStream.close(); + + } + + public final static class ByteTermAttrToCharTermAttrFilter extends TokenFilter { + private CharTermAttribute attr = addAttribute(CharTermAttribute.class); + private ByteTermAttribute byteAttr = addAttribute(ByteTermAttribute.class); + private PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private TypeAttribute type = addAttribute(TypeAttribute.class); + protected ByteTermAttrToCharTermAttrFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + BytesRef bytesRef = byteAttr.getBytesRef(); + attr.append(bytesRef.utf8ToString()); + // we move them over so we can assert them more easily in the tests + 
+
+    public final static class ByteTermAttrToCharTermAttrFilter extends TokenFilter {
+        private CharTermAttribute attr = addAttribute(CharTermAttribute.class);
+        private ByteTermAttribute byteAttr = addAttribute(ByteTermAttribute.class);
+        private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
+        private TypeAttribute type = addAttribute(TypeAttribute.class);
+
+        protected ByteTermAttrToCharTermAttrFilter(TokenStream input) {
+            super(input);
+        }
+
+        @Override
+        public boolean incrementToken() throws IOException {
+            if (input.incrementToken()) {
+                BytesRef bytesRef = byteAttr.getBytesRef();
+                attr.append(bytesRef.utf8ToString());
+                // we move the payload over to the type attribute so we can assert it more easily in the tests
+                type.setType(payload.getPayload().utf8ToString());
+                return true;
+            }
+            return false;
+        }
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java b/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java
new file mode 100644
index 00000000000..b6ce6b95685
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.unit.index.mapper.completion;
+
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.json.JsonXContent;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
+import org.elasticsearch.test.unit.index.mapper.MapperTestUtils;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
+
+public class CompletionFieldMapperTests {
+
+    @Test
+    public void testDefaultConfiguration() throws IOException {
+        String mapping = jsonBuilder().startObject().startObject("type1")
+                .startObject("properties").startObject("completion")
+                .field("type", "completion")
+                .endObject().endObject()
+                .endObject().endObject().string();
+
+        DocumentMapper defaultMapper = MapperTestUtils.newParser().parse(mapping);
+
+        FieldMapper fieldMapper = defaultMapper.mappers().name("completion").mapper();
+        assertThat(fieldMapper, instanceOf(CompletionFieldMapper.class));
+
+        CompletionFieldMapper completionFieldMapper = (CompletionFieldMapper) fieldMapper;
+        assertThat(completionFieldMapper.isStoringPayloads(), is(false));
+    }
+
+    @Test
+    public void testThatSerializationIncludesAllElements() throws Exception {
+        String mapping = jsonBuilder().startObject().startObject("type1")
+                .startObject("properties").startObject("completion")
+                .field("type", "completion")
+                .field("index_analyzer", "simple")
+                .field("search_analyzer", "standard")
+                .field("payloads", true)
+                .field("preserve_separators", false)
+                .field("preserve_position_increments", true)
+                .endObject().endObject()
+                .endObject().endObject().string();
+
+        DocumentMapper defaultMapper = MapperTestUtils.newParser().parse(mapping);
+
+        FieldMapper fieldMapper = defaultMapper.mappers().name("completion").mapper();
+        assertThat(fieldMapper, instanceOf(CompletionFieldMapper.class));
+
+        CompletionFieldMapper completionFieldMapper = (CompletionFieldMapper) fieldMapper;
+        XContentBuilder builder = jsonBuilder().startObject();
+        completionFieldMapper.toXContent(builder, null).endObject();
+        builder.close();
+        Map<String, Object> serializedMap = JsonXContent.jsonXContent.createParser(builder.bytes()).mapAndClose();
+        Map<String, Object> configMap = (Map<String, Object>) serializedMap.get("completion");
+        assertThat(configMap.get("index_analyzer").toString(), is("simple"));
+        assertThat(configMap.get("search_analyzer").toString(), is("standard"));
+        assertThat(Boolean.valueOf(configMap.get("payloads").toString()), is(true));
+        assertThat(Boolean.valueOf(configMap.get("preserve_separators").toString()), is(false));
+        assertThat(Boolean.valueOf(configMap.get("preserve_position_increments").toString()), is(true));
+    }
+
+}
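Taken together, these tests sketch the whole lifecycle of the new field type. A condensed end-to-end example using only the client calls exercised above; all index, type and field names here are illustrative, not part of the patch:

    // 1. create the index and map a completion field
    client().admin().indices().prepareCreate("songs").get();
    client().admin().indices().preparePutMapping("songs").setType("song").setSource(jsonBuilder().startObject()
            .startObject("song").startObject("properties")
            .startObject("suggest").field("type", "completion").field("payloads", true).endObject()
            .endObject().endObject()
            .endObject()).get();

    // 2. index a document with input variants, a display output, a payload and a weight
    client().prepareIndex("songs", "song", "1").setRefresh(true).setSource(jsonBuilder().startObject()
            .startObject("suggest")
            .startArray("input").value("Nirvana").value("Nevermind").endArray()
            .field("output", "Nirvana - Nevermind")
            .startObject("payload").field("id", 1).endObject()
            .field("weight", 10)
            .endObject().endObject()).get();

    // 3. ask for near realtime prefix suggestions
    SuggestResponse response = client().prepareSuggest("songs").addSuggestion(
            new CompletionSuggestionBuilder("song-suggest").field("suggest").text("n").size(10)
    ).execute().actionGet(); // -> "Nirvana - Nevermind"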