mirror of https://github.com/apache/lucene.git
LUCENE-5337 Add Payload support to FileDictionary (Suggest) and make it more configurable
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1541341 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3835bff32d
commit
99255ff0da
|
@ -155,6 +155,9 @@ New Features
|
||||||
|
|
||||||
* LUCENE-5297: Allow to range-facet on any ValueSource, not just
|
* LUCENE-5297: Allow to range-facet on any ValueSource, not just
|
||||||
NumericDocValues fields. (Shai Erera)
|
NumericDocValues fields. (Shai Erera)
|
||||||
|
|
||||||
|
* LUCENE-5337: Add Payload support to FileDictionary (Suggest) and make it more
|
||||||
|
configurable (Areek Zilluer via Erick Erickson)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
|
|
|
@ -28,46 +28,126 @@ import org.apache.lucene.util.IOUtils;
|
||||||
/**
|
/**
|
||||||
* Dictionary represented by a text file.
|
* Dictionary represented by a text file.
|
||||||
*
|
*
|
||||||
* <p/>Format allowed: 1 string per line, optionally with a tab-separated integer value:<br/>
|
* <p/>Format allowed: 1 entry per line:<br/>
|
||||||
* word1 TAB 100<br/>
|
* An entry can be: <br/>
|
||||||
* word2 word3 TAB 101<br/>
|
* <ul>
|
||||||
* word4 word5 TAB 102<br/>
|
* <li>suggestion</li>
|
||||||
|
* <li>suggestion <code>fieldDelimiter</code> weight</li>
|
||||||
|
* <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
|
||||||
|
* </ul>
|
||||||
|
* where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br/>
|
||||||
|
* <p/>
|
||||||
|
* <b>NOTE:</b>
|
||||||
|
* <ul>
|
||||||
|
* <li>In order to have payload enabled, the first entry has to have a payload</li>
|
||||||
|
* <li>If the weight for an entry is not specified then a value of 1 is used</li>
|
||||||
|
* <li>A payload cannot be specified without having the weight specified for an entry</li>
|
||||||
|
* <li>If the payload for an entry is not specified (assuming payload is enabled)
|
||||||
|
* then an empty payload is returned</li>
|
||||||
|
* <li>An entry cannot have more than two <code>fieldDelimiter</code></li>
|
||||||
|
* </ul>
|
||||||
|
* <p/>
|
||||||
|
* <b>Example:</b><br/>
|
||||||
|
* word1 word2 TAB 100 TAB payload1<br/>
|
||||||
|
* word3 TAB 101<br/>
|
||||||
|
* word4 word3 TAB 102<br/>
|
||||||
*/
|
*/
|
||||||
public class FileDictionary implements Dictionary {
|
public class FileDictionary implements Dictionary {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tab-delimited fields are most common thus the default, but one can override this via the constructor
|
||||||
|
*/
|
||||||
|
public final static String DEFAULT_FIELD_DELIMITER = "\t";
|
||||||
private BufferedReader in;
|
private BufferedReader in;
|
||||||
private String line;
|
private String line;
|
||||||
private boolean done = false;
|
private boolean done = false;
|
||||||
|
private final String fieldDelimiter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a dictionary based on an inputstream.
|
* Creates a dictionary based on an inputstream.
|
||||||
|
* Using {@link #DEFAULT_FIELD_DELIMITER} as the
|
||||||
|
* field seperator in a line.
|
||||||
* <p>
|
* <p>
|
||||||
* NOTE: content is treated as UTF-8
|
* NOTE: content is treated as UTF-8
|
||||||
*/
|
*/
|
||||||
public FileDictionary(InputStream dictFile) {
|
public FileDictionary(InputStream dictFile) {
|
||||||
in = new BufferedReader(IOUtils.getDecodingReader(dictFile, IOUtils.CHARSET_UTF_8));
|
this(dictFile, DEFAULT_FIELD_DELIMITER);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a dictionary based on a reader.
|
* Creates a dictionary based on a reader.
|
||||||
|
* Using {@link #DEFAULT_FIELD_DELIMITER} as the
|
||||||
|
* field seperator in a line.
|
||||||
*/
|
*/
|
||||||
public FileDictionary(Reader reader) {
|
public FileDictionary(Reader reader) {
|
||||||
|
this(reader, DEFAULT_FIELD_DELIMITER);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a dictionary based on a reader.
|
||||||
|
* Using <code>fieldDelimiter</code> to seperate out the
|
||||||
|
* fields in a line.
|
||||||
|
*/
|
||||||
|
public FileDictionary(Reader reader, String fieldDelimiter) {
|
||||||
in = new BufferedReader(reader);
|
in = new BufferedReader(reader);
|
||||||
|
this.fieldDelimiter = fieldDelimiter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a dictionary based on an inputstream.
|
||||||
|
* Using <code>fieldDelimiter</code> to seperate out the
|
||||||
|
* fields in a line.
|
||||||
|
* <p>
|
||||||
|
* NOTE: content is treated as UTF-8
|
||||||
|
*/
|
||||||
|
public FileDictionary(InputStream dictFile, String fieldDelimiter) {
|
||||||
|
in = new BufferedReader(IOUtils.getDecodingReader(dictFile, IOUtils.CHARSET_UTF_8));
|
||||||
|
this.fieldDelimiter = fieldDelimiter;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public InputIterator getWordsIterator() {
|
public InputIterator getWordsIterator() {
|
||||||
return new FileIterator();
|
try {
|
||||||
|
return new FileIterator();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final class FileIterator implements InputIterator {
|
final class FileIterator implements InputIterator {
|
||||||
private long curFreq;
|
private long curWeight;
|
||||||
private final BytesRef spare = new BytesRef();
|
private final BytesRef spare = new BytesRef();
|
||||||
|
private BytesRef curPayload = new BytesRef();
|
||||||
|
private boolean isFirstLine = true;
|
||||||
|
private boolean hasPayloads = false;
|
||||||
|
|
||||||
|
private FileIterator() throws IOException {
|
||||||
|
line = in.readLine();
|
||||||
|
if (line == null) {
|
||||||
|
done = true;
|
||||||
|
IOUtils.close(in);
|
||||||
|
} else {
|
||||||
|
String[] fields = line.split(fieldDelimiter);
|
||||||
|
if (fields.length > 3) {
|
||||||
|
throw new IllegalArgumentException("More than 3 fields in one line");
|
||||||
|
} else if (fields.length == 3) { // term, weight, payload
|
||||||
|
hasPayloads = true;
|
||||||
|
spare.copyChars(fields[0]);
|
||||||
|
readWeight(fields[1]);
|
||||||
|
curPayload.copyChars(fields[2]);
|
||||||
|
} else if (fields.length == 2) { // term, weight
|
||||||
|
spare.copyChars(fields[0]);
|
||||||
|
readWeight(fields[1]);
|
||||||
|
} else { // only term
|
||||||
|
spare.copyChars(fields[0]);
|
||||||
|
curWeight = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long weight() {
|
public long weight() {
|
||||||
return curFreq;
|
return curWeight;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -75,20 +155,33 @@ public class FileDictionary implements Dictionary {
|
||||||
if (done) {
|
if (done) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
if (isFirstLine) {
|
||||||
|
isFirstLine = false;
|
||||||
|
return spare;
|
||||||
|
}
|
||||||
line = in.readLine();
|
line = in.readLine();
|
||||||
if (line != null) {
|
if (line != null) {
|
||||||
String[] fields = line.split("\t");
|
String[] fields = line.split(fieldDelimiter);
|
||||||
if (fields.length > 1) {
|
if (fields.length > 3) {
|
||||||
// keep reading floats for bw compat
|
throw new IllegalArgumentException("More than 3 fields in one line");
|
||||||
try {
|
} else if (fields.length == 3) { // term, weight and payload
|
||||||
curFreq = Long.parseLong(fields[1]);
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
curFreq = (long)Double.parseDouble(fields[1]);
|
|
||||||
}
|
|
||||||
spare.copyChars(fields[0]);
|
spare.copyChars(fields[0]);
|
||||||
} else {
|
readWeight(fields[1]);
|
||||||
spare.copyChars(line);
|
if (hasPayloads) {
|
||||||
curFreq = 1;
|
curPayload.copyChars(fields[2]);
|
||||||
|
}
|
||||||
|
} else if (fields.length == 2) { // term, weight
|
||||||
|
spare.copyChars(fields[0]);
|
||||||
|
readWeight(fields[1]);
|
||||||
|
if (hasPayloads) { // have an empty payload
|
||||||
|
curPayload = new BytesRef();
|
||||||
|
}
|
||||||
|
} else { // only term
|
||||||
|
spare.copyChars(fields[0]);
|
||||||
|
curWeight = 1;
|
||||||
|
if (hasPayloads) {
|
||||||
|
curPayload = new BytesRef();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return spare;
|
return spare;
|
||||||
} else {
|
} else {
|
||||||
|
@ -100,12 +193,21 @@ public class FileDictionary implements Dictionary {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BytesRef payload() {
|
public BytesRef payload() {
|
||||||
return null;
|
return (hasPayloads) ? curPayload : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean hasPayloads() {
|
public boolean hasPayloads() {
|
||||||
return false;
|
return hasPayloads;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readWeight(String weight) {
|
||||||
|
// keep reading floats for bw compat
|
||||||
|
try {
|
||||||
|
curWeight = Long.parseLong(weight);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
curWeight = (long)Double.parseDouble(weight);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
package org.apache.lucene.search.suggest;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.AbstractMap.SimpleEntry;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class FileDictionaryTest extends LuceneTestCase {
|
||||||
|
|
||||||
|
private Map.Entry<List<String>, String> generateFileEntry(String fieldDelimiter, boolean hasWeight, boolean hasPayload) {
|
||||||
|
List<String> entryValues = new ArrayList<>();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
String term = _TestUtil.randomSimpleString(random(), 1, 300);
|
||||||
|
sb.append(term);
|
||||||
|
entryValues.add(term);
|
||||||
|
if (hasWeight) {
|
||||||
|
sb.append(fieldDelimiter);
|
||||||
|
long weight = _TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE);
|
||||||
|
sb.append(weight);
|
||||||
|
entryValues.add(String.valueOf(weight));
|
||||||
|
}
|
||||||
|
if (hasPayload) {
|
||||||
|
sb.append(fieldDelimiter);
|
||||||
|
String payload = _TestUtil.randomSimpleString(random(), 1, 300);
|
||||||
|
sb.append(payload);
|
||||||
|
entryValues.add(payload);
|
||||||
|
}
|
||||||
|
sb.append("\n");
|
||||||
|
return new SimpleEntry<List<String>, String>(entryValues, sb.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map.Entry<List<List<String>>,String> generateFileInput(int count, String fieldDelimiter, boolean hasWeights, boolean hasPayloads) {
|
||||||
|
List<List<String>> entries = new ArrayList<>();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
boolean hasPayload = hasPayloads;
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
if (hasPayloads) {
|
||||||
|
hasPayload = (i==0) ? true : random().nextBoolean();
|
||||||
|
}
|
||||||
|
Map.Entry<List<String>, String> entrySet = generateFileEntry(fieldDelimiter, (!hasPayloads && hasWeights) ? random().nextBoolean() : hasWeights, hasPayload);
|
||||||
|
entries.add(entrySet.getKey());
|
||||||
|
sb.append(entrySet.getValue());
|
||||||
|
}
|
||||||
|
return new SimpleEntry<List<List<String>>, String>(entries, sb.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFileWithTerm() throws IOException {
|
||||||
|
Map.Entry<List<List<String>>,String> fileInput = generateFileInput(atLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, false, false);
|
||||||
|
InputStream inputReader = new ByteArrayInputStream(fileInput.getValue().getBytes("UTF-8"));
|
||||||
|
FileDictionary dictionary = new FileDictionary(inputReader);
|
||||||
|
List<List<String>> entries = fileInput.getKey();
|
||||||
|
InputIterator inputIter = dictionary.getWordsIterator();
|
||||||
|
assertFalse(inputIter.hasPayloads());
|
||||||
|
BytesRef term;
|
||||||
|
int count = 0;
|
||||||
|
while((term = inputIter.next()) != null) {
|
||||||
|
assertTrue(entries.size() > count);
|
||||||
|
List<String> entry = entries.get(count);
|
||||||
|
assertTrue(entry.size() >= 1); // at least a term
|
||||||
|
assertEquals(entry.get(0), term.utf8ToString());
|
||||||
|
assertEquals(1, inputIter.weight());
|
||||||
|
assertNull(inputIter.payload());
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, entries.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFileWithWeight() throws IOException {
|
||||||
|
Map.Entry<List<List<String>>,String> fileInput = generateFileInput(atLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, true, false);
|
||||||
|
InputStream inputReader = new ByteArrayInputStream(fileInput.getValue().getBytes("UTF-8"));
|
||||||
|
FileDictionary dictionary = new FileDictionary(inputReader);
|
||||||
|
List<List<String>> entries = fileInput.getKey();
|
||||||
|
InputIterator inputIter = dictionary.getWordsIterator();
|
||||||
|
assertFalse(inputIter.hasPayloads());
|
||||||
|
BytesRef term;
|
||||||
|
int count = 0;
|
||||||
|
while((term = inputIter.next()) != null) {
|
||||||
|
assertTrue(entries.size() > count);
|
||||||
|
List<String> entry = entries.get(count);
|
||||||
|
assertTrue(entry.size() >= 1); // at least a term
|
||||||
|
assertEquals(entry.get(0), term.utf8ToString());
|
||||||
|
assertEquals((entry.size() == 2) ? Long.parseLong(entry.get(1)) : 1, inputIter.weight());
|
||||||
|
assertNull(inputIter.payload());
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, entries.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFileWithWeightAndPayload() throws IOException {
|
||||||
|
Map.Entry<List<List<String>>,String> fileInput = generateFileInput(atLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, true, true);
|
||||||
|
InputStream inputReader = new ByteArrayInputStream(fileInput.getValue().getBytes("UTF-8"));
|
||||||
|
FileDictionary dictionary = new FileDictionary(inputReader);
|
||||||
|
List<List<String>> entries = fileInput.getKey();
|
||||||
|
InputIterator inputIter = dictionary.getWordsIterator();
|
||||||
|
assertTrue(inputIter.hasPayloads());
|
||||||
|
BytesRef term;
|
||||||
|
int count = 0;
|
||||||
|
while((term = inputIter.next()) != null) {
|
||||||
|
assertTrue(entries.size() > count);
|
||||||
|
List<String> entry = entries.get(count);
|
||||||
|
assertTrue(entry.size() >= 2); // at least term and weight
|
||||||
|
assertEquals(entry.get(0), term.utf8ToString());
|
||||||
|
assertEquals(Long.parseLong(entry.get(1)), inputIter.weight());
|
||||||
|
if (entry.size() == 3) {
|
||||||
|
assertEquals(entry.get(2), inputIter.payload().utf8ToString());
|
||||||
|
} else {
|
||||||
|
assertEquals(inputIter.payload().length, 0);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, entries.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFileWithOneEntry() throws IOException {
|
||||||
|
Map.Entry<List<List<String>>,String> fileInput = generateFileInput(1, FileDictionary.DEFAULT_FIELD_DELIMITER, true, true);
|
||||||
|
InputStream inputReader = new ByteArrayInputStream(fileInput.getValue().getBytes("UTF-8"));
|
||||||
|
FileDictionary dictionary = new FileDictionary(inputReader);
|
||||||
|
List<List<String>> entries = fileInput.getKey();
|
||||||
|
InputIterator inputIter = dictionary.getWordsIterator();
|
||||||
|
assertTrue(inputIter.hasPayloads());
|
||||||
|
BytesRef term;
|
||||||
|
int count = 0;
|
||||||
|
while((term = inputIter.next()) != null) {
|
||||||
|
assertTrue(entries.size() > count);
|
||||||
|
List<String> entry = entries.get(count);
|
||||||
|
assertTrue(entry.size() >= 2); // at least term and weight
|
||||||
|
assertEquals(entry.get(0), term.utf8ToString());
|
||||||
|
assertEquals(Long.parseLong(entry.get(1)), inputIter.weight());
|
||||||
|
if (entry.size() == 3) {
|
||||||
|
assertEquals(entry.get(2), inputIter.payload().utf8ToString());
|
||||||
|
} else {
|
||||||
|
assertEquals(inputIter.payload().length, 0);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, entries.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFileWithDifferentDelimiter() throws IOException {
|
||||||
|
Map.Entry<List<List<String>>,String> fileInput = generateFileInput(atLeast(100), " , ", true, true);
|
||||||
|
InputStream inputReader = new ByteArrayInputStream(fileInput.getValue().getBytes("UTF-8"));
|
||||||
|
FileDictionary dictionary = new FileDictionary(inputReader, " , ");
|
||||||
|
List<List<String>> entries = fileInput.getKey();
|
||||||
|
InputIterator inputIter = dictionary.getWordsIterator();
|
||||||
|
assertTrue(inputIter.hasPayloads());
|
||||||
|
BytesRef term;
|
||||||
|
int count = 0;
|
||||||
|
while((term = inputIter.next()) != null) {
|
||||||
|
assertTrue(entries.size() > count);
|
||||||
|
List<String> entry = entries.get(count);
|
||||||
|
assertTrue(entry.size() >= 2); // at least term and weight
|
||||||
|
assertEquals(entry.get(0), term.utf8ToString());
|
||||||
|
assertEquals(Long.parseLong(entry.get(1)), inputIter.weight());
|
||||||
|
if (entry.size() == 3) {
|
||||||
|
assertEquals(entry.get(2), inputIter.payload().utf8ToString());
|
||||||
|
} else {
|
||||||
|
assertEquals(inputIter.payload().length, 0);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, entries.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue