merged with trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/realtime_search@1092636 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2011-04-15 09:00:20 +00:00
commit f0b56fd92e
40 changed files with 52365 additions and 220 deletions

View File

@ -45,7 +45,14 @@ API Changes
======================= Lucene 3.x (not yet released) =======================
(No changes)
Bug fixes
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
New Features
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
======================= Lucene 3.1.0 =======================

View File

@ -58,6 +58,7 @@ public final class FieldInfo {
this.omitNorms = false;
this.omitTermFreqAndPositions = false;
}
assert !omitTermFreqAndPositions || !storePayloads;
}
void setCodecId(int codecId) {
@ -80,6 +81,7 @@ public final class FieldInfo {
// should only be called by FieldInfos#addOrUpdate
void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {
if (this.isIndexed != isIndexed) {
this.isIndexed = true; // once indexed, always index
}
@ -101,7 +103,9 @@ public final class FieldInfo {
}
if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) {
this.omitTermFreqAndPositions = true; // once omitTermFreqAndPositions is required for a field, term freqs/positions stay omitted for life
this.storePayloads = false;
}
}
assert !this.omitTermFreqAndPositions || !this.storePayloads;
}
}

View File

@ -424,8 +424,8 @@ public final class FieldInfos implements Iterable<FieldInfo> {
}
synchronized private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed,
boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {
boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {
if (globalFieldNumbers == null) {
throw new IllegalStateException("FieldInfos are read-only, create a new instance with a global field map to make modifications to FieldInfos");
}
@ -567,6 +567,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
output.writeVInt(FORMAT_CURRENT);
output.writeVInt(size());
for (FieldInfo fi : this) {
assert !fi.omitTermFreqAndPositions || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
@ -607,6 +608,14 @@ public final class FieldInfos implements Iterable<FieldInfo> {
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0;
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that here:
if (omitTermFreqAndPositions) {
storePayloads = false;
}
final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions);
addInternal.setCodecId(codecId);
}

View File

@ -74,8 +74,13 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) {
final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo;
FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber);
fieldInfo.storePayloads |= fieldWriter.hasPayloads;
final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber);
// Aggregate the storePayload as seen by the same
// field across multiple threads
if (!fieldInfo.omitTermFreqAndPositions) {
fieldInfo.storePayloads |= fieldWriter.hasPayloads;
}
// If this field has postings then add them to the
// segment

View File

@ -151,10 +151,10 @@ public class BlockTermsReader extends FieldsProducer {
}
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(in, BlockTermsWriter.CODEC_NAME,
CodecUtil.checkHeader(input, BlockTermsWriter.CODEC_NAME,
BlockTermsWriter.VERSION_START,
BlockTermsWriter.VERSION_CURRENT);
dirOffset = in.readLong();
dirOffset = input.readLong();
}
protected void seekDir(IndexInput input, long dirOffset)
@ -842,6 +842,11 @@ public class BlockTermsReader extends FieldsProducer {
private void decodeMetaData() throws IOException {
//System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state);
if (!seekPending) {
// TODO: cutover to random-access API
// here.... really stupid that we have to decode N
// wasted term metadata just to get to the N+1th
// that we really need...
// lazily catch up on metadata decode:
final int limit = state.termCount;
// We must set/incr state.termCount because

View File

@ -177,6 +177,7 @@ class SepSkipListReader extends MultiLevelSkipListReader {
@Override
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
int delta;
assert !omitTF || !currentFieldStoresPayloads;
if (currentFieldStoresPayloads) {
// the current field stores payloads.
// if the doc delta is odd then we have

View File

@ -34,8 +34,6 @@ import java.util.HashMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.junit.Assert;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CheckIndex;
@ -188,22 +186,35 @@ public class _TestUtil {
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(5);
randomFixedLengthUnicodeString(r, buffer, 0, buffer.length);
return new String(buffer, 0, end);
}
if (0 == t && i < end - 1) {
/**
* Fills the provided char[] with a valid random Unicode code
* unit sequence.
*/
public static void randomFixedLengthUnicodeString(Random random, char[] chars, int offset, int length) {
int i = offset;
final int end = offset + length;
while(i < end) {
final int t = random.nextInt(5);
if (0 == t && i < length - 1) {
// Make a surrogate pair
// High surrogate
buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
chars[i++] = (char) nextInt(random, 0xd800, 0xdbff);
// Low surrogate
buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff);
chars[i++] = (char) nextInt(random, 0xdc00, 0xdfff);
} else if (t <= 1) {
chars[i++] = (char) random.nextInt(0x80);
} else if (2 == t) {
chars[i++] = (char) nextInt(random, 0x80, 0x800);
} else if (3 == t) {
chars[i++] = (char) nextInt(random, 0x800, 0xd7ff);
} else if (4 == t) {
chars[i++] = (char) nextInt(random, 0xe000, 0xffff);
}
else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800);
else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
}
return new String(buffer, 0, end);
}
private static final int[] blockStarts = {

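A minimal usage sketch for the new helper (illustrative, not part of this commit's diff; it assumes a test-scope java.util.Random named random): fill a fixed-size buffer with a valid UTF-16 code unit sequence, surrogate pairs included, then wrap it in a String -- which is what the refactored randomUnicodeString(...) above now does internally.
char[] buffer = new char[20];
_TestUtil.randomFixedLengthUnicodeString(random, buffer, 0, buffer.length);
String s = new String(buffer);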
View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.*;
import org.apache.lucene.index.codecs.CodecProvider;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -77,6 +78,7 @@ public class Test2BTerms extends LuceneTestCase {
tokenCount++;
if (--nextSave == 0) {
savedTerms.add(new BytesRef(bytes));
System.out.println("TEST: save term=" + bytes);
nextSave = _TestUtil.nextInt(random, 500000, 1000000);
}
return true;
@ -153,13 +155,16 @@ public class Test2BTerms extends LuceneTestCase {
Directory dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
//Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
if (true) {
IndexWriter w = new IndexWriter(dir,
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
.setRAMBufferSizeMB(256.0)
.setMergeScheduler(new ConcurrentMergeScheduler())
.setMergePolicy(newLogMergePolicy(false, 10)));
.setMergePolicy(newLogMergePolicy(false, 10))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE));
MergePolicy mp = w.getConfig().getMergePolicy();
if (mp instanceof LogByteSizeMergePolicy) {
@ -211,6 +216,7 @@ public class Test2BTerms extends LuceneTestCase {
assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
dir.close();
System.out.println("TEST: done!");
}
private List<BytesRef> findTerms(IndexReader r) throws IOException {
@ -234,15 +240,29 @@ public class Test2BTerms extends LuceneTestCase {
IndexSearcher s = new IndexSearcher(r);
Collections.shuffle(terms);
TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
boolean failed = false;
for(int iter=0;iter<10*terms.size();iter++) {
final BytesRef term = terms.get(random.nextInt(terms.size()));
System.out.println("TEST: search " + term);
final long t0 = System.currentTimeMillis();
assertTrue(s.search(new TermQuery(new Term("field", term)), 1).totalHits > 0);
final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
if (count <= 0) {
System.out.println(" FAILED: count=" + count);
failed = true;
}
final long t1 = System.currentTimeMillis();
System.out.println(" took " + (t1-t0) + " millis");
assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seek(term));
TermsEnum.SeekStatus result = termsEnum.seek(term);
if (result != TermsEnum.SeekStatus.FOUND) {
if (result == TermsEnum.SeekStatus.END) {
System.out.println(" FAILED: got END");
} else {
System.out.println(" FAILED: wrong term: got " + termsEnum.term());
}
failed = true;
}
}
assertFalse(failed);
}
}

View File

@ -536,6 +536,7 @@ public class TestIndexWriterDelete extends LuceneTestCase {
fail(testName + " hit IOException after disk space was freed up");
}
}
// prevent throwing a random exception here!!
final double randomIOExceptionRate = dir.getRandomIOExceptionRate();
final long maxSizeInBytes = dir.getMaxSizeInBytes();
dir.setRandomIOExceptionRate(0.0);

View File

@ -119,6 +119,7 @@ public class TestOmitTf extends LuceneTestCase {
setMaxBufferedDocs(3).
setMergePolicy(newLogMergePolicy(2))
);
writer.setInfoStream(VERBOSE ? System.out : null);
Document d = new Document();
// this field will have Tf

View File

@ -0,0 +1,129 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Latvian.
*/
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
/** File containing default Latvian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
DEFAULT_STOPWORD_FILE);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
}
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public LatvianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates a
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @return A
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} if a stem exclusion set is
* provided, and {@link LatvianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new LatvianStemFilter(result);
return new TokenStreamComponents(source, result);
}
}
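A short usage sketch for the new analyzer (illustrative, not part of this commit; Version.LUCENE_40, the field name "f" and the sample text are assumptions). It mirrors the stemming assertion in TestLatvianAnalyzer below:
Analyzer analyzer = new LatvianAnalyzer(Version.LUCENE_40);
TokenStream stream = analyzer.reusableTokenStream("f", new StringReader("tirgiem"));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
  System.out.println(termAtt.toString()); // prints "tirg", stemmed by LatvianStemFilter
}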

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class LatvianStemFilter extends TokenFilter {
private final LatvianStemmer stemmer = new LatvianStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public LatvianStemFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}
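A brief sketch of the keyword-protection pattern described in the class javadoc (illustrative; the WhitespaceTokenizer source, Version.LUCENE_40, the reader variable and the protected term are assumptions). This is the same chain LatvianAnalyzer builds when a stem exclusion set is supplied:
Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
CharArraySet protectedTerms = new CharArraySet(Version.LUCENE_40,
    Collections.singleton("tirgiem"), false);
TokenStream result = new KeywordMarkerFilter(source, protectedTerms);
result = new LatvianStemFilter(result); // "tirgiem" keeps its surface form, "tirgus" still stems to "tirg"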

View File

@ -0,0 +1,174 @@
package org.apache.lucene.analysis.lv;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Light stemmer for Latvian.
* <p>
* This is a light version of the algorithm in Karlis Kreslins' PhD thesis
* <i>A stemming algorithm for Latvian</i> with the following modifications:
* <ul>
* <li>Only explicitly stems noun and adjective morphology
* <li>Stricter length/vowel checks for the resulting stems (verb etc. suffix stripping is removed)
* <li>Removes only the primary inflectional suffixes: case and number for nouns;
* case, number, gender, and definiteness for adjectives.
* <li>Palatalization is only handled when a declension II, V, VI noun suffix is removed.
* </ul>
*/
public class LatvianStemmer {
/**
* Stem a Latvian word. Returns the new adjusted length.
*/
public int stem(char s[], int len) {
int numVowels = numVowels(s, len);
for (int i = 0; i < affixes.length; i++) {
Affix affix = affixes[i];
if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
len -= affix.affix.length;
return affix.palatalizes ? unpalatalize(s, len) : len;
}
}
return len;
}
static final Affix affixes[] = {
new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
new Affix("ajam", 2, false), new Affix("ajām", 2, false),
new Affix("ajos", 2, false), new Affix("ajās", 2, false),
new Affix("iem", 2, true), new Affix("ajā", 2, false),
new Affix("ais", 2, false), new Affix("ai", 2, false),
new Affix("ei", 2, false), new Affix("ām", 1, false),
new Affix("am", 1, false), new Affix("ēm", 1, false),
new Affix("īm", 1, false), new Affix("im", 1, false),
new Affix("um", 1, false), new Affix("us", 1, true),
new Affix("as", 1, false), new Affix("ās", 1, false),
new Affix("es", 1, false), new Affix("os", 1, true),
new Affix("ij", 1, false), new Affix("īs", 1, false),
new Affix("ēs", 1, false), new Affix("is", 1, false),
new Affix("ie", 1, false), new Affix("u", 1, true),
new Affix("a", 1, true), new Affix("i", 1, true),
new Affix("e", 1, false), new Affix("ā", 1, false),
new Affix("ē", 1, false), new Affix("ī", 1, false),
new Affix("ū", 1, false), new Affix("o", 1, false),
new Affix("s", 0, false), new Affix("š", 0, false),
};
static class Affix {
char affix[]; // suffix
int vc; // vowel count of the suffix
boolean palatalizes; // true if we should fire palatalization rules.
Affix(String affix, int vc, boolean palatalizes) {
this.affix = affix.toCharArray();
this.vc = vc;
this.palatalizes = palatalizes;
}
}
/**
* Most cases are handled except for the ambiguous ones:
* <ul>
* <li> s -> š
* <li> t -> š
* <li> d -> ž
* <li> z -> ž
* </ul>
*/
private int unpalatalize(char s[], int len) {
// we check the character removed: if it's -u then
// it's decl. 2, 5, or 6 gen. pl., and these two rules can only apply then.
if (s[len] == 'u') {
// kš -> kst
if (endsWith(s, len, "kš")) {
len++;
s[len-2] = 's';
s[len-1] = 't';
return len;
}
// ņņ -> nn
if (endsWith(s, len, "ņņ")) {
s[len-2] = 'n';
s[len-1] = 'n';
return len;
}
}
// otherwise all other rules
if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
|| endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
// labial consonant
return len-1;
} else if (endsWith(s, len, "šņ")) {
s[len-2] = 's';
s[len-1] = 'n';
return len;
} else if (endsWith(s, len, "žņ")) {
s[len-2] = 'z';
s[len-1] = 'n';
return len;
} else if (endsWith(s, len, "šļ")) {
s[len-2] = 's';
s[len-1] = 'l';
return len;
} else if (endsWith(s, len, "žļ")) {
s[len-2] = 'z';
s[len-1] = 'l';
return len;
} else if (endsWith(s, len, "ļņ")) {
s[len-2] = 'l';
s[len-1] = 'n';
return len;
} else if (endsWith(s, len, "ļļ")) {
s[len-2] = 'l';
s[len-1] = 'l';
return len;
} else if (s[len-1] == 'č') {
s[len-1] = 'c';
return len;
} else if (s[len-1] == 'ļ') {
s[len-1] = 'l';
return len;
} else if (s[len-1] == 'ņ') {
s[len-1] = 'n';
return len;
}
return len;
}
/**
* Count the vowels in the string; we always require at least
* one in the remaining stem to accept it.
*/
private int numVowels(char s[], int len) {
int n = 0;
for (int i = 0; i < len; i++) {
switch(s[i]) {
case 'a': case 'e': case 'i':
case 'o': case 'u': case 'ā':
case 'ī': case 'ē': case 'ū':
n++;
}
}
return n;
}
}
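A worked example of the affix table above (illustrative sketch, not part of this commit), stemming the dative plural "tēviem", which TestLatvianStemmer below expects to conflate to "tēv":
LatvianStemmer stemmer = new LatvianStemmer();
char[] buf = "tēviem".toCharArray();
// numVowels = 3 > vc("iem") = 2 and len 6 >= 3 + 3, so the "iem" affix is stripped;
// the removed character is not 'u' and no general palatalization rule matches,
// so unpalatalize() leaves the stem as-is.
int newLen = stemmer.stem(buf, buf.length);
String stem = new String(buf, 0, newLen); // "tēv"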

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Latvian.
</body>
</html>

View File

@ -0,0 +1,172 @@
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
# the original list of over 800 forms was refined:
# pronouns, adverbs, interjections were removed
#
# prepositions
aiz
ap
ar
apakš
ārpus
augšpus
bez
caur
dēļ
gar
iekš
iz
kopš
labad
lejpus
līdz
no
otrpus
pa
par
pār
pēc
pie
pirms
pret
priekš
starp
šaipus
uz
viņpus
virs
virspus
zem
apakšpus
# Conjunctions
un
bet
jo
ja
ka
lai
tomēr
tikko
turpretī
arī
kaut
gan
tādēļ
ne
tikvien
vien
ir
te
vai
kamēr
# Particles
ar
diezin
droši
diemžēl
nebūt
ik
it
taču
nu
pat
tiklab
iekšpus
nedz
tik
nevis
turpretim
jeb
iekam
iekām
iekāms
kolīdz
līdzko
tiklīdz
jebšu
tālab
tāpēc
nekā
itin
jau
jel
nezin
tad
tikai
vis
tak
iekams
vien
# modal verbs
būt
biju
biji
bija
bijām
bijāt
esmu
esi
esam
esat
būšu
būsi
būs
būsim
būsiet
tikt
tiku
tiki
tika
tikām
tikāt
tieku
tiec
tiek
tiekam
tiekat
tikšu
tiks
tiksim
tiksiet
tapt
tapi
tapāt
topat
tapšu
tapsi
taps
tapsim
tapsiet
kļūt
kļuvu
kļuvi
kļuva
kļuvām
kļuvāt
kļūstu
kļūsti
kļūst
kļūstam
kļūstat
kļūšu
kļūsi
kļūs
kļūsim
kļūsiet
# verbs
varēt
varēju
varējām
varēšu
varēsim
var
varēji
varējāt
varēsi
varēsiet
varat
varēja
varēs

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
public void testResourcesAvailable() {
new LatvianAnalyzer(TEST_VERSION_CURRENT);
}
/** test stopwords and stemming */
public void testBasics() throws IOException {
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "tirgiem", "tirg");
checkOneTermReuse(a, "tirgus", "tirg");
// stopword
assertAnalyzesTo(a, "un", new String[] {});
}
/** test use of exclusion set */
public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>();
exclusionSet.add("tirgiem");
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "tirgiem", "tirgiem");
checkOneTermReuse(a, "tirgus", "tirg");
}
}

View File

@ -0,0 +1,272 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
* Basic tests for {@link LatvianStemmer}
*/
public class TestLatvianStemmer extends BaseTokenStreamTestCase {
private Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
}
};
public void testNouns1() throws IOException {
// decl. I
checkOneTerm(a, "tēvs", "tēv"); // nom. sing.
checkOneTerm(a, "tēvi", "tēv"); // nom. pl.
checkOneTerm(a, "tēva", "tēv"); // gen. sing.
checkOneTerm(a, "tēvu", "tēv"); // gen. pl.
checkOneTerm(a, "tēvam", "tēv"); // dat. sing.
checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
checkOneTerm(a, "tēvu", "tēv"); // acc. sing.
checkOneTerm(a, "tēvus", "tēv"); // acc. pl.
checkOneTerm(a, "tēvā", "tēv"); // loc. sing.
checkOneTerm(a, "tēvos", "tēv"); // loc. pl.
checkOneTerm(a, "tēvs", "tēv"); // voc. sing.
checkOneTerm(a, "tēvi", "tēv"); // voc. pl.
}
/**
* decl II nouns with (s,t) -> š and (d,z) -> ž
* palatalization will generally conflate to two stems
* due to the ambiguity (plural and singular).
*/
public void testNouns2() throws IOException {
// decl. II
// c -> č palatalization
checkOneTerm(a, "lācis", "lāc"); // nom. sing.
checkOneTerm(a, "lāči", "lāc"); // nom. pl.
checkOneTerm(a, "lāča", "lāc"); // gen. sing.
checkOneTerm(a, "lāču", "lāc"); // gen. pl.
checkOneTerm(a, "lācim", "lāc"); // dat. sing.
checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
checkOneTerm(a, "lāci", "lāc"); // acc. sing.
checkOneTerm(a, "lāčus", "lāc"); // acc. pl.
checkOneTerm(a, "lācī", "lāc"); // loc. sing.
checkOneTerm(a, "lāčos", "lāc"); // loc. pl.
checkOneTerm(a, "lāci", "lāc"); // voc. sing.
checkOneTerm(a, "lāči", "lāc"); // voc. pl.
// n -> ņ palatalization
checkOneTerm(a, "akmens", "akmen"); // nom. sing.
checkOneTerm(a, "akmeņi", "akmen"); // nom. pl.
checkOneTerm(a, "akmens", "akmen"); // gen. sing.
checkOneTerm(a, "akmeņu", "akmen"); // gen. pl.
checkOneTerm(a, "akmenim", "akmen"); // dat. sing.
checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
checkOneTerm(a, "akmeni", "akmen"); // acc. sing.
checkOneTerm(a, "akmeņus", "akmen"); // acc. pl.
checkOneTerm(a, "akmenī", "akmen"); // loc. sing.
checkOneTerm(a, "akmeņos", "akmen"); // loc. pl.
checkOneTerm(a, "akmens", "akmen"); // voc. sing.
checkOneTerm(a, "akmeņi", "akmen"); // voc. pl.
// no palatalization
checkOneTerm(a, "kurmis", "kurm"); // nom. sing.
checkOneTerm(a, "kurmji", "kurm"); // nom. pl.
checkOneTerm(a, "kurmja", "kurm"); // gen. sing.
checkOneTerm(a, "kurmju", "kurm"); // gen. pl.
checkOneTerm(a, "kurmim", "kurm"); // dat. sing.
checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
checkOneTerm(a, "kurmi", "kurm"); // acc. sing.
checkOneTerm(a, "kurmjus", "kurm"); // acc. pl.
checkOneTerm(a, "kurmī", "kurm"); // loc. sing.
checkOneTerm(a, "kurmjos", "kurm"); // loc. pl.
checkOneTerm(a, "kurmi", "kurm"); // voc. sing.
checkOneTerm(a, "kurmji", "kurm"); // voc. pl.
}
public void testNouns3() throws IOException {
// decl III
checkOneTerm(a, "lietus", "liet"); // nom. sing.
checkOneTerm(a, "lieti", "liet"); // nom. pl.
checkOneTerm(a, "lietus", "liet"); // gen. sing.
checkOneTerm(a, "lietu", "liet"); // gen. pl.
checkOneTerm(a, "lietum", "liet"); // dat. sing.
checkOneTerm(a, "lietiem", "liet"); // dat. pl.
checkOneTerm(a, "lietu", "liet"); // acc. sing.
checkOneTerm(a, "lietus", "liet"); // acc. pl.
checkOneTerm(a, "lietū", "liet"); // loc. sing.
checkOneTerm(a, "lietos", "liet"); // loc. pl.
checkOneTerm(a, "lietus", "liet"); // voc. sing.
checkOneTerm(a, "lieti", "liet"); // voc. pl.
}
public void testNouns4() throws IOException {
// decl IV
checkOneTerm(a, "lapa", "lap"); // nom. sing.
checkOneTerm(a, "lapas", "lap"); // nom. pl.
checkOneTerm(a, "lapas", "lap"); // gen. sing.
checkOneTerm(a, "lapu", "lap"); // gen. pl.
checkOneTerm(a, "lapai", "lap"); // dat. sing.
checkOneTerm(a, "lapām", "lap"); // dat. pl.
checkOneTerm(a, "lapu", "lap"); // acc. sing.
checkOneTerm(a, "lapas", "lap"); // acc. pl.
checkOneTerm(a, "lapā", "lap"); // loc. sing.
checkOneTerm(a, "lapās", "lap"); // loc. pl.
checkOneTerm(a, "lapa", "lap"); // voc. sing.
checkOneTerm(a, "lapas", "lap"); // voc. pl.
checkOneTerm(a, "puika", "puik"); // nom. sing.
checkOneTerm(a, "puikas", "puik"); // nom. pl.
checkOneTerm(a, "puikas", "puik"); // gen. sing.
checkOneTerm(a, "puiku", "puik"); // gen. pl.
checkOneTerm(a, "puikam", "puik"); // dat. sing.
checkOneTerm(a, "puikām", "puik"); // dat. pl.
checkOneTerm(a, "puiku", "puik"); // acc. sing.
checkOneTerm(a, "puikas", "puik"); // acc. pl.
checkOneTerm(a, "puikā", "puik"); // loc. sing.
checkOneTerm(a, "puikās", "puik"); // loc. pl.
checkOneTerm(a, "puika", "puik"); // voc. sing.
checkOneTerm(a, "puikas", "puik"); // voc. pl.
}
/**
* Genitive plural forms with (s,t) -> š and (d,z) -> ž
* will not conflate due to ambiguity.
*/
public void testNouns5() throws IOException {
// decl V
// l -> ļ palatalization
checkOneTerm(a, "egle", "egl"); // nom. sing.
checkOneTerm(a, "egles", "egl"); // nom. pl.
checkOneTerm(a, "egles", "egl"); // gen. sing.
checkOneTerm(a, "egļu", "egl"); // gen. pl.
checkOneTerm(a, "eglei", "egl"); // dat. sing.
checkOneTerm(a, "eglēm", "egl"); // dat. pl.
checkOneTerm(a, "egli", "egl"); // acc. sing.
checkOneTerm(a, "egles", "egl"); // acc. pl.
checkOneTerm(a, "eglē", "egl"); // loc. sing.
checkOneTerm(a, "eglēs", "egl"); // loc. pl.
checkOneTerm(a, "egle", "egl"); // voc. sing.
checkOneTerm(a, "egles", "egl"); // voc. pl.
}
public void testNouns6() throws IOException {
// decl VI
// no palatalization
checkOneTerm(a, "govs", "gov"); // nom. sing.
checkOneTerm(a, "govis", "gov"); // nom. pl.
checkOneTerm(a, "govs", "gov"); // gen. sing.
checkOneTerm(a, "govju", "gov"); // gen. pl.
checkOneTerm(a, "govij", "gov"); // dat. sing.
checkOneTerm(a, "govīm", "gov"); // dat. pl.
checkOneTerm(a, "govi ", "gov"); // acc. sing.
checkOneTerm(a, "govis", "gov"); // acc. pl.
checkOneTerm(a, "govi ", "gov"); // inst. sing.
checkOneTerm(a, "govīm", "gov"); // inst. pl.
checkOneTerm(a, "govī", "gov"); // loc. sing.
checkOneTerm(a, "govīs", "gov"); // loc. pl.
checkOneTerm(a, "govs", "gov"); // voc. sing.
checkOneTerm(a, "govis", "gov"); // voc. pl.
}
public void testAdjectives() throws IOException {
checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing.
checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing.
checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl.
checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl.
checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing.
checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing.
checkOneTerm(a, "zilas", "zil"); // indef. nom. fem. pl.
checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl.
checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing.
checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing.
checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl.
checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl.
checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing.
checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing.
checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl.
checkOneTerm(a, "zilo", "zil"); // def. gen. fem. pl.
checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing.
checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing.
checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl.
checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing.
checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing.
checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl.
checkOneTerm(a, "zilajām", "zil"); // def. dat. fem. pl.
checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing.
checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing.
checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl.
checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl.
checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing.
checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing.
checkOneTerm(a, "zilās", "zil"); // indef. acc. fem. pl.
checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl.
checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing.
checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing.
checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl.
checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl.
checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing.
checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing.
checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl.
checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl.
checkOneTerm(a, "zilais", "zil"); // voc. masc. sing.
checkOneTerm(a, "zilie", "zil"); // voc. masc. pl.
checkOneTerm(a, "zilā", "zil"); // voc. fem. sing.
checkOneTerm(a, "zilās", "zil"); // voc. fem. pl.
}
/**
* Note: we intentionally don't handle the ambiguous
* (s,t) -> š and (d,z) -> ž
*/
public void testPalatalization() throws IOException {
checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl.
checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
checkOneTerm(a, "zižļu", "zizl"); // gen. pl.
checkOneTerm(a, "vilnis", "viln"); // nom. sing.
checkOneTerm(a, "viļņu", "viln"); // gen. pl.
checkOneTerm(a, "lelle", "lell"); // nom. sing.
checkOneTerm(a, "leļļu", "lell"); // gen. pl.
checkOneTerm(a, "pinne", "pinn"); // nom. sing.
checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl.
}
/**
* Test some length restrictions, we require a 3+ char stem,
* with at least one vowel.
*/
public void testLength() throws IOException {
checkOneTerm(a, "usa", "usa"); // length
checkOneTerm(a, "60ms", "60ms"); // vowel count
}
}

View File

@ -75,7 +75,7 @@ class SegGraph {
List<SegToken> result = new ArrayList<SegToken>();
int s = -1, count = 0, size = tokenListTable.size();
List<SegToken> tokenList;
short index = 0;
int index = 0;
while (count < size) {
if (isStartExist(s)) {
tokenList = tokenListTable.get(s);

View File

@ -17,8 +17,11 @@
package org.apache.lucene.analysis.cn.smart;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
@ -166,4 +169,30 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
new int[] { 0, 1, 3, 4, 6, 7 },
new int[] { 1, 3, 4, 6, 7, 9 });
}
// LUCENE-3026
public void testLargeDocument() throws Exception {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 5000; i++) {
sb.append("我购买了道具和服装。");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
stream.reset();
while (stream.incrementToken()) {
}
}
// LUCENE-3026
public void testLargeSentence() throws Exception {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 5000; i++) {
sb.append("我购买了道具和服装");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
stream.reset();
while (stream.incrementToken()) {
}
}
}

View File

@ -60,6 +60,11 @@ Detailed Change List
New Features
----------------------
* SOLR-2378: A new, automaton-based, implementation of suggest (autocomplete)
component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell and very fast lookups at runtime.
(Dawid Weiss)
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative the current size of
@ -75,7 +80,7 @@ New Features
* SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
(Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
Koji Sekiguchi, Iv<EFBFBD>n de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger,
Harish Agarwal, yonik)
@ -110,7 +115,7 @@ New Features
* SOLR-1566: Transforming documents in the ResponseWriters. This will allow
for more complex results in responses and open the door for function queries
as results. (ryan with patches from grant, noble, cmale, yonik)
as results. (ryan with patches from grant, noble, cmale, yonik, Jan Høydahl)
* SOLR-2417: Add explain info directly to return documents using ?fl=_explain_ (ryan)

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.lv.LatvianStemFilter;
/**
* Factory for {@link LatvianStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new LatvianStemFilter(input);
}
}

View File

@ -162,7 +162,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
} else {
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
"Specified dictionary does not exist.");
"Specified dictionary does not exist: " + getDictionaryName(params));
}
}
}

View File

@ -19,8 +19,10 @@ package org.apache.solr.response;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
public class PageTool {
private long start;
@ -42,10 +44,16 @@ public class PageTool {
DocSlice doc_slice = (DocSlice) docs;
results_found = doc_slice.matches();
start = doc_slice.offset();
} else {
} else if(docs instanceof ResultContext) {
DocList dl = ((ResultContext) docs).docs;
results_found = dl.matches();
start = dl.offset();
} else if(docs instanceof SolrDocumentList) {
SolrDocumentList doc_list = (SolrDocumentList) docs;
results_found = doc_list.getNumFound();
start = doc_list.getStart();
} else {
throw new SolrException(SolrException.ErrorCode.UNKNOWN, "Unknown response type "+docs+". Expected one of DocSlice, ResultContext or SolrDocumentList");
}
}

View File

@ -12,7 +12,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.util.TermFreqIterator;
public abstract class Lookup {
/**
* Result of a lookup.
*/

View File

@ -0,0 +1,556 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.FST.Arc;
import org.apache.lucene.util.automaton.fst.NoOutputs;
import org.apache.lucene.util.automaton.fst.Outputs;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
/**
* Finite state automaton based implementation of the {@link Lookup} query
* suggestion / autocomplete interface.
*
* <h2>Implementation details</h2>
*
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
* <ul>
* <li>A set of input terms (String) and weights (float) is given.</li>
* <li>The range of weights is determined and then all weights are discretized into a fixed set
* of values ({@link #buckets}).
* Note that this means that minor changes in weights may be lost during automaton construction.
* In general, this is not a big problem because the "priorities" of completions can be split
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
* <li>All terms in the input are prepended with a synthetic pseudo-character encoding the weight
* of that term. For example, a term <code>abc</code> with a discretized weight equal to '1' would
* become <code>1abc</code>.</li>
* <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
* term in front).</li>
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
* </ul>
*
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
* <ul>
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
* starting with the highest one, we descend along the path of the input key. If the key is not
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
* No completions.
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
* root path we started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we have collected as many
* suggestions as the user requested.
* <li>In case the number of suggestions collected in the step above is still insufficient,
* we proceed to the next (smaller) weight leaving the root node and repeat the same
* algorithm again.
* </li>
* </ul>
*
* <h2>Runtime behavior and performance characteristics</h2>
*
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
* in a top-weights-first order. This is probably the most common use case: it allows
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
*
* <p>If there is an exact match in the automaton, it is returned first on the results
* list (even with by-weight sorting).
*
* <p>Note that the maximum lookup time for <b>any prefix</b>
* is the time of descending to the subtree, plus traversal of the subtree up to the number
* of requested suggestions (because they are already presorted by weight on the root level
* and alphabetically at any node level).
*
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
* used, but the algorithm for doing this is suboptimal.
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*/
public class FSTLookup extends Lookup {
/** A structure for a single entry (for sorting/ preprocessing). */
private static class Entry {
char [] term;
float weight;
public Entry(char [] term, float freq) {
this.term = term;
this.weight = freq;
}
}
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
* will not decrease for prefixes which have highly-weighted completions (because these
* are filled-in first), but will decrease significantly for low-weighted terms (but
* these should be infrequent, so it is all right).
*
* <p>The number of buckets must be within [1, 255] range.
*/
public static final String WEIGHT_BUCKETS = "weightBuckets";
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
/**
* @see #WEIGHT_BUCKETS
*/
private int buckets = 10;
/**
* @see #EXACT_MATCH_FIRST
*/
private boolean exactMatchFirst = true;
/**
* Finite state automaton encoding all the lookup terms. See class
* notes for details.
*/
private FST<Object> automaton;
/**
* An array of arcs leaving the root automaton state and encoding weights of all
* completions in their sub-trees.
*/
private Arc<Object> [] rootArcs;
/* */
@Override
@SuppressWarnings("rawtypes")
public void init(NamedList config, SolrCore core) {
this.buckets = config.get(WEIGHT_BUCKETS) != null
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
: 10;
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
: true;
}
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
// Buffer the input because we will need it twice: for calculating
// weights distribution and for the actual automata building.
List<Entry> entries = Lists.newArrayList();
while (tfit.hasNext()) {
String term = tfit.next();
char [] termChars = new char [term.length() + 1]; // add padding for weight.
for (int i = 0; i < term.length(); i++)
termChars[i + 1] = term.charAt(i);
entries.add(new Entry(termChars, tfit.freq()));
}
// Distribute weights into at most N buckets. This is a form of discretization to
// limit the number of possible weights so that they can be efficiently encoded in the
// automaton.
//
// It is assumed the distribution of weights is _linear_ so proportional division
// of [min, max] range will be enough here. Other approaches could be to sort
// weights and divide into proportional ranges.
if (entries.size() > 0) {
redistributeWeightsProportionalMinMax(entries, buckets);
encodeWeightPrefix(entries);
}
// Build the automaton (includes input sorting) and cache root arcs in order from the highest,
// to the lowest weight.
this.automaton = buildAutomaton(entries);
cacheRootArcs();
}
/**
* Cache the root node's output arcs starting with completions with the highest weights.
*/
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
if (automaton != null) {
List<Arc<Object>> rootArcs = Lists.newArrayList();
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
automaton.readFirstTargetArc(arc, arc);
while (true) {
rootArcs.add(new Arc<Object>().copyFrom(arc));
if (arc.isLast())
break;
automaton.readNextArc(arc);
}
Collections.reverse(rootArcs); // we want highest weights first.
this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
}
}
/**
* Not implemented.
*/
@Override
public boolean add(String key, Object value) {
// This implementation does not support ad-hoc additions (all input
// must be sorted for the builder).
return false;
}
/**
* Get the (approximated) weight of a single key (if there is a perfect match
* for it in the automaton).
*
* @return Returns the approximated weight of the input key or <code>null</code>
* if not found.
*/
@Override
public Float get(String key) {
return getExactMatchStartingFromRootArc(0, key);
}
/**
* Returns the first exact match by traversing root arcs, starting from
* the arc <code>i</code>.
*
* @param i The first root arc index in {@link #rootArcs} to consider when
* matching.
*/
private Float getExactMatchStartingFromRootArc(int i, String key) {
// Walk the cached root arcs, matching the input key character by character.
try {
final FST.Arc<Object> scratch = new FST.Arc<Object>();
for (; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
automaton.readFirstTargetArc(arc, arc);
if (arc.label == FST.END_LABEL) {
// Prefix-encoded weight.
return rootArc.label / (float) buckets;
}
}
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
return null;
}
/**
* Lookup autocomplete suggestions to <code>key</code>.
*
* @param key The prefix to which suggestions should be sought.
* @param onlyMorePopular Return most popular suggestions first. This is the default
* behavior for this implementation. Setting it to <code>false</code> has no effect (use
* constant term weights to sort alphabetically only).
* @param num At most this number of suggestions will be returned.
* @return Returns the suggestions, sorted by their approximated weight first (decreasing)
* and then alphabetically (utf16 codepoint order).
*/
@Override
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
if (key.length() == 0 || automaton == null) {
// Keep the result an ArrayList to keep calls monomorphic.
return EMPTY_RESULT;
}
try {
if (!onlyMorePopular && rootArcs.length > 1) {
// We could emit a warning here (?). An optimal strategy for alphabetically sorted
// suggestions would be to add them with a constant weight -- this saves unnecessary
// traversals and sorting.
return lookupSortedAlphabetically(key, num);
} else {
return lookupSortedByWeight(key, num, true);
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
}
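// Illustrative usage sketch (not part of this commit; the variable name is hypothetical):
// once build(TermFreqIterator) has run, fetch at most 5 completions for the prefix
// "ab", highest discretized weight first and then alphabetically:
//   List<LookupResult> hits = fstLookup.lookup("ab", true, 5);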
/**
* Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
* is a workaround: in general, use constant weights for alphabetically sorted result.
*/
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
// Greedily get num results from each weight branch.
List<LookupResult> res = lookupSortedByWeight(key, num, false);
// Sort and trim.
Collections.sort(res, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult o1, LookupResult o2) {
return o1.key.compareTo(o2.key);
}
});
if (res.size() > num) {
res = res.subList(0, num);
}
return res;
}
/**
* Lookup suggestions sorted by weight (descending order).
*
* @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
* suggestions have been collected. If <code>false</code>, it will collect suggestions from
* all weight arcs (needed for {@link #lookupSortedAlphabetically}).
*/
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
final StringBuilder output = new StringBuilder(key);
final int matchLength = key.length() - 1;
for (int i = 0; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
// Prefix-encoded weight.
final float weight = rootArc.label / (float) buckets;
// A subgraph starting from the current node has the completions
// of the key prefix. The arc we're at is the last key's byte,
// so we will collect it too.
output.setLength(matchLength);
if (collect(res, num, weight, output, arc) && greedy) {
// We have enough suggestion to return immediately. Keep on looking for an
// exact match, if requested.
if (exactMatchFirst) {
Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
if (exactMatchWeight != null) {
res.add(0, new LookupResult(key, exactMatchWeight));
while (res.size() > num) {
res.remove(res.size() - 1);
}
}
}
break;
}
}
}
return res;
}
/**
* Descend along the path starting at <code>arc</code> and going through the
* characters of the <code>term</code> argument.
*
* @param arc The starting arc. This argument is modified in-place.
* @param term The term to descend with.
* @return If <code>true</code>, <code>arc</code> will be set to the arc matching the
* last character of <code>term</code>. <code>false</code> is returned if no such
* prefix <code>term</code> exists.
*/
private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
final int max = term.length();
for (int i = 0; i < max; i++) {
if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
// No arc matches this character: no entry has this prefix.
return false;
}
}
return true;
}
/**
* Recursively collect lookup results from the automaton subgraph starting at <code>arc</code>.
*
* @param num Maximum number of results needed (early termination).
* @param weight Weight of all results found during this collection.
*/
private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
output.append((char) arc.label);
automaton.readFirstTargetArc(arc, arc);
while (true) {
if (arc.label == FST.END_LABEL) {
res.add(new LookupResult(output.toString(), weight));
if (res.size() >= num)
return true;
} else {
int save = output.length();
if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
return true;
}
output.setLength(save);
}
if (arc.isLast()) {
break;
}
automaton.readNextArc(arc);
}
return false;
}
/**
* Builds the final automaton from a list of entries.
*/
private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
if (entries.size() == 0)
return null;
// Sort by utf16 (raw char value)
final Comparator<Entry> comp = new Comparator<Entry>() {
public int compare(Entry o1, Entry o2) {
char [] ch1 = o1.term;
char [] ch2 = o2.term;
int len1 = ch1.length;
int len2 = ch2.length;
int max = Math.min(len1, len2);
for (int i = 0; i < max; i++) {
int v = ch1[i] - ch2[i];
if (v != 0) return v;
}
return len1 - len2;
}
};
Collections.sort(entries, comp);
// Remove duplicate entries: identical inputs would otherwise break automaton construction.
int len = entries.size();
int j = 0;
for (int i = 1; i < len; i++) {
if (comp.compare(entries.get(j), entries.get(i)) != 0) {
entries.set(++j, entries.get(i));
}
}
entries = entries.subList(0, j + 1);
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder =
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) {
final int termLength = scratchIntsRef.length = e.term.length;
scratchIntsRef.grow(termLength);
final int [] ints = scratchIntsRef.ints;
final char [] chars = e.term;
for (int i = termLength; --i >= 0;) {
ints[i] = chars[i];
}
builder.add(scratchIntsRef, empty);
}
return builder.finish();
}
/**
* Prepends each entry's weight, encoded as a single character, so that the
* root automaton node fans out to all possible weight buckets, starting with the arc
* that has the highest weight.
*/
private void encodeWeightPrefix(List<Entry> entries) {
for (Entry e : entries) {
int weight = (int) e.weight;
assert (weight >= 0 && weight <= buckets) :
"Weight out of range: " + weight + " [" + buckets + "]";
// There should be a single empty char reserved in front for the weight.
e.term[0] = (char) weight;
}
}
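// Illustrative example (values assumed, not part of this patch): with buckets = 10 and a
// remapped weight of 7, the reserved first char of an entry for "abc" is set to (char) 7,
// so the root arc's label carries the weight bucket, which lookupSortedByWeight()
// later decodes back as rootArc.label / (float) buckets.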
/**
* Split the [min, max] weight range into buckets and reassign weights: each entry's
* weight is remapped to the [0, buckets] range (that is, buckets + 1 distinct values).
*/
private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
float min = entries.get(0).weight;
float max = min;
for (Entry e : entries) {
min = Math.min(e.weight, min);
max = Math.max(e.weight, max);
}
final float range = max - min;
for (Entry e : entries) {
e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
}
}
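// Worked example (weights assumed for illustration): with buckets = 10 and input
// weights {1, 3, 5}, min = 1, max = 5 and range = 4, so the weights are remapped to
// floor(10 * 0/4) = 0, floor(10 * 2/4) = 5 and floor(10 * 4/4) = 10.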
/**
* Deserialization from disk.
*/
@Override
public synchronized boolean load(File storeDir) throws IOException {
File data = new File(storeDir, FILENAME);
if (!data.exists() || !data.canRead()) {
return false;
}
InputStream is = new BufferedInputStream(new FileInputStream(data));
try {
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
cacheRootArcs();
} finally {
Closeables.closeQuietly(is);
}
return true;
}
/**
* Serialization to disk.
*/
@Override
public synchronized boolean store(File storeDir) throws IOException {
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false;
}
if (this.automaton == null)
return false;
File data = new File(storeDir, FILENAME);
OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
try {
this.automaton.save(new OutputStreamDataOutput(os));
} finally {
Closeables.closeQuietly(os);
}
return true;
}
}
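A minimal end-to-end sketch of the class above; it reuses the TermFreq and TermFreqArrayIterator test helpers added elsewhere in this change, and the class/file name is made up for illustration:

import java.util.List;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;

public class FSTLookupUsageSketch {
  public static void main(String[] args) throws Exception {
    FSTLookup lookup = new FSTLookup();
    // Build the weight-bucketed automaton from term/weight pairs.
    lookup.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("one", 0.5f),
        new TermFreq("oneness", 1f),
        new TermFreq("two", 1f)
    }));
    // Top suggestions for the prefix "on", highest (approximate) weights first.
    List<LookupResult> results = lookup.lookup("on", true, 5);
    for (LookupResult r : results) {
      System.out.println(r.key);
    }
  }
}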

View File

@@ -0,0 +1,31 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import com.google.common.io.ByteStreams;
/**
* A {@link DataInput} wrapping a plain {@link InputStream}.
*/
public class InputStreamDataInput extends DataInput {
private final InputStream is;
public InputStreamDataInput(InputStream is) {
this.is = is;
}
@Override
public byte readByte() throws IOException {
int v = is.read();
if (v == -1) throw new EOFException();
return (byte) v;
}
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
ByteStreams.readFully(is, b, offset, len);
}
}

View File

@@ -0,0 +1,28 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.lucene.store.DataOutput;
/**
* A {@link DataOutput} wrapping a plain {@link OutputStream}.
*/
public class OutputStreamDataOutput extends DataOutput {
private final OutputStream os;
public OutputStreamDataOutput(OutputStream os) {
this.os = os;
}
@Override
public void writeByte(byte b) throws IOException {
os.write(b);
}
@Override
public void writeBytes(byte[] b, int offset, int length) throws IOException {
os.write(b, offset, length);
}
}
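A small round-trip sketch for the two adapters above (buffer contents and class name are illustrative only):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.solr.spelling.suggest.fst.InputStreamDataInput;
import org.apache.solr.spelling.suggest.fst.OutputStreamDataOutput;

public class StreamAdapterRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Write through Lucene's DataOutput API into a plain OutputStream.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    OutputStreamDataOutput out = new OutputStreamDataOutput(bytes);
    out.writeByte((byte) 42);
    out.writeBytes(new byte[] {1, 2, 3}, 0, 3);

    // Read the same bytes back through Lucene's DataInput API.
    InputStreamDataInput in = new InputStreamDataInput(new ByteArrayInputStream(bytes.toByteArray()));
    System.out.println(in.readByte()); // 42
    byte[] rest = new byte[3];
    in.readBytes(rest, 0, 3);          // {1, 2, 3}
  }
}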

File diff suppressed because it is too large

View File

@@ -31,7 +31,7 @@
<requestHandler name="standard" class="solr.StandardRequestHandler" />
<!-- Suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest">
<searchComponent class="solr.SpellCheckComponent" name="suggest_jaspell">
<lst name="spellchecker">
<str name="name">suggest</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
@@ -45,6 +45,38 @@
</lst>
</searchComponent>
<!-- TSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_tst">
<lst name="spellchecker">
<str name="name">suggest_tst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_tst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<float name="threshold">0.0</float>
</lst>
</searchComponent>
<!-- FSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_fst">
<lst name="spellchecker">
<str name="name">suggest_fst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_fst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<int name="weightBuckets">5</int>
<bool name="exactMatchFirst">true</bool>
</lst>
</searchComponent>
<!-- The default (jaspell) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest">
<lst name="defaults">
<str name="spellcheck">true</str>
@@ -52,8 +84,32 @@
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest</str>
<str>suggest_jaspell</str>
</arr>
</requestHandler>
<!-- tst (ternary tree based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_tst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_tst</str>
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest_tst</str>
</arr>
</requestHandler>
<!-- fst (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_fst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_fst</str>
<str name="spellcheck.collate">false</str>
</lst>
<arr name="components">
<str>suggest_fst</str>
</arr>
</requestHandler>
</config>
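For reference only, a rough client-side sketch of exercising the /suggest_fst handler configured above; host, port and core layout are assumptions about a local test setup, not part of this change:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class SuggestFstRequestSketch {
  public static void main(String[] args) throws Exception {
    // Assumes a locally running Solr core that uses the solrconfig above.
    URL url = new URL("http://localhost:8983/solr/suggest_fst"
        + "?q=ac&spellcheck.count=2&spellcheck.onlyMorePopular=true");
    BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
    for (String line; (line = reader.readLine()) != null; ) {
      System.out.println(line); // spellcheck response with the top suggestions
    }
    reader.close();
  }
}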

View File

@@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Simple tests to ensure the Latvian stem factory is working.
*/
public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
public void testStemming() throws Exception {
Reader reader = new StringReader("tirgiem tirgus");
LatvianStemFilterFactory factory = new LatvianStemFilterFactory();
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream, new String[] { "tirg", "tirg" });
}
}

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.search.Similarity;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Ignore;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
@@ -524,9 +525,10 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
*/
@Test
public void testExternalFieldValueSourceParser() {
clearIndex();
String field = "CoMpleX \" fieldName _extf";
String fieldAsFunc = "field(\"CoMpleX \\\" fieldName _extf\")";
String field = "CoMpleX fieldName _extf";
String fieldAsFunc = "field(\"CoMpleX fieldName _extf\")";
float[] ids = {100,-4,0,10,25,5,77,23,55,-78,-45,-24,63,78,94,22,34,54321,261,-627};
@@ -543,7 +545,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
singleTest(fieldAsFunc, "sqrt(\0)");
assertTrue(orig == FileFloatSource.onlyForTesting);
makeExternalFile(fieldAsFunc, "0=1","UTF-8");
makeExternalFile(field, "0=1","UTF-8");
assertU(adoc("id", "10000")); // will get same reader if no index change
assertU(commit());
singleTest(fieldAsFunc, "sqrt(\0)");
@@ -552,4 +554,31 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
purgeFieldCache(FieldCache.DEFAULT); // avoid FC insanity
}
/**
* some platforms don't allow quote characters in filenames, so
* in addition to testExternalFieldValueSourceParser above, test a field
* name with quotes in it that does NOT use ExternalFileField
* @see #testExternalFieldValueSourceParser
*/
@Test
public void testFieldValueSourceParser() {
clearIndex();
String field = "CoMpleX \" fieldName _f";
String fieldAsFunc = "field(\"CoMpleX \\\" fieldName _f\")";
float[] ids = {100,-4,0,10,25,5,77,1};
createIndex(field, ids);
// test identity (straight field value)
singleTest(fieldAsFunc, "\0",
100,100, -4,-4, 0,0, 10,10, 25,25, 5,5, 77,77, 1,1);
singleTest(fieldAsFunc, "sqrt(\0)",
100,10, 25,5, 0,0, 1,1);
singleTest(fieldAsFunc, "log(\0)", 1,0);
purgeFieldCache(FieldCache.DEFAULT); // avoid FC insanity
}
}

View File

@@ -0,0 +1,52 @@
package org.apache.solr.spelling.suggest;
import java.util.List;
import java.util.Locale;
/**
* Average with standard deviation.
*/
final class Average
{
/**
* Average (in milliseconds).
*/
public final double avg;
/**
* Standard deviation (in milliseconds).
*/
public final double stddev;
/**
* Construct an average from a precomputed mean and standard deviation.
*/
Average(double avg, double stddev)
{
this.avg = avg;
this.stddev = stddev;
}
public String toString()
{
return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
avg, stddev);
}
static Average from(List<Double> values)
{
double sum = 0;
double sumSquares = 0;
for (double l : values)
{
sum += l;
sumSquares += l * l;
}
double avg = sum / (double) values.size();
return new Average(
(sum / (double) values.size()),
Math.sqrt(sumSquares / (double) values.size() - avg * avg));
}
}
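A quick sanity check of the math above (hypothetical values; runnable from the same package because the class is package-private):

package org.apache.solr.spelling.suggest;

import java.util.Arrays;

class AverageSketch {
  public static void main(String[] args) {
    // avg = (1 + 2 + 3) / 3 = 2.0
    // stddev = sqrt((1*1 + 2*2 + 3*3) / 3 - 2.0 * 2.0) = sqrt(2/3) ~= 0.82
    Average a = Average.from(Arrays.asList(1.0, 2.0, 3.0));
    System.out.println(a); // prints "2 [+- 0.82]"
  }
}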

View File

@@ -0,0 +1,230 @@
package org.apache.solr.spelling.suggest;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
/**
* Benchmark tests for implementations of the {@link Lookup} interface.
*/
@Ignore // COMMENT ME TO RUN BENCHMARKS!
public class LookupBenchmarkTest {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
private final static int rounds = 15;
private final static int warmup = 5;
private final int num = 7;
private final boolean onlyMorePopular = true;
private final static Random random = new Random(0xdeadbeef);
/**
* Input term/weight pairs.
*/
private static TermFreq [] dictionaryInput;
/**
* Benchmark term/weight pairs (randomized order).
*/
private static List<TermFreq> benchmarkInput;
/**
* Loads terms and frequencies from Wikipedia (cached).
*/
@BeforeClass
public static void setup() throws Exception {
List<TermFreq> input = readTop50KWiki();
Collections.shuffle(input, random);
LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
Collections.shuffle(input, random);
LookupBenchmarkTest.benchmarkInput = input;
}
/**
* Collect the multilingual input for benchmarks/tests.
*/
public static List<TermFreq> readTop50KWiki() throws Exception {
List<TermFreq> input = Lists.newArrayList();
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
assert resource != null : "Resource missing: Top50KWiki.utf8";
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
int tab = line.indexOf('|');
Assert.assertTrue("No | separator?: " + line, tab >= 0);
float weight = Float.parseFloat(line.substring(tab + 1));
String key = line.substring(0, tab);
input.add(new TermFreq(key, weight));
}
return input;
}
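// For reference, the Top50KWiki.utf8 resource is expected to contain one "term|weight"
// pair per line, e.g. (values below are made up for illustration):
//
//   the|3000000.0
//   zażółć|13.0
//
// Lines without a '|' separator fail the assertion above.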
/**
* Test construction time.
*/
@Test
public void testConstructionTime() throws Exception {
System.err.println("-- construction time");
for (final Class<? extends Lookup> cls : benchmarkClasses) {
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
final Lookup lookup = buildLookup(cls, dictionaryInput);
return lookup.hashCode();
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
cls.getSimpleName(),
dictionaryInput.length,
result.average.toString()));
}
}
/**
* Test memory required for the storage.
*/
@Test
public void testStorageNeeds() throws Exception {
System.err.println("-- RAM consumption");
final RamUsageEstimator rue = new RamUsageEstimator();
for (Class<? extends Lookup> cls : benchmarkClasses) {
Lookup lookup = buildLookup(cls, dictionaryInput);
System.err.println(
String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
lookup.getClass().getSimpleName(),
rue.estimateRamUsage(lookup)));
}
}
/**
* Create {@link Lookup} instance and populate it.
*/
private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
Lookup lookup = cls.newInstance();
lookup.build(new TermFreqArrayIterator(input));
return lookup;
}
/**
* Test performance of lookup on full hits.
*/
@Test
public void testPerformanceOnFullHits() throws Exception {
final int minPrefixLen = 100;
final int maxPrefixLen = 200;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on longer term prefixes (6-9 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes6_9() throws Exception {
final int minPrefixLen = 6;
final int maxPrefixLen = 9;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on short term prefixes (2-4 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes2_4() throws Exception {
final int minPrefixLen = 2;
final int maxPrefixLen = 4;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Run the actual benchmark.
*/
public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
final int num, final boolean onlyMorePopular) throws Exception {
System.err.println(String.format(Locale.ENGLISH,
"-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
minPrefixLen, maxPrefixLen, num, onlyMorePopular));
for (Class<? extends Lookup> cls : benchmarkClasses) {
final Lookup lookup = buildLookup(cls, dictionaryInput);
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
public String apply(TermFreq tf) {
return tf.term.substring(0, Math.min(tf.term.length(),
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
}
}));
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
int v = 0;
for (String term : input) {
v += lookup.lookup(term, onlyMorePopular, num).size();
}
return v;
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
lookup.getClass().getSimpleName(),
input.size(),
result.average.toString(),
input.size() / result.average.avg));
}
}
/**
* Do the measurements.
*/
private BenchmarkResult measure(Callable<Integer> callable) {
final double NANOS_PER_MS = 1000000;
try {
List<Double> times = Lists.newArrayList();
for (int i = 0; i < warmup + rounds; i++) {
final long start = System.nanoTime();
guard = callable.call().intValue();
times.add((System.nanoTime() - start) / NANOS_PER_MS);
}
return new BenchmarkResult(times, warmup, rounds);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/** Guard against the JIT optimizing the benchmark loop away. */
@SuppressWarnings("unused")
private static volatile int guard;
private static class BenchmarkResult {
/** Average time per round (ms). */
public final Average average;
public BenchmarkResult(List<Double> times, int warmup, int rounds) {
this.average = Average.from(times.subList(warmup, times.size()));
}
}
}

View File

@@ -19,62 +19,74 @@ package org.apache.solr.spelling.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Test;
public class PersistenceTest extends SolrTestCaseJ4 {
public static final String[] keys = new String[] {
"one",
"two",
"three",
"four",
"oneness",
"onerous",
"onesimus",
"twofold",
"twonk",
"thrive",
"through",
"threat",
"foundation",
"fourier",
"fourty"
};
public final String[] keys = new String[] {
"one",
"two",
"three",
"four",
"oneness",
"onerous",
"onesimus",
"twofold",
"twonk",
"thrive",
"through",
"threat",
"foundation",
"fourier",
"fourty"};
@Test
public void testTSTPersistence() throws Exception {
TSTLookup lookup = new TSTLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
lookup = new TSTLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
runTest(TSTLookup.class, true);
}
@Test
public void testJaspellPersistence() throws Exception {
JaspellLookup lookup = new JaspellLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
lookup = new JaspellLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
runTest(JaspellLookup.class, true);
}
@Test
public void testFSTPersistence() throws Exception {
runTest(FSTLookup.class, false);
}
private void runTest(Class<? extends Lookup> lookupClass,
boolean supportsExactWeights) throws Exception {
// Add all input keys.
Lookup lookup = lookupClass.newInstance();
TermFreq[] keys = new TermFreq[this.keys.length];
for (int i = 0; i < keys.length; i++)
keys[i] = new TermFreq(this.keys[i], (float) i);
lookup.build(new TermFreqArrayIterator(keys));
// Store the suggester.
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
// Re-read it from disk.
lookup = lookupClass.newInstance();
lookup.load(storeDir);
// Assert validity.
float previous = Float.NEGATIVE_INFINITY;
for (TermFreq k : keys) {
Float val = (Float) lookup.get(k.term);
assertNotNull(k.term, val);
if (supportsExactWeights) {
assertEquals(k.term, Float.valueOf(k.v), val);
} else {
assertTrue(val + ">=" + previous, val >= previous);
previous = val.floatValue();
}
}
}
}

View File

@@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterFSTTest extends SuggesterTest {
public SuggesterFSTTest() {
super.requestUri = "/suggest_fst";
}
}

View File

@@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterTSTTest extends SuggesterTest {
public SuggesterTSTTest() {
super.requestUri = "/suggest_tst";
}
}

View File

@@ -17,28 +17,19 @@
package org.apache.solr.spelling.suggest;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.collect.Lists;
import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
public class SuggesterTest extends SolrTestCaseJ4 {
/**
* Expected URI at which the given suggester will live.
*/
protected String requestUri = "/suggest";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-spellchecker.xml","schema-spellchecker.xml");
@@ -59,10 +50,9 @@ public class SuggesterTest extends SolrTestCaseJ4 {
@Test
public void testSuggestions() throws Exception {
addDocs();
assertU(commit()); // configured to do a rebuild on commit
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@@ -82,12 +72,12 @@ public class SuggesterTest extends SolrTestCaseJ4 {
dataDir = data;
configString = config;
initCore();
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
);
// restore the property
System.setProperty("solr.test.leavedatadir", leaveData);
}
@@ -96,132 +86,13 @@
public void testRebuild() throws Exception {
addDocs();
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
assertU(adoc("id", "4",
"text", "actually"
));
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
}
private TermFreqIterator getTFIT() {
final int count = 100000;
TermFreqIterator tfit = new TermFreqIterator() {
Random r = new Random(1234567890L);
Random r1 = new Random(1234567890L);
int pos;
public float freq() {
return r1.nextInt(4);
}
public boolean hasNext() {
return pos < count;
}
public String next() {
pos++;
return Long.toString(r.nextLong());
}
public void remove() {
throw new UnsupportedOperationException();
}
};
return tfit;
}
static class Bench {
long buildTime;
long lookupTime;
}
@Test @Ignore
public void testBenchmark() throws Exception {
final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList();
benchmarkClasses.add(JaspellLookup.class);
benchmarkClasses.add(TSTLookup.class);
// Run a single pass just to see if everything works fine and provide size estimates.
final RamUsageEstimator rue = new RamUsageEstimator();
for (Class<? extends Lookup> cls : benchmarkClasses) {
Lookup lookup = singleBenchmark(cls, null);
System.err.println(
String.format(Locale.ENGLISH,
"%20s, size[B]=%,d",
lookup.getClass().getSimpleName(),
rue.estimateRamUsage(lookup)));
}
int warmupCount = 10;
int measuredCount = 100;
for (Class<? extends Lookup> cls : benchmarkClasses) {
Bench b = fullBenchmark(cls, warmupCount, measuredCount);
System.err.println(String.format(Locale.ENGLISH,
"%s: buildTime[ms]=%,d lookupTime[ms]=%,d",
cls.getSimpleName(),
(b.buildTime / measuredCount),
(b.lookupTime / measuredCount / 1000000)));
}
}
private Lookup singleBenchmark(Class<? extends Lookup> cls, Bench bench) throws Exception {
Lookup lookup = cls.newInstance();
long start = System.currentTimeMillis();
lookup.build(getTFIT());
long buildTime = System.currentTimeMillis() - start;
TermFreqIterator tfit = getTFIT();
long elapsed = 0;
while (tfit.hasNext()) {
String key = tfit.next();
// take only the first part of the key
int len = key.length() > 4 ? key.length() / 3 : 2;
String prefix = key.substring(0, len);
start = System.nanoTime();
List<LookupResult> res = lookup.lookup(prefix, true, 10);
elapsed += System.nanoTime() - start;
assertTrue(res.size() > 0);
for (LookupResult lr : res) {
assertTrue(lr.key.startsWith(prefix));
}
}
if (bench != null) {
bench.buildTime += buildTime;
bench.lookupTime += elapsed;
}
return lookup;
}
private Bench fullBenchmark(Class<? extends Lookup> cls, int warmupCount, int measuredCount) throws Exception {
System.err.println("* Running " + measuredCount + " iterations for " + cls.getSimpleName() + " ...");
System.err.println(" - warm-up " + warmupCount + " iterations...");
for (int i = 0; i < warmupCount; i++) {
System.runFinalization();
System.gc();
singleBenchmark(cls, null);
}
Bench b = new Bench();
System.err.print(" - main iterations:"); System.err.flush();
for (int i = 0; i < measuredCount; i++) {
System.runFinalization();
System.gc();
singleBenchmark(cls, b);
if (i > 0 && (i % 10 == 0)) {
System.err.print(" " + i);
System.err.flush();
}
}
System.err.println();
return b;
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
}
}

View File

@@ -0,0 +1,11 @@
package org.apache.solr.spelling.suggest;
public final class TermFreq {
public final String term;
public final float v;
public TermFreq(String term, float v) {
this.term = term;
this.v = v;
}
}

View File

@@ -0,0 +1,40 @@
package org.apache.solr.spelling.suggest;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.solr.util.TermFreqIterator;
/**
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
*/
public final class TermFreqArrayIterator implements TermFreqIterator {
private final Iterator<TermFreq> i;
private TermFreq current;
public TermFreqArrayIterator(Iterator<TermFreq> i) {
this.i = i;
}
public TermFreqArrayIterator(TermFreq [] i) {
this(Arrays.asList(i));
}
public TermFreqArrayIterator(Iterable<TermFreq> i) {
this(i.iterator());
}
public float freq() {
return current.v;
}
public boolean hasNext() {
return i.hasNext();
}
public String next() {
return (current = i.next()).term;
}
public void remove() { throw new UnsupportedOperationException(); }
}
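A minimal sketch of consuming the iterator directly (terms and weights are made up); note that freq() reports the weight of the term most recently returned by next():

import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;

public class TermFreqIteratorSketch {
  public static void main(String[] args) {
    TermFreqArrayIterator it = new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("foo", 1f),
        new TermFreq("bar", 2f)
    });
    while (it.hasNext()) {
      String term = it.next();   // advances and returns the next term
      float weight = it.freq();  // weight of the term just returned
      System.out.println(term + "|" + weight);
    }
  }
}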

View File

@@ -0,0 +1,155 @@
package org.apache.solr.spelling.suggest.fst;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
* Unit tests for {@link FSTLookup}.
*/
public class FSTLookupTest extends LuceneTestCase {
public static TermFreq tf(String t, float v) {
return new TermFreq(t, v);
}
private FSTLookup lookup;
@Before
public void prepare() throws Exception {
final TermFreq[] keys = new TermFreq[] {
tf("one", 0.5f),
tf("oneness", 1),
tf("onerous", 1),
tf("onesimus", 1),
tf("two", 1),
tf("twofold", 1),
tf("twonk", 1),
tf("thrive", 1),
tf("through", 1),
tf("threat", 1),
tf("three", 1),
tf("foundation", 1),
tf("fourier", 1),
tf("four", 1),
tf("fourty", 1),
tf("xo", 1),
};
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(keys));
}
@Test
public void testExactMatchHighPriority() throws Exception {
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
}
@Test
public void testExactMatchLowPriority() throws Exception {
assertMatchEquals(lookup.lookup("one", true, 2),
"one/0.0",
"oneness/1.0");
}
@Test
public void testMiss() throws Exception {
assertMatchEquals(lookup.lookup("xyz", true, 1));
}
@Test
public void testAlphabeticWithWeights() throws Exception {
assertEquals(0, lookup.lookup("xyz", false, 1).size());
}
@Test
public void testFullMatchList() throws Exception {
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
"oneness/1.0",
"onerous/1.0",
"onesimus/1.0",
"one/0.0");
}
@Test
public void testMultilingualInput() throws Exception {
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
}
}
@Test
public void testEmptyInput() throws Exception {
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
assertMatchEquals(lookup.lookup("", true, 10));
}
@Test
public void testRandom() throws Exception {
List<TermFreq> freqs = Lists.newArrayList();
Random rnd = random;
for (int i = 0; i < 5000; i++) {
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
}
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
for (TermFreq tf : freqs) {
final String term = tf.term;
for (int i = 1; i < term.length(); i++) {
String prefix = term.substring(0, i);
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
Assert.assertTrue(lr.key.startsWith(prefix));
}
}
}
}
private void assertMatchEquals(List<LookupResult> res, String... expected) {
String [] result = new String [res.size()];
for (int i = 0; i < res.size(); i++)
result[i] = res.get(i).toString();
if (!Arrays.equals(expected, result)) {
int colLen = Math.max(maxLen(expected), maxLen(result));
StringBuilder b = new StringBuilder();
String format = "%" + colLen + "s " + "%" + colLen + "s\n";
b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
for (int i = 0; i < Math.max(result.length, expected.length); i++) {
b.append(String.format(Locale.ENGLISH, format,
i < expected.length ? expected[i] : "--",
i < result.length ? result[i] : "--"));
}
System.err.println(b.toString());
fail("Expected different output:\n" + b.toString());
}
}
private int maxLen(String[] result) {
int len = 0;
for (String s : result)
len = Math.max(len, s.length());
return len;
}
}