mirror of https://github.com/apache/lucene.git
commit f0b56fd92e

merged with trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/realtime_search@1092636 13f79535-47bb-0310-9956-ffa450edef68
@@ -45,7 +45,14 @@ API Changes

======================= Lucene 3.x (not yet released) =======================

(No changes)

Bug fixes

* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
  on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)

New Features

* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)

======================= Lucene 3.1.0 =======================

@@ -58,6 +58,7 @@ public final class FieldInfo {
      this.omitNorms = false;
      this.omitTermFreqAndPositions = false;
    }
    assert !omitTermFreqAndPositions || !storePayloads;
  }

  void setCodecId(int codecId) {

@@ -80,6 +81,7 @@ public final class FieldInfo {
  // should only be called by FieldInfos#addOrUpdate
  void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector,
              boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {

    if (this.isIndexed != isIndexed) {
      this.isIndexed = true; // once indexed, always index
    }

@@ -101,7 +103,9 @@ public final class FieldInfo {
    }
    if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) {
      this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life
      this.storePayloads = false;
    }
    }
    assert !this.omitTermFreqAndPositions || !this.storePayloads;
  }
}

@@ -424,8 +424,8 @@ public final class FieldInfos implements Iterable<FieldInfo> {
  }

  synchronized private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed,
      boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
      boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {
      boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
      boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) {
    if (globalFieldNumbers == null) {
      throw new IllegalStateException("FieldInfos are read-only, create a new instance with a global field map to make modifications to FieldInfos");
    }

@@ -567,6 +567,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
    output.writeVInt(FORMAT_CURRENT);
    output.writeVInt(size());
    for (FieldInfo fi : this) {
      assert !fi.omitTermFreqAndPositions || !fi.storePayloads;
      byte bits = 0x0;
      if (fi.isIndexed) bits |= IS_INDEXED;
      if (fi.storeTermVector) bits |= STORE_TERMVECTOR;

@@ -607,6 +608,14 @@ public final class FieldInfos implements Iterable<FieldInfo> {
    boolean omitNorms = (bits & OMIT_NORMS) != 0;
    boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
    boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0;

    // LUCENE-3027: past indices were able to write
    // storePayloads=true when omitTFAP is also true,
    // which is invalid. We correct that, here:
    if (omitTermFreqAndPositions) {
      storePayloads = false;
    }

    final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions);
    addInternal.setCodecId(codecId);
  }

@@ -74,8 +74,13 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
    for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) {
      final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo;

      FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber);
      fieldInfo.storePayloads |= fieldWriter.hasPayloads;
      final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber);

      // Aggregate the storePayload as seen by the same
      // field across multiple threads
      if (!fieldInfo.omitTermFreqAndPositions) {
        fieldInfo.storePayloads |= fieldWriter.hasPayloads;
      }

      // If this field has postings then add them to the
      // segment

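What this means at indexing time, as an illustrative sketch only (not part of the patch; the field name and values are made up, and setOmitTermFreqAndPositions is assumed to be the 3.x/4.0-era Fieldable setter): once a field omits term frequencies and positions, payloads cannot be stored for it, so the writer clears storePayloads rather than persisting the invalid combination that LUCENE-3027 describes.

    Document doc = new Document();
    Field tags = new Field("tags", "lucene solr search", Field.Store.NO, Field.Index.ANALYZED);
    tags.setOmitTermFreqAndPositions(true); // frequencies and positions are dropped for this field...
    doc.add(tags);                          // ...so any payloads the analysis chain emits are ignored
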
@@ -151,10 +151,10 @@ public class BlockTermsReader extends FieldsProducer {
  }

  protected void readHeader(IndexInput input) throws IOException {
    CodecUtil.checkHeader(in, BlockTermsWriter.CODEC_NAME,
    CodecUtil.checkHeader(input, BlockTermsWriter.CODEC_NAME,
                          BlockTermsWriter.VERSION_START,
                          BlockTermsWriter.VERSION_CURRENT);
    dirOffset = in.readLong();
    dirOffset = input.readLong();
  }

  protected void seekDir(IndexInput input, long dirOffset)

@@ -842,6 +842,11 @@ public class BlockTermsReader extends FieldsProducer {
    private void decodeMetaData() throws IOException {
      //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state);
      if (!seekPending) {
        // TODO: cutover to random-access API
        // here.... really stupid that we have to decode N
        // wasted term metadata just to get to the N+1th
        // that we really need...

        // lazily catch up on metadata decode:
        final int limit = state.termCount;
        // We must set/incr state.termCount because

@@ -177,6 +177,7 @@ class SepSkipListReader extends MultiLevelSkipListReader {
  @Override
  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
    int delta;
    assert !omitTF || !currentFieldStoresPayloads;
    if (currentFieldStoresPayloads) {
      // the current field stores payloads.
      // if the doc delta is odd then we have

@@ -34,8 +34,6 @@ import java.util.HashMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.junit.Assert;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CheckIndex;

@@ -188,22 +186,35 @@ public class _TestUtil {
      return "";
    }
    final char[] buffer = new char[end];
    for (int i = 0; i < end; i++) {
      int t = r.nextInt(5);
    randomFixedLengthUnicodeString(r, buffer, 0, buffer.length);
    return new String(buffer, 0, end);
  }

      if (0 == t && i < end - 1) {
  /**
   * Fills provided char[] with valid random unicode code
   * unit sequence.
   */
  public static void randomFixedLengthUnicodeString(Random random, char[] chars, int offset, int length) {
    int i = offset;
    final int end = offset + length;
    while(i < end) {
      final int t = random.nextInt(5);
      if (0 == t && i < length - 1) {
        // Make a surrogate pair
        // High surrogate
        buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
        chars[i++] = (char) nextInt(random, 0xd800, 0xdbff);
        // Low surrogate
        buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff);
        chars[i++] = (char) nextInt(random, 0xdc00, 0xdfff);
      } else if (t <= 1) {
        chars[i++] = (char) random.nextInt(0x80);
      } else if (2 == t) {
        chars[i++] = (char) nextInt(random, 0x80, 0x800);
      } else if (3 == t) {
        chars[i++] = (char) nextInt(random, 0x800, 0xd7ff);
      } else if (4 == t) {
        chars[i++] = (char) nextInt(random, 0xe000, 0xffff);
      }
      else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
      else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800);
      else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
      else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
    }
    return new String(buffer, 0, end);
  }

  private static final int[] blockStarts = {

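A minimal usage sketch for the new helper, written as a hypothetical LuceneTestCase-style test (the test name and buffer size are made up): because the method only emits surrogates in pairs, the filled buffer is a well-formed UTF-16 sequence and survives a UTF-8 round trip.

    public void testRandomFixedLengthUnicodeString() throws Exception {
      char[] buffer = new char[64];
      _TestUtil.randomFixedLengthUnicodeString(random, buffer, 0, buffer.length);
      String s = new String(buffer);
      // no unpaired surrogates, so encoding and decoding is lossless
      assertEquals(s, new String(s.getBytes("UTF-8"), "UTF-8"));
    }
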
@@ -26,6 +26,7 @@ import org.apache.lucene.document.*;
|
|||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
@@ -77,6 +78,7 @@ public class Test2BTerms extends LuceneTestCase {
|
|||
tokenCount++;
|
||||
if (--nextSave == 0) {
|
||||
savedTerms.add(new BytesRef(bytes));
|
||||
System.out.println("TEST: save term=" + bytes);
|
||||
nextSave = _TestUtil.nextInt(random, 500000, 1000000);
|
||||
}
|
||||
return true;
|
||||
|
@@ -153,13 +155,16 @@ public class Test2BTerms extends LuceneTestCase {
|
|||
|
||||
Directory dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
|
||||
//Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
|
||||
|
||||
if (true) {
|
||||
|
||||
IndexWriter w = new IndexWriter(dir,
|
||||
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
|
||||
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
|
||||
.setRAMBufferSizeMB(256.0)
|
||||
.setMergeScheduler(new ConcurrentMergeScheduler())
|
||||
.setMergePolicy(newLogMergePolicy(false, 10)));
|
||||
.setMergePolicy(newLogMergePolicy(false, 10))
|
||||
.setOpenMode(IndexWriterConfig.OpenMode.CREATE));
|
||||
|
||||
MergePolicy mp = w.getConfig().getMergePolicy();
|
||||
if (mp instanceof LogByteSizeMergePolicy) {
|
||||
|
@@ -211,6 +216,7 @@ public class Test2BTerms extends LuceneTestCase {
|
|||
assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
|
||||
|
||||
dir.close();
|
||||
System.out.println("TEST: done!");
|
||||
}
|
||||
|
||||
private List<BytesRef> findTerms(IndexReader r) throws IOException {
|
||||
|
@@ -234,15 +240,29 @@ public class Test2BTerms extends LuceneTestCase {
|
|||
IndexSearcher s = new IndexSearcher(r);
|
||||
Collections.shuffle(terms);
|
||||
TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
|
||||
boolean failed = false;
|
||||
for(int iter=0;iter<10*terms.size();iter++) {
|
||||
final BytesRef term = terms.get(random.nextInt(terms.size()));
|
||||
System.out.println("TEST: search " + term);
|
||||
final long t0 = System.currentTimeMillis();
|
||||
assertTrue(s.search(new TermQuery(new Term("field", term)), 1).totalHits > 0);
|
||||
final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
|
||||
if (count <= 0) {
|
||||
System.out.println(" FAILED: count=" + count);
|
||||
failed = true;
|
||||
}
|
||||
final long t1 = System.currentTimeMillis();
|
||||
System.out.println(" took " + (t1-t0) + " millis");
|
||||
|
||||
assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seek(term));
|
||||
TermsEnum.SeekStatus result = termsEnum.seek(term);
|
||||
if (result != TermsEnum.SeekStatus.FOUND) {
|
||||
if (result == TermsEnum.SeekStatus.END) {
|
||||
System.out.println(" FAILED: got END");
|
||||
} else {
|
||||
System.out.println(" FAILED: wrong term: got " + termsEnum.term());
|
||||
}
|
||||
failed = true;
|
||||
}
|
||||
}
|
||||
assertFalse(failed);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -536,6 +536,7 @@ public class TestIndexWriterDelete extends LuceneTestCase {
            fail(testName + " hit IOException after disk space was freed up");
          }
        }
        // prevent throwing a random exception here!!
        final double randomIOExceptionRate = dir.getRandomIOExceptionRate();
        final long maxSizeInBytes = dir.getMaxSizeInBytes();
        dir.setRandomIOExceptionRate(0.0);

@@ -119,6 +119,7 @@ public class TestOmitTf extends LuceneTestCase {
        setMaxBufferedDocs(3).
        setMergePolicy(newLogMergePolicy(2))
    );
    writer.setInfoStream(VERBOSE ? System.out : null);
    Document d = new Document();

    // this field will have Tf

@@ -0,0 +1,129 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Latvian.
|
||||
*/
|
||||
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
|
||||
/** File containing default Latvian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||
* provided this analyzer will add a {@link KeywordMarkerFilter} before
|
||||
* stemming.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A
|
||||
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from an {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||
* , {@link KeywordMarkerFilter} if a stem exclusion set is
|
||||
* provided and {@link LatvianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new LatvianStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
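
As an illustrative sketch only (not part of the patch; the field name, sample text, and the TEST_VERSION_CURRENT constant assume a LuceneTestCase-style context), this is how the new analyzer is typically consumed. The expected output follows the tests added elsewhere in this commit: "tirgiem" and "tirgus" both stem to "tirg", and "un" is removed as a stop word.

    Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT);
    TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("tirgiem un tirgus"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints "tirg" twice
    }
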
@@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
*/
|
||||
public final class LatvianStemFilter extends TokenFilter {
|
||||
private final LatvianStemmer stemmer = new LatvianStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public LatvianStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,174 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Light stemmer for Latvian.
|
||||
* <p>
|
||||
* This is a light version of the algorithm in Karlis Kreslin's PhD thesis
|
||||
* <i>A stemming algorithm for Latvian</i> with the following modifications:
|
||||
* <ul>
|
||||
* <li>Only explicitly stems noun and adjective morphology
|
||||
* <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
|
||||
* <li>Removes only the primary inflectional suffixes: case and number for nouns ;
|
||||
* case, number, gender, and definitiveness for adjectives.
|
||||
* <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
|
||||
* </ul>
|
||||
*/
|
||||
public class LatvianStemmer {
|
||||
/**
|
||||
* Stem a latvian word. returns the new adjusted length.
|
||||
*/
|
||||
public int stem(char s[], int len) {
|
||||
int numVowels = numVowels(s, len);
|
||||
|
||||
for (int i = 0; i < affixes.length; i++) {
|
||||
Affix affix = affixes[i];
|
||||
if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
|
||||
len -= affix.affix.length;
|
||||
return affix.palatalizes ? unpalatalize(s, len) : len;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
static final Affix affixes[] = {
|
||||
new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
|
||||
new Affix("ajam", 2, false), new Affix("ajām", 2, false),
|
||||
new Affix("ajos", 2, false), new Affix("ajās", 2, false),
|
||||
new Affix("iem", 2, true), new Affix("ajā", 2, false),
|
||||
new Affix("ais", 2, false), new Affix("ai", 2, false),
|
||||
new Affix("ei", 2, false), new Affix("ām", 1, false),
|
||||
new Affix("am", 1, false), new Affix("ēm", 1, false),
|
||||
new Affix("īm", 1, false), new Affix("im", 1, false),
|
||||
new Affix("um", 1, false), new Affix("us", 1, true),
|
||||
new Affix("as", 1, false), new Affix("ās", 1, false),
|
||||
new Affix("es", 1, false), new Affix("os", 1, true),
|
||||
new Affix("ij", 1, false), new Affix("īs", 1, false),
|
||||
new Affix("ēs", 1, false), new Affix("is", 1, false),
|
||||
new Affix("ie", 1, false), new Affix("u", 1, true),
|
||||
new Affix("a", 1, true), new Affix("i", 1, true),
|
||||
new Affix("e", 1, false), new Affix("ā", 1, false),
|
||||
new Affix("ē", 1, false), new Affix("ī", 1, false),
|
||||
new Affix("ū", 1, false), new Affix("o", 1, false),
|
||||
new Affix("s", 0, false), new Affix("š", 0, false),
|
||||
};
|
||||
|
||||
static class Affix {
|
||||
char affix[]; // suffix
|
||||
int vc; // vowel count of the suffix
|
||||
boolean palatalizes; // true if we should fire palatalization rules.
|
||||
|
||||
Affix(String affix, int vc, boolean palatalizes) {
|
||||
this.affix = affix.toCharArray();
|
||||
this.vc = vc;
|
||||
this.palatalizes = palatalizes;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Most cases are handled except for the ambiguous ones:
|
||||
* <ul>
|
||||
* <li> s -> š
|
||||
* <li> t -> š
|
||||
* <li> d -> ž
|
||||
* <li> z -> ž
|
||||
* </ul>
|
||||
*/
|
||||
private int unpalatalize(char s[], int len) {
|
||||
// we check the character removed: if its -u then
|
||||
// its 2,5, or 6 gen pl., and these two can only apply then.
|
||||
if (s[len] == 'u') {
|
||||
// kš -> kst
|
||||
if (endsWith(s, len, "kš")) {
|
||||
len++;
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 't';
|
||||
return len;
|
||||
}
|
||||
// ņņ -> nn
|
||||
if (endsWith(s, len, "ņņ")) {
|
||||
s[len-2] = 'n';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise all other rules
|
||||
if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
|
||||
|| endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
|
||||
// labial consonant
|
||||
return len-1;
|
||||
} else if (endsWith(s, len, "šņ")) {
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "žņ")) {
|
||||
s[len-2] = 'z';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "šļ")) {
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "žļ")) {
|
||||
s[len-2] = 'z';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "ļņ")) {
|
||||
s[len-2] = 'l';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "ļļ")) {
|
||||
s[len-2] = 'l';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (s[len-1] == 'č') {
|
||||
s[len-1] = 'c';
|
||||
return len;
|
||||
} else if (s[len-1] == 'ļ') {
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (s[len-1] == 'ņ') {
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the vowels in the string, we always require at least
|
||||
* one in the remaining stem to accept it.
|
||||
*/
|
||||
private int numVowels(char s[], int len) {
|
||||
int n = 0;
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch(s[i]) {
|
||||
case 'a': case 'e': case 'i':
|
||||
case 'o': case 'u': case 'ā':
|
||||
case 'ī': case 'ē': case 'ū':
|
||||
n++;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
}
|
|
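
For completeness, a hypothetical direct use of the stemmer (the input word is taken from the tests added in this commit; the driver code itself is not part of the patch). The stemmer works in place on a char[] and returns the new length, which is exactly how LatvianStemFilter calls it.

    LatvianStemmer stemmer = new LatvianStemmer();
    char[] buffer = "tirgiem".toCharArray();
    int newLen = stemmer.stem(buffer, buffer.length);
    String stem = new String(buffer, 0, newLen); // "tirg", matching the analyzer tests
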
@@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Latvian.
|
||||
</body>
|
||||
</html>
|
|
@@ -0,0 +1,172 @@
|
|||
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
|
||||
# the original list of over 800 forms was refined:
|
||||
# pronouns, adverbs, interjections were removed
|
||||
#
|
||||
# prepositions
|
||||
aiz
|
||||
ap
|
||||
ar
|
||||
apakš
|
||||
ārpus
|
||||
augšpus
|
||||
bez
|
||||
caur
|
||||
dēļ
|
||||
gar
|
||||
iekš
|
||||
iz
|
||||
kopš
|
||||
labad
|
||||
lejpus
|
||||
līdz
|
||||
no
|
||||
otrpus
|
||||
pa
|
||||
par
|
||||
pār
|
||||
pēc
|
||||
pie
|
||||
pirms
|
||||
pret
|
||||
priekš
|
||||
starp
|
||||
šaipus
|
||||
uz
|
||||
viņpus
|
||||
virs
|
||||
virspus
|
||||
zem
|
||||
apakšpus
|
||||
# Conjunctions
|
||||
un
|
||||
bet
|
||||
jo
|
||||
ja
|
||||
ka
|
||||
lai
|
||||
tomēr
|
||||
tikko
|
||||
turpretī
|
||||
arī
|
||||
kaut
|
||||
gan
|
||||
tādēļ
|
||||
tā
|
||||
ne
|
||||
tikvien
|
||||
vien
|
||||
kā
|
||||
ir
|
||||
te
|
||||
vai
|
||||
kamēr
|
||||
# Particles
|
||||
ar
|
||||
diezin
|
||||
droši
|
||||
diemžēl
|
||||
nebūt
|
||||
ik
|
||||
it
|
||||
taču
|
||||
nu
|
||||
pat
|
||||
tiklab
|
||||
iekšpus
|
||||
nedz
|
||||
tik
|
||||
nevis
|
||||
turpretim
|
||||
jeb
|
||||
iekam
|
||||
iekām
|
||||
iekāms
|
||||
kolīdz
|
||||
līdzko
|
||||
tiklīdz
|
||||
jebšu
|
||||
tālab
|
||||
tāpēc
|
||||
nekā
|
||||
itin
|
||||
jā
|
||||
jau
|
||||
jel
|
||||
nē
|
||||
nezin
|
||||
tad
|
||||
tikai
|
||||
vis
|
||||
tak
|
||||
iekams
|
||||
vien
|
||||
# modal verbs
|
||||
būt
|
||||
biju
|
||||
biji
|
||||
bija
|
||||
bijām
|
||||
bijāt
|
||||
esmu
|
||||
esi
|
||||
esam
|
||||
esat
|
||||
būšu
|
||||
būsi
|
||||
būs
|
||||
būsim
|
||||
būsiet
|
||||
tikt
|
||||
tiku
|
||||
tiki
|
||||
tika
|
||||
tikām
|
||||
tikāt
|
||||
tieku
|
||||
tiec
|
||||
tiek
|
||||
tiekam
|
||||
tiekat
|
||||
tikšu
|
||||
tiks
|
||||
tiksim
|
||||
tiksiet
|
||||
tapt
|
||||
tapi
|
||||
tapāt
|
||||
topat
|
||||
tapšu
|
||||
tapsi
|
||||
taps
|
||||
tapsim
|
||||
tapsiet
|
||||
kļūt
|
||||
kļuvu
|
||||
kļuvi
|
||||
kļuva
|
||||
kļuvām
|
||||
kļuvāt
|
||||
kļūstu
|
||||
kļūsti
|
||||
kļūst
|
||||
kļūstam
|
||||
kļūstat
|
||||
kļūšu
|
||||
kļūsi
|
||||
kļūs
|
||||
kļūsim
|
||||
kļūsiet
|
||||
# verbs
|
||||
varēt
|
||||
varēju
|
||||
varējām
|
||||
varēšu
|
||||
varēsim
|
||||
var
|
||||
varēji
|
||||
varējāt
|
||||
varēsi
|
||||
varēsiet
|
||||
varat
|
||||
varēja
|
||||
varēs
|
|
@@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||
}
|
||||
|
||||
/** test stopwords and stemming */
|
||||
public void testBasics() throws IOException {
|
||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||
// stemming
|
||||
checkOneTermReuse(a, "tirgiem", "tirg");
|
||||
checkOneTermReuse(a, "tirgus", "tirg");
|
||||
// stopword
|
||||
assertAnalyzesTo(a, "un", new String[] {});
|
||||
}
|
||||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("tirgiem");
|
||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
||||
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
||||
checkOneTermReuse(a, "tirgus", "tirg");
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,272 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
|
||||
/**
|
||||
* Basic tests for {@link LatvianStemmer}
|
||||
*/
|
||||
public class TestLatvianStemmer extends BaseTokenStreamTestCase {
|
||||
private Analyzer a = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
|
||||
public void testNouns1() throws IOException {
|
||||
// decl. I
|
||||
checkOneTerm(a, "tēvs", "tēv"); // nom. sing.
|
||||
checkOneTerm(a, "tēvi", "tēv"); // nom. pl.
|
||||
checkOneTerm(a, "tēva", "tēv"); // gen. sing.
|
||||
checkOneTerm(a, "tēvu", "tēv"); // gen. pl.
|
||||
checkOneTerm(a, "tēvam", "tēv"); // dat. sing.
|
||||
checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
|
||||
checkOneTerm(a, "tēvu", "tēv"); // acc. sing.
|
||||
checkOneTerm(a, "tēvus", "tēv"); // acc. pl.
|
||||
checkOneTerm(a, "tēvā", "tēv"); // loc. sing.
|
||||
checkOneTerm(a, "tēvos", "tēv"); // loc. pl.
|
||||
checkOneTerm(a, "tēvs", "tēv"); // voc. sing.
|
||||
checkOneTerm(a, "tēvi", "tēv"); // voc. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* decl II nouns with (s,t) -> š and (d,z) -> ž
|
||||
* palatalization will generally conflate to two stems
|
||||
* due to the ambiguity (plural and singular).
|
||||
*/
|
||||
public void testNouns2() throws IOException {
|
||||
// decl. II
|
||||
|
||||
// c -> č palatalization
|
||||
checkOneTerm(a, "lācis", "lāc"); // nom. sing.
|
||||
checkOneTerm(a, "lāči", "lāc"); // nom. pl.
|
||||
checkOneTerm(a, "lāča", "lāc"); // gen. sing.
|
||||
checkOneTerm(a, "lāču", "lāc"); // gen. pl.
|
||||
checkOneTerm(a, "lācim", "lāc"); // dat. sing.
|
||||
checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
|
||||
checkOneTerm(a, "lāci", "lāc"); // acc. sing.
|
||||
checkOneTerm(a, "lāčus", "lāc"); // acc. pl.
|
||||
checkOneTerm(a, "lācī", "lāc"); // loc. sing.
|
||||
checkOneTerm(a, "lāčos", "lāc"); // loc. pl.
|
||||
checkOneTerm(a, "lāci", "lāc"); // voc. sing.
|
||||
checkOneTerm(a, "lāči", "lāc"); // voc. pl.
|
||||
|
||||
// n -> ņ palatalization
|
||||
checkOneTerm(a, "akmens", "akmen"); // nom. sing.
|
||||
checkOneTerm(a, "akmeņi", "akmen"); // nom. pl.
|
||||
checkOneTerm(a, "akmens", "akmen"); // gen. sing.
|
||||
checkOneTerm(a, "akmeņu", "akmen"); // gen. pl.
|
||||
checkOneTerm(a, "akmenim", "akmen"); // dat. sing.
|
||||
checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
|
||||
checkOneTerm(a, "akmeni", "akmen"); // acc. sing.
|
||||
checkOneTerm(a, "akmeņus", "akmen"); // acc. pl.
|
||||
checkOneTerm(a, "akmenī", "akmen"); // loc. sing.
|
||||
checkOneTerm(a, "akmeņos", "akmen"); // loc. pl.
|
||||
checkOneTerm(a, "akmens", "akmen"); // voc. sing.
|
||||
checkOneTerm(a, "akmeņi", "akmen"); // voc. pl.
|
||||
|
||||
// no palatalization
|
||||
checkOneTerm(a, "kurmis", "kurm"); // nom. sing.
|
||||
checkOneTerm(a, "kurmji", "kurm"); // nom. pl.
|
||||
checkOneTerm(a, "kurmja", "kurm"); // gen. sing.
|
||||
checkOneTerm(a, "kurmju", "kurm"); // gen. pl.
|
||||
checkOneTerm(a, "kurmim", "kurm"); // dat. sing.
|
||||
checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
|
||||
checkOneTerm(a, "kurmi", "kurm"); // acc. sing.
|
||||
checkOneTerm(a, "kurmjus", "kurm"); // acc. pl.
|
||||
checkOneTerm(a, "kurmī", "kurm"); // loc. sing.
|
||||
checkOneTerm(a, "kurmjos", "kurm"); // loc. pl.
|
||||
checkOneTerm(a, "kurmi", "kurm"); // voc. sing.
|
||||
checkOneTerm(a, "kurmji", "kurm"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns3() throws IOException {
|
||||
// decl III
|
||||
checkOneTerm(a, "lietus", "liet"); // nom. sing.
|
||||
checkOneTerm(a, "lieti", "liet"); // nom. pl.
|
||||
checkOneTerm(a, "lietus", "liet"); // gen. sing.
|
||||
checkOneTerm(a, "lietu", "liet"); // gen. pl.
|
||||
checkOneTerm(a, "lietum", "liet"); // dat. sing.
|
||||
checkOneTerm(a, "lietiem", "liet"); // dat. pl.
|
||||
checkOneTerm(a, "lietu", "liet"); // acc. sing.
|
||||
checkOneTerm(a, "lietus", "liet"); // acc. pl.
|
||||
checkOneTerm(a, "lietū", "liet"); // loc. sing.
|
||||
checkOneTerm(a, "lietos", "liet"); // loc. pl.
|
||||
checkOneTerm(a, "lietus", "liet"); // voc. sing.
|
||||
checkOneTerm(a, "lieti", "liet"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns4() throws IOException {
|
||||
// decl IV
|
||||
checkOneTerm(a, "lapa", "lap"); // nom. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // nom. pl.
|
||||
checkOneTerm(a, "lapas", "lap"); // gen. sing.
|
||||
checkOneTerm(a, "lapu", "lap"); // gen. pl.
|
||||
checkOneTerm(a, "lapai", "lap"); // dat. sing.
|
||||
checkOneTerm(a, "lapām", "lap"); // dat. pl.
|
||||
checkOneTerm(a, "lapu", "lap"); // acc. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // acc. pl.
|
||||
checkOneTerm(a, "lapā", "lap"); // loc. sing.
|
||||
checkOneTerm(a, "lapās", "lap"); // loc. pl.
|
||||
checkOneTerm(a, "lapa", "lap"); // voc. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // voc. pl.
|
||||
|
||||
checkOneTerm(a, "puika", "puik"); // nom. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // nom. pl.
|
||||
checkOneTerm(a, "puikas", "puik"); // gen. sing.
|
||||
checkOneTerm(a, "puiku", "puik"); // gen. pl.
|
||||
checkOneTerm(a, "puikam", "puik"); // dat. sing.
|
||||
checkOneTerm(a, "puikām", "puik"); // dat. pl.
|
||||
checkOneTerm(a, "puiku", "puik"); // acc. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // acc. pl.
|
||||
checkOneTerm(a, "puikā", "puik"); // loc. sing.
|
||||
checkOneTerm(a, "puikās", "puik"); // loc. pl.
|
||||
checkOneTerm(a, "puika", "puik"); // voc. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // voc. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Genitive plural forms with (s,t) -> š and (d,z) -> ž
|
||||
* will not conflate due to ambiguity.
|
||||
*/
|
||||
public void testNouns5() throws IOException {
|
||||
// decl V
|
||||
// l -> ļ palatalization
|
||||
checkOneTerm(a, "egle", "egl"); // nom. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // nom. pl.
|
||||
checkOneTerm(a, "egles", "egl"); // gen. sing.
|
||||
checkOneTerm(a, "egļu", "egl"); // gen. pl.
|
||||
checkOneTerm(a, "eglei", "egl"); // dat. sing.
|
||||
checkOneTerm(a, "eglēm", "egl"); // dat. pl.
|
||||
checkOneTerm(a, "egli", "egl"); // acc. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // acc. pl.
|
||||
checkOneTerm(a, "eglē", "egl"); // loc. sing.
|
||||
checkOneTerm(a, "eglēs", "egl"); // loc. pl.
|
||||
checkOneTerm(a, "egle", "egl"); // voc. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns6() throws IOException {
|
||||
// decl VI
|
||||
|
||||
// no palatalization
|
||||
checkOneTerm(a, "govs", "gov"); // nom. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // nom. pl.
|
||||
checkOneTerm(a, "govs", "gov"); // gen. sing.
|
||||
checkOneTerm(a, "govju", "gov"); // gen. pl.
|
||||
checkOneTerm(a, "govij", "gov"); // dat. sing.
|
||||
checkOneTerm(a, "govīm", "gov"); // dat. pl.
|
||||
checkOneTerm(a, "govi ", "gov"); // acc. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // acc. pl.
|
||||
checkOneTerm(a, "govi ", "gov"); // inst. sing.
|
||||
checkOneTerm(a, "govīm", "gov"); // inst. pl.
|
||||
checkOneTerm(a, "govī", "gov"); // loc. sing.
|
||||
checkOneTerm(a, "govīs", "gov"); // loc. pl.
|
||||
checkOneTerm(a, "govs", "gov"); // voc. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testAdjectives() throws IOException {
|
||||
checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing.
|
||||
checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing.
|
||||
checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl.
|
||||
checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl.
|
||||
checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing.
|
||||
checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing.
|
||||
checkOneTerm(a, "zilas", "zil"); // indef. nom. fem. pl.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl.
|
||||
checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing.
|
||||
checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl.
|
||||
checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. gen. fem. pl.
|
||||
checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing.
|
||||
checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing.
|
||||
checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl.
|
||||
checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
|
||||
checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing.
|
||||
checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing.
|
||||
checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl.
|
||||
checkOneTerm(a, "zilajām", "zil"); // def. dat. fem. pl.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing.
|
||||
checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl.
|
||||
checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // indef. acc. fem. pl.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing.
|
||||
checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing.
|
||||
checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl.
|
||||
checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing.
|
||||
checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl.
|
||||
checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl.
|
||||
checkOneTerm(a, "zilais", "zil"); // voc. masc. sing.
|
||||
checkOneTerm(a, "zilie", "zil"); // voc. masc. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // voc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // voc. fem. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: we intentionally don't handle the ambiguous
|
||||
* (s,t) -> š and (d,z) -> ž
|
||||
*/
|
||||
public void testPalatalization() throws IOException {
|
||||
checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
|
||||
checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
|
||||
checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
|
||||
checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
|
||||
checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
|
||||
checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl.
|
||||
checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
|
||||
checkOneTerm(a, "zižļu", "zizl"); // gen. pl.
|
||||
checkOneTerm(a, "vilnis", "viln"); // nom. sing.
|
||||
checkOneTerm(a, "viļņu", "viln"); // gen. pl.
|
||||
checkOneTerm(a, "lelle", "lell"); // nom. sing.
|
||||
checkOneTerm(a, "leļļu", "lell"); // gen. pl.
|
||||
checkOneTerm(a, "pinne", "pinn"); // nom. sing.
|
||||
checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
|
||||
checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
|
||||
checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some length restrictions, we require a 3+ char stem,
|
||||
* with at least one vowel.
|
||||
*/
|
||||
public void testLength() throws IOException {
|
||||
checkOneTerm(a, "usa", "usa"); // length
|
||||
checkOneTerm(a, "60ms", "60ms"); // vowel count
|
||||
}
|
||||
}
|
|
@@ -75,7 +75,7 @@ class SegGraph {
    List<SegToken> result = new ArrayList<SegToken>();
    int s = -1, count = 0, size = tokenListTable.size();
    List<SegToken> tokenList;
    short index = 0;
    int index = 0;
    while (count < size) {
      if (isStartExist(s)) {
        tokenList = tokenListTable.get(s);

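The type change above is the heart of the fix that the CHANGES entry for LUCENE-3026 ties to sentences longer than 32,767 characters: a short index wraps negative once it passes Short.MAX_VALUE. A tiny standalone illustration of the overflow (not taken from the patch):

    short shortIndex = Short.MAX_VALUE; // 32,767 -- the point at which the old short index broke
    shortIndex++;                       // wraps to -32,768
    int intIndex = Short.MAX_VALUE;
    intIndex++;                         // 32,768, as expected with the new int index
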
@@ -17,8 +17,11 @@

package org.apache.lucene.analysis.cn.smart;

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {

@@ -166,4 +169,30 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
        new int[] { 0, 1, 3, 4, 6, 7 },
        new int[] { 1, 3, 4, 6, 7, 9 });
  }

  // LUCENE-3026
  public void testLargeDocument() throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 5000; i++) {
      sb.append("我购买了道具和服装。");
    }
    Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
    TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
    stream.reset();
    while (stream.incrementToken()) {
    }
  }

  // LUCENE-3026
  public void testLargeSentence() throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 5000; i++) {
      sb.append("我购买了道具和服装");
    }
    Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
    TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
    stream.reset();
    while (stream.incrementToken()) {
    }
  }
}

@@ -60,6 +60,11 @@ Detailed Change List

New Features
----------------------

* SOLR-2378: A new, automaton-based, implementation of suggest (autocomplete)
  component, offering an order of magnitude smaller memory consumption
  compared to ternary trees and jaspell and very fast lookups at runtime.
  (Dawid Weiss)

* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
  supports "percentages" which get evaluated relative the current size of

@@ -75,7 +80,7 @@ New Features

* SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
  (Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
  Koji Sekiguchi, Iv<EFBFBD>n de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
  Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
  Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
  Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger,
  Harish Agarwal, yonik)

@@ -110,7 +115,7 @@ New Features

* SOLR-1566: Transforming documents in the ResponseWriters. This will allow
  for more complex results in responses and open the door for function queries
  as results. (ryan with patches from grant, noble, cmale, yonik)
  as results. (ryan with patches from grant, noble, cmale, yonik, Jan Høydahl)

* SOLR-2417: Add explain info directly to return documents using ?fl=_explain_ (ryan)

@@ -0,0 +1,38 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.lv.LatvianStemFilter;
|
||||
|
||||
/**
|
||||
* Factory for {@link LatvianStemFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.LatvianStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new LatvianStemFilter(input);
|
||||
}
|
||||
}
|
|
@@ -162,7 +162,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar

      } else {
        throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
            "Specified dictionary does not exist.");
            "Specified dictionary does not exist: " + getDictionaryName(params));
      }
    }
  }

@@ -19,8 +19,10 @@ package org.apache.solr.response;

import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;

public class PageTool {
  private long start;

@@ -42,10 +44,16 @@ public class PageTool {
      DocSlice doc_slice = (DocSlice) docs;
      results_found = doc_slice.matches();
      start = doc_slice.offset();
    } else {
    } else if(docs instanceof ResultContext) {
      DocList dl = ((ResultContext) docs).docs;
      results_found = dl.matches();
      start = dl.offset();
    } else if(docs instanceof SolrDocumentList) {
      SolrDocumentList doc_list = (SolrDocumentList) docs;
      results_found = doc_list.getNumFound();
      start = doc_list.getStart();
    } else {
      throw new SolrException(SolrException.ErrorCode.UNKNOWN, "Unknown response type "+docs+". Expected one of DocSlice, ResultContext or SolrDocumentList");
    }
  }

@@ -12,7 +12,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.util.TermFreqIterator;

public abstract class Lookup {

  /**
   * Result of a lookup.
   */

@@ -0,0 +1,556 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.automaton.fst.Builder;
|
||||
import org.apache.lucene.util.automaton.fst.FST;
|
||||
import org.apache.lucene.util.automaton.fst.FST.Arc;
|
||||
import org.apache.lucene.util.automaton.fst.NoOutputs;
|
||||
import org.apache.lucene.util.automaton.fst.Outputs;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.spelling.suggest.Lookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.io.Closeables;
|
||||
|
||||
/**
|
||||
* Finite state automata based implementation of {@link Lookup} query
|
||||
* suggestion/ autocomplete interface.
|
||||
*
|
||||
* <h2>Implementation details</h2>
|
||||
*
|
||||
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
|
||||
* <ul>
|
||||
* <li>A set of input terms (String) and weights (float) is given.</li>
|
||||
* <li>The range of weights is determined and then all weights are discretized into a fixed set
|
||||
* of values ({@link #buckets}).
|
||||
* Note that this means that minor changes in weights may be lost during automaton construction.
|
||||
* In general, this is not a big problem because the "priorities" of completions can be split
|
||||
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
|
||||
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li>
|
||||
* <li>All terms in the input are preprended with a synthetic pseudo-character being the weight
|
||||
* of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
|
||||
* become <code>1abc</code>.</li>
|
||||
* <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
|
||||
* term in front).</li>
|
||||
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
|
||||
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
|
||||
* <ul>
|
||||
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
|
||||
* starting with the highest one, we descend along the path of the input key. If the key is not
|
||||
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
|
||||
* No completions.
|
||||
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
|
||||
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
|
||||
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
|
||||
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
|
||||
* root path we started from). The tricky part is that because automaton edges are sorted and
|
||||
* we scan depth-first, we can terminate the entire procedure as soon as we collect enough
|
||||
* suggestions the user requested.
|
||||
* <li>In case the number of suggestions collected in the step above is still insufficient,
|
||||
* we proceed to the next (smaller) weight leaving the root node and repeat the same
|
||||
* algorithm again.
|
||||
* </li>
|
||||
* </ul>
|
||||
*
|
||||
* <h2>Runtime behavior and performance characteristic</h2>
|
||||
*
|
||||
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
|
||||
* in a top-weights-first order. This is probably the most common use case: it allows
|
||||
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
|
||||
*
|
||||
* <p>If there is an exact match in the automaton, it is returned first on the results
|
||||
* list (even with by-weight sorting).
|
||||
*
|
||||
* <p>Note that the maximum lookup time for <b>any prefix</b>
|
||||
* is the time of descending to the subtree, plus traversal of the subtree up to the number
|
||||
* of requested suggestions (because they are already presorted by weight on the root level
|
||||
* and alphabetically at any node level).
|
||||
*
|
||||
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
|
||||
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
|
||||
* used, but the algorithm for doing this is suboptimal.
|
||||
*
|
||||
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
|
||||
* nothing else.
|
||||
*/
|
||||
public class FSTLookup extends Lookup {
|
||||
/** A structure for a single entry (for sorting/ preprocessing). */
|
||||
private static class Entry {
|
||||
char [] term;
|
||||
float weight;
|
||||
|
||||
public Entry(char [] term, float freq) {
|
||||
this.term = term;
|
||||
this.weight = freq;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The number of separate buckets for weights (discretization). The more buckets,
|
||||
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
|
||||
* will not decrease for prefixes which have highly-weighted completions (because these
|
||||
* are filled-in first), but will decrease significantly for low-weighted terms (but
|
||||
* these should be infrequent, so it is all right).
|
||||
*
|
||||
* <p>The number of buckets must be within [1, 255] range.
|
||||
*/
|
||||
public static final String WEIGHT_BUCKETS = "weightBuckets";
|
||||
|
||||
/**
|
||||
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
|
||||
* of other strings in the automaton (possibly with larger weights).
|
||||
*/
|
||||
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
|
||||
|
||||
/** Serialized automaton file name (storage). */
|
||||
public static final String FILENAME = "fst.dat";
|
||||
|
||||
/** An empty result. */
|
||||
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
|
||||
|
||||
/**
|
||||
* @see #WEIGHT_BUCKETS
|
||||
*/
|
||||
private int buckets = 10;
|
||||
|
||||
/**
|
||||
* @see #EXACT_MATCH_FIRST
|
||||
*/
|
||||
private boolean exactMatchFirst = true;
|
||||
|
||||
/**
|
||||
* Finite state automaton encoding all the lookup terms. See class
|
||||
* notes for details.
|
||||
*/
|
||||
private FST<Object> automaton;
|
||||
|
||||
/**
|
||||
* An array of arcs leaving the root automaton state and encoding weights of all
|
||||
* completions in their sub-trees.
|
||||
*/
|
||||
private Arc<Object> [] rootArcs;
|
||||
|
||||
/* */
|
||||
@Override
|
||||
@SuppressWarnings("rawtypes")
|
||||
public void init(NamedList config, SolrCore core) {
|
||||
this.buckets = config.get(WEIGHT_BUCKETS) != null
|
||||
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
|
||||
: 10;
|
||||
|
||||
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
|
||||
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
|
||||
: true;
|
||||
}
|
||||
|
||||
/* */
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
// Buffer the input because we will need it twice: for calculating
|
||||
// the weight distribution and for the actual automaton construction.
|
||||
List<Entry> entries = Lists.newArrayList();
|
||||
while (tfit.hasNext()) {
|
||||
String term = tfit.next();
|
||||
char [] termChars = new char [term.length() + 1]; // add padding for weight.
|
||||
for (int i = 0; i < term.length(); i++)
|
||||
termChars[i + 1] = term.charAt(i);
|
||||
entries.add(new Entry(termChars, tfit.freq()));
|
||||
}
|
||||
|
||||
// Distribute weights into at most N buckets. This is a form of discretization to
|
||||
// limit the number of possible weights so that they can be efficiently encoded in the
|
||||
// automaton.
|
||||
//
|
||||
// It is assumed the distribution of weights is _linear_ so proportional division
|
||||
// of [min, max] range will be enough here. Other approaches could be to sort
|
||||
// weights and divide into proportional ranges.
|
||||
if (entries.size() > 0) {
|
||||
redistributeWeightsProportionalMinMax(entries, buckets);
|
||||
encodeWeightPrefix(entries);
|
||||
}
|
||||
|
||||
// Build the automaton (includes input sorting) and cache root arcs in order from the highest,
|
||||
// to the lowest weight.
|
||||
this.automaton = buildAutomaton(entries);
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
/**
|
||||
* Cache the root node's output arcs starting with completions with the highest weights.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void cacheRootArcs() throws IOException {
|
||||
if (automaton != null) {
|
||||
List<Arc<Object>> rootArcs = Lists.newArrayList();
|
||||
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
while (true) {
|
||||
rootArcs.add(new Arc<Object>().copyFrom(arc));
|
||||
if (arc.isLast())
|
||||
break;
|
||||
automaton.readNextArc(arc);
|
||||
}
|
||||
|
||||
Collections.reverse(rootArcs); // root arcs are sorted by ascending weight label, so reverse to get highest weights first.
|
||||
this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Not implemented.
|
||||
*/
|
||||
@Override
|
||||
public boolean add(String key, Object value) {
|
||||
// This implementation does not support ad-hoc additions (all input
|
||||
// must be sorted for the builder).
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the (approximated) weight of a single key (if there is a perfect match
|
||||
* for it in the automaton).
|
||||
*
|
||||
* @return Returns the approximated weight of the input key or <code>null</code>
|
||||
* if not found.
|
||||
*/
|
||||
@Override
|
||||
public Float get(String key) {
|
||||
return getExactMatchStartingFromRootArc(0, key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the first exact match by traversing root arcs, starting from
|
||||
* the arc <code>i</code>.
|
||||
*
|
||||
* @param i The first root arc index in {@link #rootArcs} to consider when
|
||||
* matching.
|
||||
*/
|
||||
private Float getExactMatchStartingFromRootArc(int i, String key) {
|
||||
// Try each root (weight) arc in turn, descending with the key and checking for an exact match.
|
||||
try {
|
||||
final FST.Arc<Object> scratch = new FST.Arc<Object>();
|
||||
for (; i < rootArcs.length; i++) {
|
||||
final FST.Arc<Object> rootArc = rootArcs[i];
|
||||
final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
|
||||
|
||||
// Descend into the automaton using the key as prefix.
|
||||
if (descendWithPrefix(arc, key)) {
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
if (arc.label == FST.END_LABEL) {
|
||||
// Prefix-encoded weight.
|
||||
return rootArc.label / (float) buckets;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Should never happen, but anyway.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup autocomplete suggestions to <code>key</code>.
|
||||
*
|
||||
* @param key The prefix to which suggestions should be sought.
|
||||
* @param onlyMorePopular Return most popular suggestions first. This is the default
|
||||
* behavior for this implementation. Setting it to <code>false</code> falls back to a
|
||||
* suboptimal alphabetical ordering (prefer constant term weights if you only need alphabetical order).
|
||||
* @param num At most this number of suggestions will be returned.
|
||||
* @return Returns the suggestions, sorted by their approximated weight first (decreasing)
|
||||
* and then alphabetically (utf16 codepoint order).
|
||||
*/
|
||||
@Override
|
||||
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
|
||||
if (key.length() == 0 || automaton == null) {
|
||||
// Keep the result an ArrayList to keep calls monomorphic.
|
||||
return EMPTY_RESULT;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!onlyMorePopular && rootArcs.length > 1) {
|
||||
// We could emit a warning here (?). An optimal strategy for alphabetically sorted
|
||||
// suggestions would be to add them with a constant weight -- this saves unnecessary
|
||||
// traversals and sorting.
|
||||
return lookupSortedAlphabetically(key, num);
|
||||
} else {
|
||||
return lookupSortedByWeight(key, num, true);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Should never happen, but anyway.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
|
||||
* is a workaround: in general, use constant weights for alphabetically sorted results.
|
||||
*/
|
||||
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
|
||||
// Greedily get num results from each weight branch.
|
||||
List<LookupResult> res = lookupSortedByWeight(key, num, false);
|
||||
|
||||
// Sort and trim.
|
||||
Collections.sort(res, new Comparator<LookupResult>() {
|
||||
@Override
|
||||
public int compare(LookupResult o1, LookupResult o2) {
|
||||
return o1.key.compareTo(o2.key);
|
||||
}
|
||||
});
|
||||
if (res.size() > num) {
|
||||
res = res.subList(0, num);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup suggestions sorted by weight (descending order).
|
||||
*
|
||||
* @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
|
||||
* suggestions have been collected. If <code>false</code>, it will collect suggestions from
|
||||
* all weight arcs (needed for {@link #lookupSortedAlphabetically}).
|
||||
*/
|
||||
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
|
||||
final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
|
||||
final StringBuilder output = new StringBuilder(key);
|
||||
final int matchLength = key.length() - 1;
|
||||
|
||||
for (int i = 0; i < rootArcs.length; i++) {
|
||||
final FST.Arc<Object> rootArc = rootArcs[i];
|
||||
final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
|
||||
|
||||
// Descend into the automaton using the key as prefix.
|
||||
if (descendWithPrefix(arc, key)) {
|
||||
// Prefix-encoded weight.
|
||||
final float weight = rootArc.label / (float) buckets;
|
||||
|
||||
// A subgraph starting from the current node has the completions
|
||||
// of the key prefix. The arc we're at is the last character of the key,
|
||||
// so we will collect it too.
|
||||
output.setLength(matchLength);
|
||||
if (collect(res, num, weight, output, arc) && greedy) {
|
||||
// We have enough suggestions to return immediately. Keep on looking for an
|
||||
// exact match, if requested.
|
||||
if (exactMatchFirst) {
|
||||
Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
|
||||
if (exactMatchWeight != null) {
|
||||
res.add(0, new LookupResult(key, exactMatchWeight));
|
||||
while (res.size() > num) {
|
||||
res.remove(res.size() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Descend along the path starting at <code>arc</code> and going through
|
||||
* the characters of the <code>term</code> argument.
|
||||
*
|
||||
* @param arc The starting arc. This argument is modified in-place.
|
||||
* @param term The term to descend with.
|
||||
* @return If <code>true</code>, <code>arc</code> will be set to the arc matching
|
||||
* the last character of <code>term</code>. <code>false</code> is returned if no such
|
||||
* prefix exists in the automaton.
|
||||
*/
|
||||
private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
|
||||
final int max = term.length();
|
||||
|
||||
for (int i = 0; i < max; i++) {
|
||||
if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
|
||||
// No matching prefixes, return an empty result.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively collect lookup results from the automaton subgraph starting at <code>arc</code>.
|
||||
*
|
||||
* @param num Maximum number of results needed (early termination).
|
||||
* @param weight Weight of all results found during this collection.
|
||||
*/
|
||||
private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
|
||||
output.append((char) arc.label);
|
||||
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
while (true) {
|
||||
if (arc.label == FST.END_LABEL) {
|
||||
res.add(new LookupResult(output.toString(), weight));
|
||||
if (res.size() >= num)
|
||||
return true;
|
||||
} else {
|
||||
int save = output.length();
|
||||
if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
|
||||
return true;
|
||||
}
|
||||
output.setLength(save);
|
||||
}
|
||||
|
||||
if (arc.isLast()) {
|
||||
break;
|
||||
}
|
||||
automaton.readNextArc(arc);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds the final automaton from a list of entries.
|
||||
*/
|
||||
private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
|
||||
if (entries.size() == 0)
|
||||
return null;
|
||||
|
||||
// Sort by utf16 (raw char value)
|
||||
final Comparator<Entry> comp = new Comparator<Entry>() {
|
||||
public int compare(Entry o1, Entry o2) {
|
||||
char [] ch1 = o1.term;
|
||||
char [] ch2 = o2.term;
|
||||
int len1 = ch1.length;
|
||||
int len2 = ch2.length;
|
||||
|
||||
int max = Math.min(len1, len2);
|
||||
for (int i = 0; i < max; i++) {
|
||||
int v = ch1[i] - ch2[i];
|
||||
if (v != 0) return v;
|
||||
}
|
||||
return len1 - len2;
|
||||
}
|
||||
};
|
||||
Collections.sort(entries, comp);
|
||||
|
||||
// Avoid duplicated identical entries, if possible. This is required because
|
||||
// duplicates break automaton construction otherwise.
|
||||
int len = entries.size();
|
||||
int j = 0;
|
||||
for (int i = 1; i < len; i++) {
|
||||
if (comp.compare(entries.get(j), entries.get(i)) != 0) {
|
||||
entries.set(++j, entries.get(i));
|
||||
}
|
||||
}
|
||||
entries = entries.subList(0, j + 1);
|
||||
|
||||
// Build the automaton.
|
||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
final Object empty = outputs.getNoOutput();
|
||||
final Builder<Object> builder =
|
||||
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
|
||||
final IntsRef scratchIntsRef = new IntsRef(10);
|
||||
for (Entry e : entries) {
|
||||
final int termLength = scratchIntsRef.length = e.term.length;
|
||||
|
||||
scratchIntsRef.grow(termLength);
|
||||
final int [] ints = scratchIntsRef.ints;
|
||||
final char [] chars = e.term;
|
||||
for (int i = termLength; --i >= 0;) {
|
||||
ints[i] = chars[i];
|
||||
}
|
||||
builder.add(scratchIntsRef, empty);
|
||||
}
|
||||
return builder.finish();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepends each entry's weight to the entry, encoded as a single character, so that the
|
||||
* root automaton node fans out to all possible priorities, starting with the arc that has
|
||||
* the highest weight.
|
||||
*/
|
||||
private void encodeWeightPrefix(List<Entry> entries) {
|
||||
for (Entry e : entries) {
|
||||
int weight = (int) e.weight;
|
||||
assert (weight >= 0 && weight <= buckets) :
|
||||
"Weight out of range: " + weight + " [" + buckets + "]";
|
||||
|
||||
// There should be a single empty char reserved in front for the weight.
|
||||
e.term[0] = (char) weight;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Split [min, max] range into buckets, reassigning weights. Entries' weights are
|
||||
* remapped to the [0, buckets] range (so there are effectively buckets + 1 buckets).
|
||||
*/
|
||||
private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
|
||||
float min = entries.get(0).weight;
|
||||
float max = min;
|
||||
for (Entry e : entries) {
|
||||
min = Math.min(e.weight, min);
|
||||
max = Math.max(e.weight, max);
|
||||
}
|
||||
|
||||
final float range = max - min;
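// Added commentary (not part of the original patch): if all input weights are identical,
// range is 0 and the division below yields NaN; the int cast then maps NaN to 0, so
// every entry simply falls into bucket 0.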
|
||||
for (Entry e : entries) {
|
||||
e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Deserialization from disk.
|
||||
*/
|
||||
@Override
|
||||
public synchronized boolean load(File storeDir) throws IOException {
|
||||
File data = new File(storeDir, FILENAME);
|
||||
if (!data.exists() || !data.canRead()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
InputStream is = new BufferedInputStream(new FileInputStream(data));
|
||||
try {
|
||||
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
|
||||
cacheRootArcs();
|
||||
} finally {
|
||||
Closeables.closeQuietly(is);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialization to disk.
|
||||
*/
|
||||
@Override
|
||||
public synchronized boolean store(File storeDir) throws IOException {
|
||||
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (this.automaton == null)
|
||||
return false;
|
||||
|
||||
File data = new File(storeDir, FILENAME);
|
||||
OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
|
||||
try {
|
||||
this.automaton.save(new OutputStreamDataOutput(os));
|
||||
} finally {
|
||||
Closeables.closeQuietly(os);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
|
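Aside (not part of this patch): a minimal sketch of how the FSTLookup above could be exercised outside Solr, reusing the TermFreq and TermFreqArrayIterator helpers that this change adds under org.apache.solr.spelling.suggest. The class name FSTLookupSketch and the sample terms are made up for illustration.

import java.util.List;

import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;

public class FSTLookupSketch {
  public static void main(String[] args) throws Exception {
    FSTLookup lookup = new FSTLookup(); // defaults: 10 weight buckets, exactMatchFirst = true
    lookup.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("forest", 2), new TermFreq("form", 5), new TermFreq("fort", 1)}));

    // Top-2 completions of the prefix "for", highest (bucketed) weight first.
    List<LookupResult> hits = lookup.lookup("for", true, 2);
    for (LookupResult hit : hits) {
      System.out.println(hit); // LookupResult#toString prints key/approximateWeight
    }
  }
}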
@ -0,0 +1,31 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import com.google.common.io.ByteStreams;
|
||||
|
||||
/**
|
||||
* A {@link DataInput} wrapping a plain {@link InputStream}.
|
||||
*/
|
||||
public class InputStreamDataInput extends DataInput {
|
||||
|
||||
private final InputStream is;
|
||||
|
||||
public InputStreamDataInput(InputStream is) {
|
||||
this.is = is;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() throws IOException {
|
||||
int v = is.read();
|
||||
if (v == -1) throw new EOFException();
|
||||
return (byte) v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
ByteStreams.readFully(is, b, offset, len);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* A {@link DataOutput} wrapping a plain {@link OutputStream}.
|
||||
*/
|
||||
public class OutputStreamDataOutput extends DataOutput {
|
||||
|
||||
private final OutputStream os;
|
||||
|
||||
public OutputStreamDataOutput(OutputStream os) {
|
||||
this.os = os;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) throws IOException {
|
||||
os.write(b);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeBytes(byte[] b, int offset, int length) throws IOException {
|
||||
os.write(b, offset, length);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
|
@ -31,7 +31,7 @@
|
|||
<requestHandler name="standard" class="solr.StandardRequestHandler" />
|
||||
|
||||
<!-- Suggest component -->
|
||||
<searchComponent class="solr.SpellCheckComponent" name="suggest">
|
||||
<searchComponent class="solr.SpellCheckComponent" name="suggest_jaspell">
|
||||
<lst name="spellchecker">
|
||||
<str name="name">suggest</str>
|
||||
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
|
||||
|
@ -45,6 +45,38 @@
|
|||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<!-- TSTLookup suggest component -->
|
||||
<searchComponent class="solr.SpellCheckComponent" name="suggest_tst">
|
||||
<lst name="spellchecker">
|
||||
<str name="name">suggest_tst</str>
|
||||
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
|
||||
<str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
|
||||
<str name="field">suggest</str>
|
||||
<str name="storeDir">suggest_tst</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
|
||||
<!-- Suggester properties -->
|
||||
<float name="threshold">0.0</float>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<!-- FSTLookup suggest component -->
|
||||
<searchComponent class="solr.SpellCheckComponent" name="suggest_fst">
|
||||
<lst name="spellchecker">
|
||||
<str name="name">suggest_fst</str>
|
||||
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
|
||||
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FSTLookup</str>
|
||||
<str name="field">suggest</str>
|
||||
<str name="storeDir">suggest_fst</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
|
||||
<!-- Suggester properties -->
|
||||
<int name="weightBuckets">5</int>
|
||||
<bool name="exactMatchFirst">true</bool>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<!-- The default (jaspell) -->
|
||||
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest">
|
||||
<lst name="defaults">
|
||||
<str name="spellcheck">true</str>
|
||||
|
@ -52,8 +84,32 @@
|
|||
<str name="spellcheck.collate">true</str>
|
||||
</lst>
|
||||
<arr name="components">
|
||||
<str>suggest</str>
|
||||
<str>suggest_jaspell</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<!-- tst (ternary tree based) -->
|
||||
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_tst">
|
||||
<lst name="defaults">
|
||||
<str name="spellcheck">true</str>
|
||||
<str name="spellcheck.dictionary">suggest_tst</str>
|
||||
<str name="spellcheck.collate">true</str>
|
||||
</lst>
|
||||
<arr name="components">
|
||||
<str>suggest_tst</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<!-- fst (finite state automaton based) -->
|
||||
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_fst">
|
||||
<lst name="defaults">
|
||||
<str name="spellcheck">true</str>
|
||||
<str name="spellcheck.dictionary">suggest_fst</str>
|
||||
<str name="spellcheck.collate">false</str>
|
||||
</lst>
|
||||
<arr name="components">
|
||||
<str>suggest_fst</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
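As an aside (not part of this change), a handler registered this way could be queried roughly as in the sketch below. The host, port and core path are assumptions for illustration; the spellcheck.* parameter names mirror the ones exercised by the suggester tests later in this diff.

http://localhost:8983/solr/suggest_fst?q=ac&spellcheck.count=2&spellcheck.onlyMorePopular=true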
|
||||
</config>
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Latvian stem factory is working.
|
||||
*/
|
||||
public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("tirgiem tirgus");
|
||||
LatvianStemFilterFactory factory = new LatvianStemFilterFactory();
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream, new String[] { "tirg", "tirg" });
|
||||
}
|
||||
}
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.search.Similarity;
|
|||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Ignore;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
|
@ -524,9 +525,10 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
|
|||
*/
|
||||
@Test
|
||||
public void testExternalFieldValueSourceParser() {
|
||||
clearIndex();
|
||||
|
||||
String field = "CoMpleX \" fieldName _extf";
|
||||
String fieldAsFunc = "field(\"CoMpleX \\\" fieldName _extf\")";
|
||||
String field = "CoMpleX fieldName _extf";
|
||||
String fieldAsFunc = "field(\"CoMpleX fieldName _extf\")";
|
||||
|
||||
float[] ids = {100,-4,0,10,25,5,77,23,55,-78,-45,-24,63,78,94,22,34,54321,261,-627};
|
||||
|
||||
|
@ -543,7 +545,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
|
|||
singleTest(fieldAsFunc, "sqrt(\0)");
|
||||
assertTrue(orig == FileFloatSource.onlyForTesting);
|
||||
|
||||
makeExternalFile(fieldAsFunc, "0=1","UTF-8");
|
||||
makeExternalFile(field, "0=1","UTF-8");
|
||||
assertU(adoc("id", "10000")); // will get same reader if no index change
|
||||
assertU(commit());
|
||||
singleTest(fieldAsFunc, "sqrt(\0)");
|
||||
|
@ -552,4 +554,31 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
|
|||
purgeFieldCache(FieldCache.DEFAULT); // avoid FC insanity
|
||||
}
|
||||
|
||||
/**
|
||||
* some platforms don't allow quote characters in filenames, so
|
||||
* in addition to testExternalFieldValueSourceParser above, test a field
|
||||
* name with quotes in it that does NOT use ExternalFileField
|
||||
* @see #testExternalFieldValueSourceParser
|
||||
*/
|
||||
@Test
|
||||
public void testFieldValueSourceParser() {
|
||||
clearIndex();
|
||||
|
||||
String field = "CoMpleX \" fieldName _f";
|
||||
String fieldAsFunc = "field(\"CoMpleX \\\" fieldName _f\")";
|
||||
|
||||
float[] ids = {100,-4,0,10,25,5,77,1};
|
||||
|
||||
createIndex(field, ids);
|
||||
|
||||
// test identity (straight field value)
|
||||
singleTest(fieldAsFunc, "\0",
|
||||
100,100, -4,-4, 0,0, 10,10, 25,25, 5,5, 77,77, 1,1);
|
||||
singleTest(fieldAsFunc, "sqrt(\0)",
|
||||
100,10, 25,5, 0,0, 1,1);
|
||||
singleTest(fieldAsFunc, "log(\0)", 1,0);
|
||||
|
||||
purgeFieldCache(FieldCache.DEFAULT); // avoid FC insanity
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* Average with standard deviation.
|
||||
*/
|
||||
final class Average
|
||||
{
|
||||
/**
|
||||
* Average (in milliseconds).
|
||||
*/
|
||||
public final double avg;
|
||||
|
||||
/**
|
||||
* Standard deviation (in milliseconds).
|
||||
*/
|
||||
public final double stddev;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
Average(double avg, double stddev)
|
||||
{
|
||||
this.avg = avg;
|
||||
this.stddev = stddev;
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
|
||||
avg, stddev);
|
||||
}
|
||||
|
||||
static Average from(List<Double> values)
|
||||
{
|
||||
double sum = 0;
|
||||
double sumSquares = 0;
|
||||
|
||||
for (double l : values)
|
||||
{
|
||||
sum += l;
|
||||
sumSquares += l * l;
|
||||
}
|
||||
|
||||
double avg = sum / (double) values.size();
|
||||
return new Average(
|
||||
(sum / (double) values.size()),
|
||||
Math.sqrt(sumSquares / (double) values.size() - avg * avg));
|
||||
}
|
||||
}
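As a quick sanity check of the formula above (population standard deviation computed as sqrt(E[x^2] - E[x]^2)), with made-up values {2, 4, 6}: sum = 12, sumSquares = 56, avg = 12 / 3 = 4, and stddev = sqrt(56 / 3 - 4 * 4) ≈ 1.63.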
|
|
@ -0,0 +1,230 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
import java.net.URL;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.solr.spelling.suggest.fst.FSTLookup;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.io.Resources;
|
||||
|
||||
/**
|
||||
* Benchmark tests for implementations of the {@link Lookup} interface.
|
||||
*/
|
||||
@Ignore // COMMENT ME TO RUN BENCHMARKS!
|
||||
public class LookupBenchmarkTest {
|
||||
@SuppressWarnings("unchecked")
|
||||
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
|
||||
JaspellLookup.class,
|
||||
TSTLookup.class,
|
||||
FSTLookup.class);
|
||||
|
||||
private final static int rounds = 15;
|
||||
private final static int warmup = 5;
|
||||
|
||||
private final int num = 7;
|
||||
private final boolean onlyMorePopular = true;
|
||||
|
||||
private final static Random random = new Random(0xdeadbeef);
|
||||
|
||||
/**
|
||||
* Input term/weight pairs.
|
||||
*/
|
||||
private static TermFreq [] dictionaryInput;
|
||||
|
||||
/**
|
||||
* Benchmark term/weight pairs (randomized order).
|
||||
*/
|
||||
private static List<TermFreq> benchmarkInput;
|
||||
|
||||
/**
|
||||
* Loads terms and frequencies from Wikipedia (cached).
|
||||
*/
|
||||
@BeforeClass
|
||||
public static void setup() throws Exception {
|
||||
List<TermFreq> input = readTop50KWiki();
|
||||
Collections.shuffle(input, random);
|
||||
LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
|
||||
Collections.shuffle(input, random);
|
||||
LookupBenchmarkTest.benchmarkInput = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect the multilingual input for benchmarks/ tests.
|
||||
*/
|
||||
public static List<TermFreq> readTop50KWiki() throws Exception {
|
||||
List<TermFreq> input = Lists.newArrayList();
|
||||
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
|
||||
assert resource != null : "Resource missing: Top50KWiki.utf8";
|
||||
|
||||
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
|
||||
int tab = line.indexOf('|');
|
||||
Assert.assertTrue("No | separator?: " + line, tab >= 0);
|
||||
float weight = Float.parseFloat(line.substring(tab + 1));
|
||||
String key = line.substring(0, tab);
|
||||
input.add(new TermFreq(key, weight));
|
||||
}
|
||||
return input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test construction time.
|
||||
*/
|
||||
@Test
|
||||
public void testConstructionTime() throws Exception {
|
||||
System.err.println("-- construction time");
|
||||
for (final Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
BenchmarkResult result = measure(new Callable<Integer>() {
|
||||
public Integer call() throws Exception {
|
||||
final Lookup lookup = buildLookup(cls, dictionaryInput);
|
||||
return lookup.hashCode();
|
||||
}
|
||||
});
|
||||
|
||||
System.err.println(
|
||||
String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
|
||||
cls.getSimpleName(),
|
||||
dictionaryInput.length,
|
||||
result.average.toString()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test memory required for the storage.
|
||||
*/
|
||||
@Test
|
||||
public void testStorageNeeds() throws Exception {
|
||||
System.err.println("-- RAM consumption");
|
||||
final RamUsageEstimator rue = new RamUsageEstimator();
|
||||
for (Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
Lookup lookup = buildLookup(cls, dictionaryInput);
|
||||
System.err.println(
|
||||
String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
|
||||
lookup.getClass().getSimpleName(),
|
||||
rue.estimateRamUsage(lookup)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create {@link Lookup} instance and populate it.
|
||||
*/
|
||||
private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
|
||||
Lookup lookup = cls.newInstance();
|
||||
lookup.build(new TermFreqArrayIterator(input));
|
||||
return lookup;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test performance of lookup on full hits.
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnFullHits() throws Exception {
|
||||
final int minPrefixLen = 100;
|
||||
final int maxPrefixLen = 200;
|
||||
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test performance of lookup on longer term prefixes (6-9 letters or shorter).
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnPrefixes6_9() throws Exception {
|
||||
final int minPrefixLen = 6;
|
||||
final int maxPrefixLen = 9;
|
||||
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test performance of lookup on short term prefixes (2-4 letters or shorter).
|
||||
*/
|
||||
@Test
|
||||
public void testPerformanceOnPrefixes2_4() throws Exception {
|
||||
final int minPrefixLen = 2;
|
||||
final int maxPrefixLen = 4;
|
||||
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the actual benchmark.
|
||||
*/
|
||||
public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
|
||||
final int num, final boolean onlyMorePopular) throws Exception {
|
||||
System.err.println(String.format(Locale.ENGLISH,
|
||||
"-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
|
||||
minPrefixLen, maxPrefixLen, num, onlyMorePopular));
|
||||
|
||||
for (Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
final Lookup lookup = buildLookup(cls, dictionaryInput);
|
||||
|
||||
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
|
||||
public String apply(TermFreq tf) {
|
||||
return tf.term.substring(0, Math.min(tf.term.length(),
|
||||
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
|
||||
}
|
||||
}));
|
||||
|
||||
BenchmarkResult result = measure(new Callable<Integer>() {
|
||||
public Integer call() throws Exception {
|
||||
int v = 0;
|
||||
for (String term : input) {
|
||||
v += lookup.lookup(term, onlyMorePopular, num).size();
|
||||
}
|
||||
return v;
|
||||
}
|
||||
});
|
||||
|
||||
System.err.println(
|
||||
String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
|
||||
lookup.getClass().getSimpleName(),
|
||||
input.size(),
|
||||
result.average.toString(),
|
||||
input.size() / result.average.avg));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Do the measurements.
|
||||
*/
|
||||
private BenchmarkResult measure(Callable<Integer> callable) {
|
||||
final double NANOS_PER_MS = 1000000;
|
||||
|
||||
try {
|
||||
List<Double> times = Lists.newArrayList();
|
||||
for (int i = 0; i < warmup + rounds; i++) {
|
||||
final long start = System.nanoTime();
|
||||
guard = callable.call().intValue();
|
||||
times.add((System.nanoTime() - start) / NANOS_PER_MS);
|
||||
}
|
||||
return new BenchmarkResult(times, warmup, rounds);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Guard against the JIT optimizing away the measured calls. */
|
||||
@SuppressWarnings("unused")
|
||||
private static volatile int guard;
|
||||
|
||||
private static class BenchmarkResult {
|
||||
/** Average time per round (ms). */
|
||||
public final Average average;
|
||||
|
||||
public BenchmarkResult(List<Double> times, int warmup, int rounds) {
|
||||
this.average = Average.from(times.subList(warmup, times.size()));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -19,62 +19,74 @@ package org.apache.solr.spelling.suggest;
|
|||
import java.io.File;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.spelling.suggest.fst.FSTLookup;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.junit.Test;
|
||||
|
||||
public class PersistenceTest extends SolrTestCaseJ4 {
|
||||
|
||||
public static final String[] keys = new String[] {
|
||||
"one",
|
||||
"two",
|
||||
"three",
|
||||
"four",
|
||||
"oneness",
|
||||
"onerous",
|
||||
"onesimus",
|
||||
"twofold",
|
||||
"twonk",
|
||||
"thrive",
|
||||
"through",
|
||||
"threat",
|
||||
"foundation",
|
||||
"fourier",
|
||||
"fourty"
|
||||
};
|
||||
public final String[] keys = new String[] {
|
||||
"one",
|
||||
"two",
|
||||
"three",
|
||||
"four",
|
||||
"oneness",
|
||||
"onerous",
|
||||
"onesimus",
|
||||
"twofold",
|
||||
"twonk",
|
||||
"thrive",
|
||||
"through",
|
||||
"threat",
|
||||
"foundation",
|
||||
"fourier",
|
||||
"fourty"};
|
||||
|
||||
@Test
|
||||
public void testTSTPersistence() throws Exception {
|
||||
TSTLookup lookup = new TSTLookup();
|
||||
for (String k : keys) {
|
||||
lookup.add(k, new Float(k.length()));
|
||||
}
|
||||
File storeDir = new File(TEST_HOME());
|
||||
lookup.store(storeDir);
|
||||
lookup = new TSTLookup();
|
||||
lookup.load(storeDir);
|
||||
for (String k : keys) {
|
||||
Float val = (Float)lookup.get(k);
|
||||
assertNotNull(k, val);
|
||||
assertEquals(k, k.length(), val.intValue());
|
||||
}
|
||||
runTest(TSTLookup.class, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaspellPersistence() throws Exception {
|
||||
JaspellLookup lookup = new JaspellLookup();
|
||||
for (String k : keys) {
|
||||
lookup.add(k, new Float(k.length()));
|
||||
}
|
||||
File storeDir = new File(TEST_HOME());
|
||||
lookup.store(storeDir);
|
||||
lookup = new JaspellLookup();
|
||||
lookup.load(storeDir);
|
||||
for (String k : keys) {
|
||||
Float val = (Float)lookup.get(k);
|
||||
assertNotNull(k, val);
|
||||
assertEquals(k, k.length(), val.intValue());
|
||||
}
|
||||
runTest(JaspellLookup.class, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFSTPersistence() throws Exception {
|
||||
runTest(FSTLookup.class, false);
|
||||
}
|
||||
|
||||
private void runTest(Class<? extends Lookup> lookupClass,
|
||||
boolean supportsExactWeights) throws Exception {
|
||||
|
||||
// Add all input keys.
|
||||
Lookup lookup = lookupClass.newInstance();
|
||||
TermFreq[] keys = new TermFreq[this.keys.length];
|
||||
for (int i = 0; i < keys.length; i++)
|
||||
keys[i] = new TermFreq(this.keys[i], (float) i);
|
||||
lookup.build(new TermFreqArrayIterator(keys));
|
||||
|
||||
// Store the suggester.
|
||||
File storeDir = new File(TEST_HOME());
|
||||
lookup.store(storeDir);
|
||||
|
||||
// Re-read it from disk.
|
||||
lookup = lookupClass.newInstance();
|
||||
lookup.load(storeDir);
|
||||
|
||||
// Assert validity.
|
||||
float previous = Float.NEGATIVE_INFINITY;
|
||||
for (TermFreq k : keys) {
|
||||
Float val = (Float) lookup.get(k.term);
|
||||
assertNotNull(k.term, val);
|
||||
|
||||
if (supportsExactWeights) {
|
||||
assertEquals(k.term, Float.valueOf(k.v), val);
|
||||
} else {
|
||||
assertTrue(val + ">=" + previous, val >= previous);
|
||||
previous = val.floatValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
public class SuggesterFSTTest extends SuggesterTest {
|
||||
public SuggesterFSTTest() {
|
||||
super.requestUri = "/suggest_fst";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
public class SuggesterTSTTest extends SuggesterTest {
|
||||
public SuggesterTSTTest() {
|
||||
super.requestUri = "/suggest_tst";
|
||||
}
|
||||
}
|
|
@ -17,28 +17,19 @@
|
|||
|
||||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.SpellingParams;
|
||||
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
|
||||
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.solr.spelling.suggest.tst.TSTLookup;
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
public class SuggesterTest extends SolrTestCaseJ4 {
|
||||
/**
|
||||
* Expected URI at which the given suggester will live.
|
||||
*/
|
||||
protected String requestUri = "/suggest";
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-spellchecker.xml","schema-spellchecker.xml");
|
||||
|
@ -59,10 +50,9 @@ public class SuggesterTest extends SolrTestCaseJ4 {
|
|||
@Test
|
||||
public void testSuggestions() throws Exception {
|
||||
addDocs();
|
||||
|
||||
assertU(commit()); // configured to do a rebuild on commit
|
||||
|
||||
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
|
||||
|
@ -82,12 +72,12 @@ public class SuggesterTest extends SolrTestCaseJ4 {
|
|||
dataDir = data;
|
||||
configString = config;
|
||||
initCore();
|
||||
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
|
||||
);
|
||||
|
||||
|
||||
// restore the property
|
||||
System.setProperty("solr.test.leavedatadir", leaveData);
|
||||
}
|
||||
|
@ -96,132 +86,13 @@ public class SuggesterTest extends SolrTestCaseJ4 {
|
|||
public void testRebuild() throws Exception {
|
||||
addDocs();
|
||||
assertU(commit());
|
||||
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
|
||||
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
|
||||
assertU(adoc("id", "4",
|
||||
"text", "actually"
|
||||
));
|
||||
assertU(commit());
|
||||
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
|
||||
}
|
||||
|
||||
|
||||
private TermFreqIterator getTFIT() {
|
||||
final int count = 100000;
|
||||
TermFreqIterator tfit = new TermFreqIterator() {
|
||||
Random r = new Random(1234567890L);
|
||||
Random r1 = new Random(1234567890L);
|
||||
int pos;
|
||||
|
||||
public float freq() {
|
||||
return r1.nextInt(4);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return pos < count;
|
||||
}
|
||||
|
||||
public String next() {
|
||||
pos++;
|
||||
return Long.toString(r.nextLong());
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
};
|
||||
return tfit;
|
||||
}
|
||||
|
||||
static class Bench {
|
||||
long buildTime;
|
||||
long lookupTime;
|
||||
}
|
||||
|
||||
@Test @Ignore
|
||||
public void testBenchmark() throws Exception {
|
||||
final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList();
|
||||
benchmarkClasses.add(JaspellLookup.class);
|
||||
benchmarkClasses.add(TSTLookup.class);
|
||||
|
||||
// Run a single pass just to see if everything works fine and provide size estimates.
|
||||
final RamUsageEstimator rue = new RamUsageEstimator();
|
||||
for (Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
Lookup lookup = singleBenchmark(cls, null);
|
||||
System.err.println(
|
||||
String.format(Locale.ENGLISH,
|
||||
"%20s, size[B]=%,d",
|
||||
lookup.getClass().getSimpleName(),
|
||||
rue.estimateRamUsage(lookup)));
|
||||
}
|
||||
|
||||
int warmupCount = 10;
|
||||
int measuredCount = 100;
|
||||
for (Class<? extends Lookup> cls : benchmarkClasses) {
|
||||
Bench b = fullBenchmark(cls, warmupCount, measuredCount);
|
||||
System.err.println(String.format(Locale.ENGLISH,
|
||||
"%s: buildTime[ms]=%,d lookupTime[ms]=%,d",
|
||||
cls.getSimpleName(),
|
||||
(b.buildTime / measuredCount),
|
||||
(b.lookupTime / measuredCount / 1000000)));
|
||||
}
|
||||
}
|
||||
|
||||
private Lookup singleBenchmark(Class<? extends Lookup> cls, Bench bench) throws Exception {
|
||||
Lookup lookup = cls.newInstance();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
lookup.build(getTFIT());
|
||||
long buildTime = System.currentTimeMillis() - start;
|
||||
|
||||
TermFreqIterator tfit = getTFIT();
|
||||
long elapsed = 0;
|
||||
while (tfit.hasNext()) {
|
||||
String key = tfit.next();
|
||||
// take only the first part of the key
|
||||
int len = key.length() > 4 ? key.length() / 3 : 2;
|
||||
String prefix = key.substring(0, len);
|
||||
start = System.nanoTime();
|
||||
List<LookupResult> res = lookup.lookup(prefix, true, 10);
|
||||
elapsed += System.nanoTime() - start;
|
||||
assertTrue(res.size() > 0);
|
||||
for (LookupResult lr : res) {
|
||||
assertTrue(lr.key.startsWith(prefix));
|
||||
}
|
||||
}
|
||||
|
||||
if (bench != null) {
|
||||
bench.buildTime += buildTime;
|
||||
bench.lookupTime += elapsed;
|
||||
}
|
||||
|
||||
return lookup;
|
||||
}
|
||||
|
||||
private Bench fullBenchmark(Class<? extends Lookup> cls, int warmupCount, int measuredCount) throws Exception {
|
||||
System.err.println("* Running " + measuredCount + " iterations for " + cls.getSimpleName() + " ...");
|
||||
System.err.println(" - warm-up " + warmupCount + " iterations...");
|
||||
for (int i = 0; i < warmupCount; i++) {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
singleBenchmark(cls, null);
|
||||
}
|
||||
|
||||
Bench b = new Bench();
|
||||
System.err.print(" - main iterations:"); System.err.flush();
|
||||
for (int i = 0; i < measuredCount; i++) {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
singleBenchmark(cls, b);
|
||||
if (i > 0 && (i % 10 == 0)) {
|
||||
System.err.print(" " + i);
|
||||
System.err.flush();
|
||||
}
|
||||
}
|
||||
|
||||
System.err.println();
|
||||
return b;
|
||||
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
public final class TermFreq {
|
||||
public final String term;
|
||||
public final float v;
|
||||
|
||||
public TermFreq(String term, float v) {
|
||||
this.term = term;
|
||||
this.v = v;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.solr.util.TermFreqIterator;
|
||||
|
||||
/**
|
||||
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
|
||||
*/
|
||||
public final class TermFreqArrayIterator implements TermFreqIterator {
|
||||
private final Iterator<TermFreq> i;
|
||||
private TermFreq current;
|
||||
|
||||
public TermFreqArrayIterator(Iterator<TermFreq> i) {
|
||||
this.i = i;
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(TermFreq [] i) {
|
||||
this(Arrays.asList(i));
|
||||
}
|
||||
|
||||
public TermFreqArrayIterator(Iterable<TermFreq> i) {
|
||||
this(i.iterator());
|
||||
}
|
||||
|
||||
public float freq() {
|
||||
return current.v;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return i.hasNext();
|
||||
}
|
||||
|
||||
public String next() {
|
||||
return (current = i.next()).term;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException(); }
|
||||
}
|
|
@ -0,0 +1,155 @@
|
|||
package org.apache.solr.spelling.suggest.fst;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
|
||||
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
|
||||
import org.apache.solr.spelling.suggest.TermFreq;
|
||||
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link FSTLookup}.
|
||||
*/
|
||||
public class FSTLookupTest extends LuceneTestCase {
|
||||
public static TermFreq tf(String t, float v) {
|
||||
return new TermFreq(t, v);
|
||||
}
|
||||
|
||||
private FSTLookup lookup;
|
||||
|
||||
@Before
|
||||
public void prepare() throws Exception {
|
||||
final TermFreq[] keys = new TermFreq[] {
|
||||
tf("one", 0.5f),
|
||||
tf("oneness", 1),
|
||||
tf("onerous", 1),
|
||||
tf("onesimus", 1),
|
||||
tf("two", 1),
|
||||
tf("twofold", 1),
|
||||
tf("twonk", 1),
|
||||
tf("thrive", 1),
|
||||
tf("through", 1),
|
||||
tf("threat", 1),
|
||||
tf("three", 1),
|
||||
tf("foundation", 1),
|
||||
tf("fourier", 1),
|
||||
tf("four", 1),
|
||||
tf("fourty", 1),
|
||||
tf("xo", 1),
|
||||
};
|
||||
|
||||
lookup = new FSTLookup();
|
||||
lookup.build(new TermFreqArrayIterator(keys));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactMatchHighPriority() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactMatchLowPriority() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("one", true, 2),
|
||||
"one/0.0",
|
||||
"oneness/1.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMiss() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("xyz", true, 1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAlphabeticWithWeights() throws Exception {
|
||||
assertEquals(0, lookup.lookup("xyz", false, 1).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullMatchList() throws Exception {
|
||||
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
|
||||
"oneness/1.0",
|
||||
"onerous/1.0",
|
||||
"onesimus/1.0",
|
||||
"one/0.0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultilingualInput() throws Exception {
|
||||
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
|
||||
|
||||
lookup = new FSTLookup();
|
||||
lookup.build(new TermFreqArrayIterator(input));
|
||||
|
||||
for (TermFreq tf : input) {
|
||||
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
|
||||
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyInput() throws Exception {
|
||||
lookup = new FSTLookup();
|
||||
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
|
||||
|
||||
assertMatchEquals(lookup.lookup("", true, 10));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRandom() throws Exception {
|
||||
List<TermFreq> freqs = Lists.newArrayList();
|
||||
Random rnd = random;
|
||||
for (int i = 0; i < 5000; i++) {
|
||||
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
|
||||
}
|
||||
lookup = new FSTLookup();
|
||||
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
|
||||
|
||||
for (TermFreq tf : freqs) {
|
||||
final String term = tf.term;
|
||||
for (int i = 1; i < term.length(); i++) {
|
||||
String prefix = term.substring(0, i);
|
||||
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
|
||||
Assert.assertTrue(lr.key.startsWith(prefix));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void assertMatchEquals(List<LookupResult> res, String... expected) {
|
||||
String [] result = new String [res.size()];
|
||||
for (int i = 0; i < res.size(); i++)
|
||||
result[i] = res.get(i).toString();
|
||||
|
||||
if (!Arrays.equals(expected, result)) {
|
||||
int colLen = Math.max(maxLen(expected), maxLen(result));
|
||||
|
||||
StringBuilder b = new StringBuilder();
|
||||
String format = "%" + colLen + "s " + "%" + colLen + "s\n";
|
||||
b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
|
||||
for (int i = 0; i < Math.max(result.length, expected.length); i++) {
|
||||
b.append(String.format(Locale.ENGLISH, format,
|
||||
i < expected.length ? expected[i] : "--",
|
||||
i < result.length ? result[i] : "--"));
|
||||
}
|
||||
|
||||
System.err.println(b.toString());
|
||||
fail("Expected different output:\n" + b.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private int maxLen(String[] result) {
|
||||
int len = 0;
|
||||
for (String s : result)
|
||||
len = Math.max(len, s.length());
|
||||
return len;
|
||||
}
|
||||
}
|