mirror of https://github.com/apache/lucene.git
Merging with trunk.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr5914@1584603 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
bc43bebedf
|
@ -15,6 +15,7 @@
|
|||
<orderEntry type="library" scope="TEST" name="Solrj library" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="Solr test framework library" level="project" />
|
||||
<orderEntry type="library" scope="TEST" name="ICU library" level="project" />
|
||||
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
|
||||
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
|
||||
<orderEntry type="module" scope="TEST" module-name="solr-core-test-files" />
|
||||
|
@ -29,5 +30,7 @@
|
|||
<orderEntry type="module" scope="TEST" module-name="misc" />
|
||||
<orderEntry type="module" scope="TEST" module-name="join" />
|
||||
<orderEntry type="module" scope="TEST" module-name="expressions" />
|
||||
<orderEntry type="module" scope="TEST" module-name="icu" />
|
||||
<orderEntry type="module" scope="TEST" module-name="analysis-extras" />
|
||||
</component>
|
||||
</module>
|
||||
|
|
|
@ -55,6 +55,10 @@ Documentation
|
|||
* LUCENE-5392: Add/improve analysis package documentation to reflect
|
||||
analysis API changes. (Benson Margulies via Robert Muir - pull request #17)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-5563: Removed sep layout, which has fallen behind on features and doesn't
|
||||
perform as well as other options. (Robert Muir)
|
||||
|
||||
======================= Lucene 4.8.0 =======================
|
||||
|
||||
|
@ -135,6 +139,16 @@ New Features
|
|||
resort the hits from a first pass search using a Sort or an
|
||||
Expression. (Simon Willnauer, Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
|
||||
the specified length. (Ahmet Arslan via Robert Muir)
|
||||
|
||||
* LUCENE-2446: Added checksums to lucene index files. As of 4.8, the last 8
|
||||
bytes of each file contain a zlib-crc32 checksum. Small metadata files are
|
||||
verified on load. Larger files can be checked on demand via
|
||||
AtomicReader.checkIntegrity. You can configure this to happen automatically
|
||||
before merges by enabling IndexWriterConfig.setCheckIntegrityAtMerge.
|
||||
(Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
|
||||
|
@ -210,8 +224,18 @@ Bug fixes
|
|||
|
||||
* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
|
||||
|
||||
* LUCENE-5555: Fix SortedInputIterator to correctly encode/decode contexts in presence of payload (Areek Zillur)
|
||||
|
||||
* LUCENE-5559: Add missing argument checks to tokenfilters taking
|
||||
numeric arguments. (Ahmet Arslan via Robert Muir)
|
||||
|
||||
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
|
||||
|
||||
Test Framework
|
||||
|
||||
* LUCENE-5567: When a suite fails with zombie threads, the failure marker and
|
||||
count are not propagated properly. (Dawid Weiss)
|
||||
|
||||
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
|
||||
|
||||
* LUCENE-5501: Added random out-of-order collection testing (when the collector
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.br;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -64,7 +65,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ckb;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -61,7 +62,7 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -32,6 +32,7 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Czech language.
|
||||
|
@ -60,7 +61,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.da;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +64,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.de;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -68,7 +69,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.es;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -62,7 +63,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fi;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +64,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
|
@ -79,7 +80,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.gl;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -61,7 +62,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hu;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +64,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -50,6 +50,7 @@ import java.io.OutputStream;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -672,7 +673,7 @@ public class Dictionary {
|
|||
int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
|
||||
if (flagSep == -1) {
|
||||
CharSequence cleansed = cleanInput(line, sb);
|
||||
writer.write(cleansed.toString().getBytes(IOUtils.CHARSET_UTF_8));
|
||||
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
|
||||
} else {
|
||||
String text = line.substring(0, flagSep);
|
||||
CharSequence cleansed = cleanInput(text, sb);
|
||||
|
@ -681,10 +682,10 @@ public class Dictionary {
|
|||
sb.append(cleansed);
|
||||
}
|
||||
sb.append(line.substring(flagSep));
|
||||
writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
|
||||
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
} else {
|
||||
writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
|
||||
writer.write(line.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,8 +21,7 @@ import java.nio.ByteBuffer;
|
|||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
// many hunspell dictionaries use this encoding, yet java does not have it?!?!
|
||||
final class ISO8859_14Decoder extends CharsetDecoder {
|
||||
|
@ -43,7 +42,7 @@ final class ISO8859_14Decoder extends CharsetDecoder {
|
|||
};
|
||||
|
||||
ISO8859_14Decoder() {
|
||||
super(IOUtils.CHARSET_UTF_8, 1f, 1f);
|
||||
super(StandardCharsets.ISO_8859_1 /* fake with similar properties */, 1f, 1f);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,13 +19,13 @@ package org.apache.lucene.analysis.it;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
|
@ -72,7 +72,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.lv;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -61,7 +62,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -46,6 +46,12 @@ public final class LengthFilter extends FilteringTokenFilter {
|
|||
*/
|
||||
public LengthFilter(Version version, TokenStream in, int min, int max) {
  super(version, in);
  // Validate the bounds before storing them so a misconfigured filter fails at
  // construction time instead of silently rejecting every token later.
  if (min < 0) {
    throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
  }
  if (min > max) {
    // min > max describes an empty range; the message must name the real constraint (max >= min).
    throw new IllegalArgumentException("maximum length must not be less than minimum length");
  }
  this.min = min;
  this.max = max;
}
|
||||
|
|
|
@ -61,6 +61,9 @@ public final class LimitTokenCountFilter extends TokenFilter {
|
|||
*/
|
||||
public LimitTokenCountFilter(TokenStream in, int maxTokenCount, boolean consumeAllTokens) {
  super(in);
  // A non-positive limit is rejected outright.
  if (maxTokenCount <= 0) {
    throw new IllegalArgumentException("maxTokenCount must be greater than zero");
  }
  this.consumeAllTokens = consumeAllTokens;
  this.maxTokenCount = maxTokenCount;
}
|
||||
|
|
|
@ -67,6 +67,9 @@ public final class LimitTokenPositionFilter extends TokenFilter {
|
|||
*/
|
||||
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
  super(in);
  // A non-positive position limit is rejected outright.
  if (maxTokenPosition <= 0) {
    throw new IllegalArgumentException("maxTokenPosition must be greater than zero");
  }
  this.consumeAllTokens = consumeAllTokens;
  this.maxTokenPosition = maxTokenPosition;
}
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A token filter for truncating the terms into a specific length.
|
||||
* Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
|
||||
* It is reported that F5, using first 5 characters, produced best results in
|
||||
* <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
|
||||
* Information Retrieval on Turkish Texts</a>
|
||||
*/
|
||||
public final class TruncateTokenFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
private final int length;
|
||||
|
||||
public TruncateTokenFilter(TokenStream input, int length) {
|
||||
super(input);
|
||||
if (length < 1)
|
||||
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
|
||||
termAttribute.setLength(length);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.ApostropheFilterFactory"/>
|
||||
* <filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||
* <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
|
||||
* <filter class="solr.KeywordRepeatFilterFactory"/>
|
||||
* <filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/>
|
||||
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class TruncateTokenFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public static final String PREFIX_LENGTH_KEY = "prefixLength";
|
||||
private final byte prefixLength;
|
||||
|
||||
public TruncateTokenFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
|
||||
if (prefixLength < 1)
|
||||
throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameter(s): " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new TruncateTokenFilter(input, prefixLength);
|
||||
}
|
||||
}
|
|
@ -31,16 +31,14 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Dutch language.
|
||||
|
@ -75,7 +73,7 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.no;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +64,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.payloads;
|
|||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
|
@ -28,7 +29,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
*
|
||||
**/
|
||||
public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
|
||||
protected Charset charset = Charset.forName("UTF-8");
|
||||
protected Charset charset = StandardCharsets.UTF_8;
|
||||
|
||||
public IdentityEncoder() {
|
||||
}
|
||||
|
|
|
@ -45,8 +45,8 @@ public class TypeAsPayloadTokenFilter extends TokenFilter {
|
|||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
String type = typeAtt.type();
|
||||
if (type != null && type.equals("") == false) {
|
||||
payloadAtt.setPayload(new BytesRef(type.getBytes("UTF-8")));
|
||||
if (type != null && !type.isEmpty()) {
|
||||
payloadAtt.setPayload(new BytesRef(type));
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.pt;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -62,7 +63,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
@ -247,7 +248,7 @@ public abstract class RSLPStemmerBase {
|
|||
// TODO: this parser is ugly, but works. use a jflex grammar instead.
|
||||
try {
|
||||
InputStream is = clazz.getResourceAsStream(resource);
|
||||
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
|
||||
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
|
||||
Map<String,Step> steps = new HashMap<>();
|
||||
String step;
|
||||
while ((step = readLine(r)) != null) {
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ru;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
|
@ -53,7 +54,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -31,6 +31,12 @@ public class TokenRangeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
|
|||
private int count;
|
||||
|
||||
public TokenRangeSinkFilter(int lower, int upper) {
  // The lower bound is checked on its own first, then against the upper bound,
  // so the reported problem is always the first one encountered.
  if (lower <= 0) {
    throw new IllegalArgumentException("lower must be greater than zero");
  }
  if (upper < lower) {
    throw new IllegalArgumentException("lower must not be greater than upper");
  }
  this.upper = upper;
  this.lower = lower;
}
|
||||
|
|
|
@ -84,6 +84,9 @@ public final class ClassicTokenizer extends Tokenizer {
|
|||
/** Set the max allowed token length. Any token longer
|
||||
* than this is skipped. */
|
||||
public void setMaxTokenLength(int length) {
|
||||
if (length < 1) {
|
||||
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
|
||||
}
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
|
|
|
@ -98,6 +98,9 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
/** Set the max allowed token length. Any token longer
|
||||
* than this is skipped. */
|
||||
public void setMaxTokenLength(int length) {
|
||||
if (length < 1) {
|
||||
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
|
||||
}
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
|
|
|
@ -84,6 +84,9 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
|
|||
/** Set the max allowed token length. Any token longer
|
||||
* than this is skipped. */
|
||||
public void setMaxTokenLength(int length) {
|
||||
if (length < 1) {
|
||||
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
|
||||
}
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.sv;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +64,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.io.Reader;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
@ -157,8 +158,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
|
|||
/**
|
||||
* Load synonyms with the given {@link SynonymMap.Parser} class.
|
||||
*/
|
||||
private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
|
||||
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
|
||||
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
|
||||
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.io.InputStreamReader;
|
|||
import java.io.Reader;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
@ -252,7 +253,7 @@ public abstract class AbstractAnalysisFactory {
|
|||
* Returns the resource's lines (with content treated as UTF-8)
|
||||
*/
|
||||
protected final List<String> getLines(ResourceLoader loader, String resource) throws IOException {
|
||||
return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
|
||||
return WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
|
||||
|
@ -272,7 +273,7 @@ public abstract class AbstractAnalysisFactory {
|
|||
Reader reader = null;
|
||||
try {
|
||||
stream = loader.openResource(file.trim());
|
||||
CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
|
||||
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
reader = new InputStreamReader(stream, decoder);
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -97,7 +98,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
|||
final String comment) throws IOException {
|
||||
Reader reader = null;
|
||||
try {
|
||||
reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
|
||||
reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
|
||||
return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase));
|
||||
} finally {
|
||||
IOUtils.close(reader);
|
||||
|
@ -122,7 +123,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
|||
Version matchVersion) throws IOException {
|
||||
Reader reader = null;
|
||||
try {
|
||||
reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
|
||||
reader = IOUtils.getDecodingReader(stopwords, StandardCharsets.UTF_8);
|
||||
return WordlistLoader.getWordSet(reader, matchVersion);
|
||||
} finally {
|
||||
IOUtils.close(reader);
|
||||
|
|
|
@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
@ -78,7 +79,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
//Some sanity checks, but not a full-fledged check
|
||||
public void testHTML() throws Exception {
|
||||
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = -1;
|
||||
while ((ch = reader.read()) != -1){
|
||||
|
@ -95,7 +96,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testMSWord14GeneratedHTML() throws Exception {
|
||||
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
String gold = "This is a test";
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
|
|
|
@ -15,6 +15,7 @@ import java.io.IOException;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -269,7 +270,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
String luceneResourcesWikiPage;
|
||||
try {
|
||||
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||
("LuceneResourcesWikiPage.html"), "UTF-8");
|
||||
("LuceneResourcesWikiPage.html"), StandardCharsets.UTF_8);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
|
@ -289,7 +290,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
try {
|
||||
List<String> urlList = new ArrayList<>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
|
||||
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), StandardCharsets.UTF_8));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
|
@ -313,7 +314,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
String randomTextWithEmails;
|
||||
try {
|
||||
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||
("random.text.with.email.addresses.txt"), "UTF-8");
|
||||
("random.text.with.email.addresses.txt"), StandardCharsets.UTF_8);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
|
@ -334,7 +335,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
List<String> emailList = new ArrayList<>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream
|
||||
("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
|
||||
("email.addresses.from.random.text.with.email.addresses.txt"), StandardCharsets.UTF_8));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
|
@ -383,7 +384,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
String randomTextWithURLs;
|
||||
try {
|
||||
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||
("random.text.with.urls.txt"), "UTF-8");
|
||||
("random.text.with.urls.txt"), StandardCharsets.UTF_8);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
|
@ -404,7 +405,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
List<String> urlList = new ArrayList<>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream
|
||||
("urls.from.random.text.with.urls.txt"), "UTF-8"));
|
||||
("urls.from.random.text.with.urls.txt"), StandardCharsets.UTF_8));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hunspell;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
|
@ -157,7 +158,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
File f = new File(DICTIONARY_HOME, tests[i]);
|
||||
assert f.exists();
|
||||
|
||||
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
|
||||
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
|
||||
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
|
||||
assert dicEntry != null;
|
||||
ZipEntry affEntry = zip.getEntry(tests[i+2]);
|
||||
|
@ -186,7 +187,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
File f = new File(DICTIONARY_HOME, tests[i]);
|
||||
assert f.exists();
|
||||
|
||||
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
|
||||
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
|
||||
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
|
||||
assert dicEntry != null;
|
||||
ZipEntry affEntry = zip.getEntry(tests[i+2]);
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hunspell;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
|
@ -173,7 +174,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
|||
File f = new File(DICTIONARY_HOME, tests[i]);
|
||||
assert f.exists();
|
||||
|
||||
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
|
||||
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
|
||||
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
|
||||
assert dicEntry != null;
|
||||
ZipEntry affEntry = zip.getEntry(tests[i+2]);
|
||||
|
@ -202,7 +203,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
|||
File f = new File(DICTIONARY_HOME, tests[i]);
|
||||
assert f.exists();
|
||||
|
||||
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
|
||||
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
|
||||
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
|
||||
assert dicEntry != null;
|
||||
ZipEntry affEntry = zip.getEntry(tests[i+2]);
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
|
|||
import java.io.FilterInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -232,10 +233,10 @@ public class TestDictionary extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\tUTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\t UTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(IOUtils.CHARSET_UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
||||
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
|
||||
}
|
||||
|
||||
public void testFlagWithCrazyWhitespace() throws Exception {
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestLengthFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -50,4 +51,11 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
|
|||
checkOneTerm(a, "", "");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies constructor argument validation: building a LengthFilter with negative bounds (min = -4, max = -1) is expected to raise an IllegalArgumentException.
|
||||
*/
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testIllegalArguments() throws Exception {
|
||||
new LengthFilter(TEST_VERSION_CURRENT, whitespaceMockTokenizer("accept only valid arguments"), -4, -1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
|
@ -31,8 +32,8 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
|
|||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer)stream).setReader(reader);
|
||||
stream = tokenFilterFactory("Length",
|
||||
"min", "4",
|
||||
"max", "10").create(stream);
|
||||
LengthFilterFactory.MIN_KEY, "4",
|
||||
LengthFilterFactory.MAX_KEY, "10").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
|
||||
}
|
||||
|
||||
|
@ -40,12 +41,27 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
|
|||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Length",
|
||||
"min", "4",
|
||||
"max", "5",
|
||||
LengthFilterFactory.MIN_KEY, "4",
|
||||
LengthFilterFactory.MAX_KEY, "5",
|
||||
"bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
||||
/** Test that a min length (5) larger than the max length (4) makes filter creation fail with an IllegalArgumentException carrying the expected message. */
|
||||
public void testInvalidArguments() throws Exception {
|
||||
try {
|
||||
Reader reader = new StringReader("foo foobar super-duper-trooper");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer)stream).setReader(reader);
|
||||
tokenFilterFactory("Length",
|
||||
LengthFilterFactory.MIN_KEY, "5",
|
||||
LengthFilterFactory.MAX_KEY, "4").create(stream);
|
||||
// reaching this line means no exception was thrown for min > max
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
// NOTE(review): the message wording reads inverted for this input (here min > max, not max > min);
// it presumably mirrors the production message verbatim -- confirm there before rewording.
assertTrue(expected.getMessage().contains("maximum length must not be greater than minimum length"));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.junit.Test;
|
||||
|
||||
/** Tests LimitTokenCountFilter: token-count limiting with and without consuming the rest of the stream, and rejection of a negative limit. */
public class TestLimitTokenCountFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
/** Limits a six-token stream to its first three tokens, once for each value of consumeAll. */
public void test() throws Exception {
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
|
||||
// the tokenizer's checks are only enabled when the filter is asked to consume all tokens
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = new LimitTokenCountFilter(tokenizer, 3, consumeAll);
|
||||
assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
|
||||
}
|
||||
}
|
||||
|
||||
/** A max token count of -1 is expected to be rejected with an IllegalArgumentException. */
@Test(expected = IllegalArgumentException.class)
|
||||
public void testIllegalArguments() throws Exception {
|
||||
new LimitTokenCountFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
|
||||
}
|
||||
}
|
|
@ -1,11 +1,12 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
|
@ -16,25 +17,28 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestLimitTokenCountFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void test() throws Exception {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setReader(reader);
|
||||
// LimitTokenCountFilter doesn't consume the entire stream that it wraps
|
||||
tokenizer.setEnableChecks(false);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("LimitTokenCount",
|
||||
"maxTokenCount", "3").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setReader(reader);
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("LimitTokenCount",
|
||||
LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
|
||||
LimitTokenCountFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
|
||||
}
|
||||
}
|
||||
|
||||
public void testRequired() throws Exception {
|
||||
|
@ -44,15 +48,17 @@ public class TestLimitTokenCountFilterFactory extends BaseTokenStreamFactoryTest
|
|||
fail();
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertTrue("exception doesn't mention param: " + e.getMessage(),
|
||||
0 < e.getMessage().indexOf(LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY));
|
||||
0 < e.getMessage().indexOf(LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY));
|
||||
}
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
/**
|
||||
* Test that bogus arguments result in exception
|
||||
*/
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("LimitTokenCount",
|
||||
"maxTokenCount", "3",
|
||||
LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
|
||||
"bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
|
|
|
@ -16,10 +16,6 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
|
@ -27,11 +23,15 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testMaxPosition2() throws IOException {
|
||||
for (final boolean consumeAll : new boolean[] { true, false }) {
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
|
@ -42,43 +42,50 @@ public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
};
|
||||
|
||||
// dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
|
||||
// don't use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case it's correct)!
|
||||
assertTokenStreamContents(a.tokenStream("dummy", "1 2 3 4 5"),
|
||||
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
|
||||
new String[]{"1", "2"}, new int[]{0, 3}, new int[]{1, 4}, consumeAll ? 16 : null);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
|
||||
new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
|
||||
new String[]{"1", "2"}, new int[]{0, 2}, new int[]{1, 3}, consumeAll ? 9 : null);
|
||||
|
||||
// less than the limit, ensure we behave correctly
|
||||
assertTokenStreamContents(a.tokenStream("dummy", "1 "),
|
||||
new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
|
||||
new String[]{"1"}, new int[]{0}, new int[]{1}, consumeAll ? 3 : null);
|
||||
|
||||
// equal to limit
|
||||
assertTokenStreamContents(a.tokenStream("dummy", "1 2 "),
|
||||
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
|
||||
new String[]{"1", "2"}, new int[]{0, 3}, new int[]{1, 4}, consumeAll ? 6 : null);
|
||||
}
|
||||
}
|
||||
|
||||
public void testMaxPosition3WithSynomyms() throws IOException {
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
|
||||
tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
|
||||
// if we are consuming all tokens, we can use the checks, otherwise we can't
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
|
||||
SynonymMap.Builder builder = new SynonymMap.Builder(true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("first"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
|
||||
CharsRef multiWordCharsRef = new CharsRef();
|
||||
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
|
||||
builder.add(new CharsRef("one"), multiWordCharsRef, true);
|
||||
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
|
||||
builder.add(new CharsRef("two"), multiWordCharsRef, true);
|
||||
SynonymMap synonymMap = builder.build();
|
||||
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
|
||||
stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
|
||||
SynonymMap.Builder builder = new SynonymMap.Builder(true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("first"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
|
||||
CharsRef multiWordCharsRef = new CharsRef();
|
||||
SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
|
||||
builder.add(new CharsRef("one"), multiWordCharsRef, true);
|
||||
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
|
||||
builder.add(new CharsRef("two"), multiWordCharsRef, true);
|
||||
SynonymMap synonymMap = builder.build();
|
||||
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
|
||||
stream = new LimitTokenPositionFilter(stream, 3, consumeAll);
|
||||
|
||||
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
|
||||
new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
|
||||
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
|
||||
assertTokenStreamContents(stream,
|
||||
new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
|
||||
new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testIllegalArguments() throws Exception {
|
||||
new LimitTokenPositionFilter(whitespaceMockTokenizer("one two three four five"), 0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,24 +16,28 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void testMaxPosition1() throws Exception {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
|
||||
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
tokenizer.setEnableChecks(false);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("LimitTokenPosition",
|
||||
"maxTokenPosition", "1").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "A1" });
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
|
||||
// if we are consuming all tokens, we can use the checks, otherwise we can't
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("LimitTokenPosition",
|
||||
LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
|
||||
LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"A1"});
|
||||
}
|
||||
}
|
||||
|
||||
public void testMissingParam() throws Exception {
|
||||
|
@ -47,30 +51,27 @@ public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamFactoryT
|
|||
}
|
||||
|
||||
public void testMaxPosition1WithShingles() throws Exception {
|
||||
Reader reader = new StringReader("one two three four five");
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
|
||||
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
tokenizer.setEnableChecks(false);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("Shingle",
|
||||
"minShingleSize", "2",
|
||||
"maxShingleSize", "3",
|
||||
"outputUnigrams", "true").create(stream);
|
||||
stream = tokenFilterFactory("LimitTokenPosition",
|
||||
"maxTokenPosition", "1").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
Reader reader = new StringReader("one two three four five");
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
|
||||
// if we are consuming all tokens, we can use the checks, otherwise we can't
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("Shingle",
|
||||
"minShingleSize", "2",
|
||||
"maxShingleSize", "3",
|
||||
"outputUnigrams", "true").create(stream);
|
||||
stream = tokenFilterFactory("LimitTokenPosition",
|
||||
LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
|
||||
LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"one", "one two", "one two three"});
|
||||
}
|
||||
}
|
||||
|
||||
public void testConsumeAllTokens() throws Exception {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("LimitTokenPosition",
|
||||
"maxTokenPosition", "3",
|
||||
"consumeAllTokens", "true").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
/**
|
||||
* Test that bogus arguments result in exception
|
||||
*/
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("LimitTokenPosition",
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Test the truncate token filter.
|
||||
*/
|
||||
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testTruncating() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
|
||||
stream = new TruncateTokenFilter(stream, 5);
|
||||
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testNonPositiveLength() throws Exception {
|
||||
new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"), -48);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the simple truncation filter factory is working.
|
||||
*/
|
||||
public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
/**
|
||||
* Ensure the filter actually truncates text.
|
||||
*/
|
||||
public void testTruncating() throws Exception {
|
||||
Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer) stream).setReader(reader);
|
||||
stream = tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that bogus arguments result in exception
|
||||
*/
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
|
||||
"bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that negative prefix length result in exception
|
||||
*/
|
||||
public void testNonPositivePrefixLengthArgument() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Truncate",
|
||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
|
||||
);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -37,15 +38,15 @@ public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
|
||||
filter.reset();
|
||||
assertTermEquals("The", filter, termAtt, payAtt, null);
|
||||
assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("fox", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
|
||||
assertTermEquals("jumped", filter, termAtt, payAtt, "VB".getBytes("UTF-8"));
|
||||
assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("fox", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("jumped", filter, termAtt, payAtt, "VB".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("over", filter, termAtt, payAtt, null);
|
||||
assertTermEquals("the", filter, termAtt, payAtt, null);
|
||||
assertTermEquals("lazy", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
|
||||
assertTermEquals("lazy", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
|
||||
assertFalse(filter.incrementToken());
|
||||
filter.end();
|
||||
filter.close();
|
||||
|
@ -59,15 +60,15 @@ public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
|
||||
filter.reset();
|
||||
assertTermEquals("The", filter, null);
|
||||
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("red", filter, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("fox", filter, "NN".getBytes("UTF-8"));
|
||||
assertTermEquals("jumped", filter, "VB".getBytes("UTF-8"));
|
||||
assertTermEquals("quick", filter, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("red", filter, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("fox", filter, "NN".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("jumped", filter, "VB".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("over", filter, null);
|
||||
assertTermEquals("the", filter, null);
|
||||
assertTermEquals("lazy", filter, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("brown", filter, "JJ".getBytes("UTF-8"));
|
||||
assertTermEquals("dogs", filter, "NN".getBytes("UTF-8"));
|
||||
assertTermEquals("lazy", filter, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("brown", filter, "JJ".getBytes(StandardCharsets.UTF_8));
|
||||
assertTermEquals("dogs", filter, "NN".getBytes(StandardCharsets.UTF_8));
|
||||
assertFalse(filter.incrementToken());
|
||||
filter.end();
|
||||
filter.close();
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -41,8 +42,8 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
while (nptf.incrementToken()) {
|
||||
assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
|
||||
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
|
||||
String type = new String(payloadAtt.getPayload().bytes, "UTF-8");
|
||||
assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()) == true);
|
||||
String type = payloadAtt.getPayload().utf8ToString();
|
||||
assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()));
|
||||
count++;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
package org.apache.lucene.analysis.sinks;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
|
@ -21,6 +22,7 @@ import java.io.StringReader;
|
|||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -45,4 +47,9 @@ public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
|||
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||
assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testIllegalArguments() throws Exception {
|
||||
new TokenRangeSinkFilter(4, 2);
|
||||
}
|
||||
}
|
|
@ -172,4 +172,13 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
|
|||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testIllegalArguments() throws Exception {
|
||||
try {
|
||||
tokenizerFactory("UAX29URLEmail", "maxTokenLength", "-1").create();
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("maxTokenLength must be greater than zero"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
|
|||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/** Fake resource loader for tests: works if you want to fake reading a single file */
|
||||
public class StringMockResourceLoader implements ResourceLoader {
|
||||
|
@ -50,6 +51,6 @@ public class StringMockResourceLoader implements ResourceLoader {
|
|||
|
||||
@Override
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return new ByteArrayInputStream(text.getBytes("UTF-8"));
|
||||
return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.IOException;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -49,7 +50,7 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
|
|||
private void assertClasspathDelegation(ResourceLoader rl) throws Exception {
|
||||
// try a stopwords file from classpath
|
||||
CharArraySet set = WordlistLoader.getSnowballWordSet(
|
||||
new InputStreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), IOUtils.CHARSET_UTF_8),
|
||||
new InputStreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), StandardCharsets.UTF_8),
|
||||
TEST_VERSION_CURRENT
|
||||
);
|
||||
assertTrue(set.contains("you"));
|
||||
|
@ -64,7 +65,7 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
|
|||
final File base = TestUtil.createTempDir("fsResourceLoaderBase").getAbsoluteFile();
|
||||
try {
|
||||
base.mkdirs();
|
||||
Writer os = new OutputStreamWriter(new FileOutputStream(new File(base, "template.txt")), IOUtils.CHARSET_UTF_8);
|
||||
Writer os = new OutputStreamWriter(new FileOutputStream(new File(base, "template.txt")), StandardCharsets.UTF_8);
|
||||
try {
|
||||
os.write("foobar\n");
|
||||
} finally {
|
||||
|
@ -72,28 +73,28 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
|
|||
}
|
||||
|
||||
ResourceLoader rl = new FilesystemResourceLoader(base);
|
||||
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), IOUtils.CHARSET_UTF_8).get(0));
|
||||
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0));
|
||||
// Same with full path name:
|
||||
String fullPath = new File(base, "template.txt").toString();
|
||||
assertEquals("foobar",
|
||||
WordlistLoader.getLines(rl.openResource(fullPath), IOUtils.CHARSET_UTF_8).get(0));
|
||||
WordlistLoader.getLines(rl.openResource(fullPath), StandardCharsets.UTF_8).get(0));
|
||||
assertClasspathDelegation(rl);
|
||||
assertNotFound(rl);
|
||||
|
||||
// now use RL without base dir:
|
||||
rl = new FilesystemResourceLoader();
|
||||
assertEquals("foobar",
|
||||
WordlistLoader.getLines(rl.openResource(new File(base, "template.txt").toString()), IOUtils.CHARSET_UTF_8).get(0));
|
||||
WordlistLoader.getLines(rl.openResource(new File(base, "template.txt").toString()), StandardCharsets.UTF_8).get(0));
|
||||
assertClasspathDelegation(rl);
|
||||
assertNotFound(rl);
|
||||
} finally {
|
||||
TestUtil.rmDir(base);
|
||||
TestUtil.rm(base);
|
||||
}
|
||||
}
|
||||
|
||||
public void testDelegation() throws Exception {
|
||||
ResourceLoader rl = new FilesystemResourceLoader(null, new StringMockResourceLoader("foobar\n"));
|
||||
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), IOUtils.CHARSET_UTF_8).get(0));
|
||||
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.io.OutputStreamWriter;
|
|||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
|
@ -118,7 +119,7 @@ public class GenerateJflexTLDMacros {
|
|||
connection.connect();
|
||||
tldFileLastModified = connection.getLastModified();
|
||||
BufferedReader reader = new BufferedReader
|
||||
(new InputStreamReader(connection.getInputStream(), "US-ASCII"));
|
||||
(new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII));
|
||||
try {
|
||||
String line;
|
||||
while (null != (line = reader.readLine())) {
|
||||
|
@ -150,7 +151,7 @@ public class GenerateJflexTLDMacros {
|
|||
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
final Writer writer = new OutputStreamWriter
|
||||
(new FileOutputStream(outputFile), "UTF-8");
|
||||
(new FileOutputStream(outputFile), StandardCharsets.UTF_8);
|
||||
try {
|
||||
writer.write(APACHE_LICENSE);
|
||||
writer.write("// Generated from IANA Root Zone Database <");
|
||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.lucene.analysis.icu.segmentation;
|
|||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -132,7 +132,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
StringBuilder rules = new StringBuilder();
|
||||
InputStream rulesStream = loader.openResource(filename);
|
||||
BufferedReader reader = new BufferedReader
|
||||
(IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8));
|
||||
(IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8));
|
||||
String line = null;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if ( ! line.startsWith("#"))
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.io.OutputStreamWriter;
|
|||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
@ -106,7 +107,7 @@ public class GenerateUTR30DataFiles {
|
|||
|
||||
private static void expandDataFileRules(File file) throws IOException {
|
||||
final FileInputStream stream = new FileInputStream(file);
|
||||
final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
final InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
|
||||
final BufferedReader bufferedReader = new BufferedReader(reader);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
String line;
|
||||
|
@ -154,7 +155,7 @@ public class GenerateUTR30DataFiles {
|
|||
if (modified) {
|
||||
System.err.println("Expanding rules in and overwriting " + file.getName());
|
||||
final FileOutputStream out = new FileOutputStream(file, false);
|
||||
Writer writer = new OutputStreamWriter(out, "UTF-8");
|
||||
Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);
|
||||
try {
|
||||
writer.write(builder.toString());
|
||||
} finally {
|
||||
|
@ -178,8 +179,8 @@ public class GenerateUTR30DataFiles {
|
|||
System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... ");
|
||||
URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
|
||||
BufferedReader reader = new BufferedReader
|
||||
(new InputStreamReader(connection.getInputStream(), "UTF-8"));
|
||||
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), "UTF-8");
|
||||
(new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8));
|
||||
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), StandardCharsets.UTF_8);
|
||||
try {
|
||||
String line;
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.io.FilenameFilter;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
|
@ -37,7 +38,7 @@ public class RBBIRuleCompiler {
|
|||
static String getRules(File ruleFile) throws IOException {
|
||||
StringBuilder rules = new StringBuilder();
|
||||
InputStream in = new FileInputStream(ruleFile);
|
||||
BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
|
||||
BufferedReader cin = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
|
||||
String line = null;
|
||||
while ((line = cin.readLine()) != null) {
|
||||
if (!line.startsWith("#"))
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.ja;
|
|||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
|
||||
|
@ -52,6 +53,6 @@ class StringMockResourceLoader implements ResourceLoader {
|
|||
|
||||
@Override
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return new ByteArrayInputStream(text.getBytes("UTF-8"));
|
||||
return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -34,7 +35,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
|||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
|
@ -49,7 +49,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
try {
|
||||
try {
|
||||
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
|
||||
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
|
||||
return new UserDictionary(reader);
|
||||
} finally {
|
||||
is.close();
|
||||
|
@ -571,7 +571,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
/*
|
||||
public void testWikipedia() throws Exception {
|
||||
final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
|
||||
final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
|
||||
final Reader r = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
|
||||
|
||||
final long startTimeNS = System.nanoTime();
|
||||
boolean done = false;
|
||||
|
@ -618,7 +618,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
private void doTestBocchan(int numIterations) throws Exception {
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
|
||||
this.getClass().getResourceAsStream("bocchan.utf-8"), "UTF-8"));
|
||||
this.getClass().getResourceAsStream("bocchan.utf-8"), StandardCharsets.UTF_8));
|
||||
String line = reader.readLine();
|
||||
reader.close();
|
||||
|
||||
|
|
|
@ -22,13 +22,12 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
public class TestSearchMode extends BaseTokenStreamTestCase {
|
||||
private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
|
||||
|
@ -47,7 +46,7 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
|
|||
throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
|
||||
}
|
||||
try {
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
|
||||
String line = null;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
// Remove comments
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.io.LineNumberReader;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class ConnectionCostsBuilder {
|
||||
|
||||
|
@ -32,7 +33,7 @@ public class ConnectionCostsBuilder {
|
|||
|
||||
public static ConnectionCostsWriter build(String filename) throws IOException {
|
||||
FileInputStream inputStream = new FileInputStream(filename);
|
||||
Charset cs = Charset.forName("US-ASCII");
|
||||
Charset cs = StandardCharsets.US_ASCII;
|
||||
CharsetDecoder decoder = cs.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
|
|
|
@ -21,10 +21,9 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Manages analysis data configuration for SmartChineseAnalyzer
|
||||
* <p>
|
||||
|
@ -80,7 +79,7 @@ public class AnalyzerProfile {
|
|||
Properties prop = new Properties();
|
||||
try {
|
||||
FileInputStream input = new FileInputStream(propFile);
|
||||
prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8));
|
||||
prop.load(new InputStreamReader(input, StandardCharsets.UTF_8));
|
||||
String dir = prop.getProperty("analysis.data.dir", "");
|
||||
input.close();
|
||||
return dir;
|
||||
|
|
|
@ -18,18 +18,16 @@
|
|||
package org.apache.lucene.analysis.cn.smart;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.en.PorterStemFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -90,7 +88,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
|||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils
|
||||
.getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE,
|
||||
IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT,
|
||||
StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT,
|
||||
Version.LUCENE_CURRENT));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.pl;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -76,7 +77,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
|
|
|
@ -65,10 +65,10 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Locale;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
|
@ -139,7 +139,7 @@ public class TestCompile extends LuceneTestCase {
|
|||
private static void assertTrie(Trie trie, String file, boolean usefull,
|
||||
boolean storeorig) throws Exception {
|
||||
LineNumberReader in = new LineNumberReader(new BufferedReader(
|
||||
new InputStreamReader(new FileInputStream(file), IOUtils.CHARSET_UTF_8)));
|
||||
new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8)));
|
||||
|
||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||
try {
|
||||
|
|
|
@ -18,8 +18,8 @@ package org.apache.lucene.benchmark.byTask;
|
|||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
|
@ -107,7 +107,7 @@ public class Benchmark {
|
|||
|
||||
Benchmark benchmark = null;
|
||||
try {
|
||||
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
|
||||
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, StandardCharsets.UTF_8));
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
System.exit(1);
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
|
@ -26,6 +25,7 @@ import java.io.FileFilter;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
@ -206,7 +206,7 @@ public class DirContentSource extends ContentSource {
|
|||
name = f.getCanonicalPath()+"_"+iteration;
|
||||
}
|
||||
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
|
||||
String line = null;
|
||||
//First line is the date, 3rd is the title, rest is body
|
||||
String dateStr = reader.readLine();
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
|
@ -318,7 +319,7 @@ public class DocMaker implements Closeable {
|
|||
|
||||
if (storeBytes) {
|
||||
Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED);
|
||||
bytesField.setBytesValue(bdy.getBytes("UTF-8"));
|
||||
bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8));
|
||||
doc.add(bytesField);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,18 +20,15 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||
import org.apache.lucene.util.ThreadInterruptedException;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.ThreadInterruptedException;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
@ -182,10 +179,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
if (localFileIS != null) { // null means fileIS was closed on us
|
||||
try {
|
||||
// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
|
||||
CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
|
||||
reader.parse(new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8)));
|
||||
} catch (IOException ioe) {
|
||||
synchronized(EnwikiContentSource.this) {
|
||||
if (localFileIS != is) {
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -62,12 +63,12 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
|
|||
Reader reader = null;
|
||||
// note: we use a decoding reader, so if your queries are screwed up you know
|
||||
if (file.exists()) {
|
||||
reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
|
||||
reader = IOUtils.getDecodingReader(file, StandardCharsets.UTF_8);
|
||||
} else {
|
||||
//see if we can find it as a resource
|
||||
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
|
||||
if (asStream != null) {
|
||||
reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
|
||||
reader = IOUtils.getDecodingReader(asStream, StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
if (reader != null) {
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Properties;
|
|||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* A {@link ContentSource} reading one line at a time as a
|
||||
|
@ -277,7 +278,7 @@ public class LineDocSource extends ContentSource {
|
|||
}
|
||||
file = new File(fileName).getAbsoluteFile();
|
||||
if (encoding == null) {
|
||||
encoding = "UTF-8";
|
||||
encoding = IOUtils.UTF_8;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
@ -30,7 +31,6 @@ import java.util.Date;
|
|||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* A {@link ContentSource} reading from the Reuters collection.
|
||||
|
@ -114,7 +114,7 @@ public class ReutersContentSource extends ContentSource {
|
|||
name = f.getCanonicalPath() + "_" + iteration;
|
||||
}
|
||||
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
|
||||
try {
|
||||
// First line is the date, 3rd is the title, rest is body
|
||||
String dateStr = reader.readLine();
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
@ -320,7 +321,7 @@ public class TrecContentSource extends ContentSource {
|
|||
}
|
||||
// encoding
|
||||
if (encoding == null) {
|
||||
encoding = "ISO-8859-1";
|
||||
encoding = StandardCharsets.ISO_8859_1.name();
|
||||
}
|
||||
// iteration exclusion in doc name
|
||||
excludeDocnameIteration = config.get("content.source.excludeIteration", false);
|
||||
|
|
|
@ -20,19 +20,18 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
import org.apache.lucene.index.IndexCommit;
|
||||
import org.apache.lucene.index.IndexDeletionPolicy;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.LogMergePolicy;
|
||||
import org.apache.lucene.index.TieredMergePolicy;
|
||||
import org.apache.lucene.index.MergeScheduler;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
import org.apache.lucene.index.MergePolicy;
|
||||
import org.apache.lucene.index.MergeScheduler;
|
||||
import org.apache.lucene.index.NoDeletionPolicy;
|
||||
import org.apache.lucene.index.NoMergePolicy;
|
||||
import org.apache.lucene.index.NoMergeScheduler;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
|
@ -130,7 +129,7 @@ public class CreateIndexTask extends PerfTask {
|
|||
if (defaultCodec != null) {
|
||||
try {
|
||||
Class<? extends Codec> clazz = Class.forName(defaultCodec).asSubclass(Codec.class);
|
||||
Codec.setDefault(clazz.newInstance());
|
||||
iwConf.setCodec(clazz.newInstance());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Couldn't instantiate Codec: " + defaultCodec, e);
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.io.File;
|
|||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
|
@ -41,7 +42,7 @@ public class WriteEnwikiLineDocTask extends WriteLineDocTask {
|
|||
public WriteEnwikiLineDocTask(PerfRunData runData) throws Exception {
|
||||
super(runData);
|
||||
OutputStream out = StreamUtils.outputStream(categoriesLineFile(new File(fname)));
|
||||
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
|
||||
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8), StreamUtils.BUFFER_SIZE));
|
||||
writeHeader(categoryLineFileOut);
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.regex.Matcher;
|
||||
|
@ -101,7 +102,7 @@ public class WriteLineDocTask extends PerfTask {
|
|||
throw new IllegalArgumentException("line.file.out must be set");
|
||||
}
|
||||
OutputStream out = StreamUtils.outputStream(new File(fname));
|
||||
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
|
||||
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8), StreamUtils.BUFFER_SIZE));
|
||||
docMaker = runData.getDocMaker();
|
||||
|
||||
// init fields
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.io.File;
|
|||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -53,7 +54,7 @@ public class QueryDriver {
|
|||
|
||||
File topicsFile = new File(args[0]);
|
||||
File qrelsFile = new File(args[1]);
|
||||
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene");
|
||||
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], IOUtils.UTF_8 /* huh, no nio.Charset ctor? */), "lucene");
|
||||
FSDirectory dir = FSDirectory.open(new File(args[3]));
|
||||
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
|
||||
IndexReader reader = DirectoryReader.open(dir);
|
||||
|
@ -66,10 +67,10 @@ public class QueryDriver {
|
|||
|
||||
// use trec utilities to read trec topics into quality queries
|
||||
TrecTopicsReader qReader = new TrecTopicsReader();
|
||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8)));
|
||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, StandardCharsets.UTF_8)));
|
||||
|
||||
// prepare judge, with trec utilities that read from a QRels file
|
||||
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8)));
|
||||
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, StandardCharsets.UTF_8)));
|
||||
|
||||
// validate topics & judgments match each other
|
||||
judge.validateData(qqs, logger);
|
||||
|
|
|
@ -21,16 +21,13 @@ import java.io.File;
|
|||
import java.io.FileFilter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
|
||||
/**
|
||||
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
|
||||
|
@ -78,7 +75,7 @@ public class ExtractReuters {
|
|||
*/
|
||||
protected void extractFile(File sgmFile) {
|
||||
try {
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), StandardCharsets.UTF_8));
|
||||
|
||||
StringBuilder buffer = new StringBuilder(1024);
|
||||
StringBuilder outBuffer = new StringBuilder(1024);
|
||||
|
@ -112,7 +109,7 @@ public class ExtractReuters {
|
|||
File outFile = new File(outputDir, sgmFile.getName() + "-"
|
||||
+ (docNumber++) + ".txt");
|
||||
// System.out.println("Writing " + outFile);
|
||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8);
|
||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), StandardCharsets.UTF_8);
|
||||
writer.write(out);
|
||||
writer.close();
|
||||
outBuffer.setLength(0);
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.FileOutputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
|
||||
|
@ -30,7 +31,6 @@ import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
|
|||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Extract the downloaded Wikipedia dump into separate files for indexing.
|
||||
|
@ -86,7 +86,7 @@ public class ExtractWikipedia {
|
|||
contents.append("\n");
|
||||
|
||||
try {
|
||||
Writer writer = new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8);
|
||||
Writer writer = new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8);
|
||||
writer.write(contents.toString());
|
||||
writer.close();
|
||||
} catch (IOException ioe) {
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.BufferedReader;
|
|||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Collator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
@ -406,7 +407,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
|
|||
|
||||
BufferedReader r = new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream(lineFile), "UTF-8"));
|
||||
new FileInputStream(lineFile), StandardCharsets.UTF_8));
|
||||
int numLines = 0;
|
||||
String line;
|
||||
while((line = r.readLine()) != null) {
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.FileInputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.feeds.AbstractQueryMaker;
|
||||
|
@ -121,7 +122,7 @@ public class TestPerfTasksParse extends LuceneTestCase {
|
|||
public boolean accept(File pathname) { return pathname.isFile() && pathname.getName().endsWith(".alg"); }
|
||||
})) {
|
||||
try {
|
||||
Config config = new Config(new InputStreamReader(new FileInputStream(algFile), "UTF-8"));
|
||||
Config config = new Config(new InputStreamReader(new FileInputStream(algFile), StandardCharsets.UTF_8));
|
||||
String contentSource = config.get("content.source", null);
|
||||
if (contentSource != null) { Class.forName(contentSource); }
|
||||
config.set("work.dir", TestUtil.createTempDir(LuceneTestCase.getTestClass().getSimpleName()).getAbsolutePath());
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Tests the functionality of {@link DocMaker}. */
|
||||
public class DocMakerTest extends BenchmarkTestCase {
|
||||
|
@ -166,7 +167,7 @@ public class DocMakerTest extends BenchmarkTestCase {
|
|||
// DocMaker did not close its ContentSource if resetInputs was called twice,
|
||||
// leading to a file handle leak.
|
||||
File f = new File(getWorkDir(), "docMakerLeak.txt");
|
||||
PrintStream ps = new PrintStream(f, "UTF-8");
|
||||
PrintStream ps = new PrintStream(f, IOUtils.UTF_8);
|
||||
ps.println("one title\t" + System.currentTimeMillis() + "\tsome content");
|
||||
ps.close();
|
||||
|
||||
|
|
|
@ -18,15 +18,13 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -43,7 +41,7 @@ public class EnwikiContentSourceTest extends LuceneTestCase {
|
|||
|
||||
@Override
|
||||
protected InputStream openInputStream() throws IOException {
|
||||
return new ByteArrayInputStream(docs.getBytes(IOUtils.CHARSET_UTF_8));
|
||||
return new ByteArrayInputStream(docs.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.FileOutputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
|
@ -53,7 +54,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
/**
 * Writes the test documents to {@code file} through a "bzip2" compressor
 * stream obtained from {@code csFactory}, encoding the text as UTF-8.
 *
 * @param file destination file, opened with a {@link FileOutputStream}
 * @param addHeader forwarded unchanged to {@code writeDocsToFile}
 */
private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
out = csFactory.createCompressorOutputStream("bzip2", out);
|
||||
// NOTE(review): the next two declarations look like the before/after sides of
// the diff (charset name string vs. StandardCharsets constant) - confirm.
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
|
||||
writeDocsToFile(writer, addHeader, null);
|
||||
// Not in a finally block: the writer stays open if writeDocsToFile throws.
writer.close();
|
||||
}
|
||||
|
@ -90,14 +91,14 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
|
||||
/**
 * Writes the test documents to {@code file} as uncompressed UTF-8 text.
 *
 * @param file destination file, opened with a {@link FileOutputStream}
 * @param addHeader forwarded unchanged to {@code writeDocsToFile}
 */
private void createRegularLineFile(File file, boolean addHeader) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
// NOTE(review): the next two declarations look like the before/after sides of
// the diff (charset name string vs. StandardCharsets constant) - confirm.
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
|
||||
writeDocsToFile(writer, addHeader, null);
|
||||
// Not in a finally block: the writer stays open if writeDocsToFile throws.
writer.close();
|
||||
}
|
||||
|
||||
private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
|
||||
Properties p = new Properties();
|
||||
for (String f : extraFields) {
|
||||
p.setProperty(f, f);
|
||||
|
@ -209,7 +210,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
|
||||
for (int i = 0; i < testCases.length; i++) {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8));
|
||||
writer.write(testCases[i]);
|
||||
writer.newLine();
|
||||
writer.close();
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Properties;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
@ -73,7 +74,7 @@ public class WriteEnwikiLineDocTaskTest extends BenchmarkTestCase {
|
|||
|
||||
private void doReadTest(int n, File file, String expTitle, String expDate, String expBody) throws Exception {
|
||||
InputStream in = new FileInputStream(file);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
WriteLineDocTaskTest.assertHeaderLine(line);
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashSet;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
@ -168,7 +169,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
default:
|
||||
assertFalse("Unknown file type!",true); //fail, should not happen
|
||||
}
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
|
@ -274,7 +275,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
|
@ -292,7 +293,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
|
@ -310,7 +311,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
|
@ -345,7 +346,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
wldt.close();
|
||||
|
||||
Set<String> ids = new HashSet<>();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line); // header line is written once, no matter how many threads there are
|
||||
|
|
|
@ -26,10 +26,10 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
@ -87,7 +87,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
|||
|
||||
private File rawTextFile(String ext) throws Exception {
|
||||
File f = new File(testDir,"testfile." + ext);
|
||||
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8));
|
||||
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8));
|
||||
w.write(TEXT);
|
||||
w.newLine();
|
||||
w.close();
|
||||
|
@ -116,7 +116,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
|||
}
|
||||
|
||||
private void writeText(OutputStream os) throws IOException {
|
||||
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, IOUtils.CHARSET_UTF_8));
|
||||
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
|
||||
w.write(TEXT);
|
||||
w.newLine();
|
||||
w.close();
|
||||
|
@ -124,7 +124,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
|||
|
||||
private void assertReadText(File f) throws Exception {
|
||||
InputStream ir = StreamUtils.inputStream(f);
|
||||
InputStreamReader in = new InputStreamReader(ir, IOUtils.CHARSET_UTF_8);
|
||||
InputStreamReader in = new InputStreamReader(ir, StandardCharsets.UTF_8);
|
||||
BufferedReader r = new BufferedReader(in);
|
||||
String line = r.readLine();
|
||||
assertEquals("Wrong text found in "+f.getName(), TEXT, line);
|
||||
|
@ -136,14 +136,14 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
|||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
testDir = new File(getWorkDir(),"ContentSourceTest");
|
||||
TestUtil.rmDir(testDir);
|
||||
TestUtil.rm(testDir);
|
||||
assertTrue(testDir.mkdirs());
|
||||
}
|
||||
|
||||
/**
 * Removes {@code testDir} and then runs the superclass tear-down.
 */
@Override
|
||||
@After
|
||||
public void tearDown() throws Exception {
|
||||
// NOTE(review): rmDir/rm look like the before/after sides of the diff
// (same argument, renamed TestUtil helper) - confirm.
TestUtil.rmDir(testDir);
|
||||
TestUtil.rm(testDir);
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.io.InputStreamReader;
|
|||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
|
||||
* Test that quality run does its job.
|
||||
|
@ -62,11 +63,11 @@ public class TestQualityRun extends BenchmarkTestCase {
|
|||
// prepare topics
|
||||
InputStream topics = getClass().getResourceAsStream("trecTopics.txt");
|
||||
TrecTopicsReader qReader = new TrecTopicsReader();
|
||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new InputStreamReader(topics, "UTF-8")));
|
||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new InputStreamReader(topics, StandardCharsets.UTF_8)));
|
||||
|
||||
// prepare judge
|
||||
InputStream qrels = getClass().getResourceAsStream("trecQRels.txt");
|
||||
Judge judge = new TrecJudge(new BufferedReader(new InputStreamReader(qrels, "UTF-8")));
|
||||
Judge judge = new TrecJudge(new BufferedReader(new InputStreamReader(qrels, StandardCharsets.UTF_8)));
|
||||
|
||||
// validate topics & judgments match each other
|
||||
judge.validateData(qqs, logger);
|
||||
|
@ -147,7 +148,7 @@ public class TestQualityRun extends BenchmarkTestCase {
|
|||
InputStream topicsFile = getClass().getResourceAsStream("trecTopics.txt");
|
||||
TrecTopicsReader qReader = new TrecTopicsReader();
|
||||
QualityQuery qqs[] = qReader.readQueries(
|
||||
new BufferedReader(new InputStreamReader(topicsFile, "UTF-8")));
|
||||
new BufferedReader(new InputStreamReader(topicsFile, StandardCharsets.UTF_8)));
|
||||
|
||||
assertEquals(20, qqs.length);
|
||||
|
||||
|
|
|
@ -177,7 +177,10 @@ public class BlockTermsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
private void seekDir(IndexInput input, long dirOffset) throws IOException {
|
||||
if (version >= BlockTermsWriter.VERSION_APPEND_ONLY) {
|
||||
if (version >= BlockTermsWriter.VERSION_CHECKSUM) {
|
||||
input.seek(input.length() - CodecUtil.footerLength() - 8);
|
||||
dirOffset = input.readLong();
|
||||
} else if (version >= BlockTermsWriter.VERSION_APPEND_ONLY) {
|
||||
input.seek(input.length() - 8);
|
||||
dirOffset = input.readLong();
|
||||
}
|
||||
|
@ -863,4 +866,14 @@ public class BlockTermsReader extends FieldsProducer {
|
|||
sizeInBytes += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
|
||||
return sizeInBytes;
|
||||
}
|
||||
|
||||
/**
 * Checks the integrity of the data behind this reader: first the terms
 * input {@code in}, then the postings via {@code postingsReader}.
 *
 * @throws IOException propagated from the checksum pass or from the
 *         postings reader's own check
 */
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
// verify terms
// Only checksummed when the file version is at least VERSION_CHECKSUM;
// presumably older files carry no footer to verify - confirm.
|
||||
if (version >= BlockTermsWriter.VERSION_CHECKSUM) {
|
||||
CodecUtil.checksumEntireFile(in);
|
||||
}
|
||||
// verify postings
// Delegated: the postings reader checks its own data.
|
||||
postingsReader.checkIntegrity();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,12 +63,13 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable {
|
|||
public static final int VERSION_START = 0;
|
||||
public static final int VERSION_APPEND_ONLY = 1;
|
||||
public static final int VERSION_META_ARRAY = 2;
|
||||
public static final int VERSION_CURRENT = VERSION_META_ARRAY;
|
||||
public static final int VERSION_CHECKSUM = 3;
|
||||
public static final int VERSION_CURRENT = VERSION_CHECKSUM;
|
||||
|
||||
/** Extension of terms file */
|
||||
static final String TERMS_EXTENSION = "tib";
|
||||
|
||||
protected final IndexOutput out;
|
||||
protected IndexOutput out;
|
||||
final PostingsWriterBase postingsWriter;
|
||||
final FieldInfos fieldInfos;
|
||||
FieldInfo currentField;
|
||||
|
@ -176,26 +177,30 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable {
|
|||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
if (out != null) {
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
|
||||
out.writeVInt(fields.size());
|
||||
for(FieldMetaData field : fields) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.numTerms);
|
||||
out.writeVLong(field.termsStartPointer);
|
||||
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
out.writeVLong(field.sumTotalTermFreq);
|
||||
}
|
||||
out.writeVLong(field.sumDocFreq);
|
||||
out.writeVInt(field.docCount);
|
||||
if (VERSION_CURRENT >= VERSION_META_ARRAY) {
|
||||
out.writeVInt(field.longsSize);
|
||||
out.writeVInt(fields.size());
|
||||
for(FieldMetaData field : fields) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.numTerms);
|
||||
out.writeVLong(field.termsStartPointer);
|
||||
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
|
||||
out.writeVLong(field.sumTotalTermFreq);
|
||||
}
|
||||
out.writeVLong(field.sumDocFreq);
|
||||
out.writeVInt(field.docCount);
|
||||
if (VERSION_CURRENT >= VERSION_META_ARRAY) {
|
||||
out.writeVInt(field.longsSize);
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
CodecUtil.writeFooter(out);
|
||||
} finally {
|
||||
IOUtils.close(out, postingsWriter, termsIndexWriter);
|
||||
out = null;
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
} finally {
|
||||
IOUtils.close(out, postingsWriter, termsIndexWriter);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -66,6 +66,8 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
// start of the field info data
|
||||
private long dirOffset;
|
||||
|
||||
private int version;
|
||||
|
||||
public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
|
||||
throws IOException {
|
||||
|
||||
|
@ -78,6 +80,11 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
try {
|
||||
|
||||
readHeader(in);
|
||||
|
||||
if (version >= FixedGapTermsIndexWriter.VERSION_CHECKSUM) {
|
||||
CodecUtil.checksumEntireFile(in);
|
||||
}
|
||||
|
||||
indexInterval = in.readVInt();
|
||||
if (indexInterval < 1) {
|
||||
throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
|
||||
|
@ -124,7 +131,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
}
|
||||
|
||||
/**
 * Validates the codec header of {@code input} against
 * {@code FixedGapTermsIndexWriter.CODEC_NAME} and stores the value returned
 * by {@code CodecUtil.checkHeader} in the {@code version} field.
 *
 * @param input the terms index input to read the header from
 */
private void readHeader(IndexInput input) throws IOException {
|
||||
// NOTE(review): the next two lines look like the before/after sides of the
// diff; only the second one assigns the result to `version` - confirm.
CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
|
||||
version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
|
||||
// Both version arguments are VERSION_CURRENT. Presumably these are the min and
// max accepted versions, which would make every later
// `version >= VERSION_CHECKSUM` test always true - TODO confirm.
FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
|
||||
}
|
||||
|
||||
|
@ -273,7 +280,11 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
public void close() throws IOException {}
|
||||
|
||||
/**
 * Positions {@code input} at the offset stored in the trailing long of the file.
 * The long is read from 8 bytes before the end of the file, or from 8 bytes
 * before the codec footer when the version is at least VERSION_CHECKSUM.
 *
 * @param input the terms index input to reposition
 * @param dirOffset overwritten with the value read from the file before it is
 *        used, so the caller's value has no effect
 */
private void seekDir(IndexInput input, long dirOffset) throws IOException {
|
||||
// NOTE(review): this unconditional seek looks like the pre-change side of
// the diff; the branch below seeks again in either case - confirm.
input.seek(input.length() - 8);
|
||||
if (version >= FixedGapTermsIndexWriter.VERSION_CHECKSUM) {
|
||||
// Skip back over the footer as well as the 8-byte offset.
input.seek(input.length() - CodecUtil.footerLength() - 8);
|
||||
} else {
|
||||
input.seek(input.length() - 8);
|
||||
}
|
||||
dirOffset = input.readLong();
|
||||
input.seek(dirOffset);
|
||||
}
|
||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
@ -43,7 +42,7 @@ import java.io.IOException;
|
|||
*
|
||||
* @lucene.experimental */
|
||||
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
protected final IndexOutput out;
|
||||
protected IndexOutput out;
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tii";
|
||||
|
@ -52,7 +51,8 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
final static int VERSION_START = 0;
|
||||
final static int VERSION_APPEND_ONLY = 1;
|
||||
final static int VERSION_MONOTONIC_ADDRESSING = 2;
|
||||
final static int VERSION_CURRENT = VERSION_MONOTONIC_ADDRESSING;
|
||||
final static int VERSION_CHECKSUM = 3;
|
||||
final static int VERSION_CURRENT = VERSION_CHECKSUM;
|
||||
|
||||
final static int BLOCKSIZE = 4096;
|
||||
final private int termIndexInterval;
|
||||
|
@ -207,38 +207,42 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
final int fieldCount = fields.size();
|
||||
if (out != null) {
|
||||
boolean success = false;
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
final int fieldCount = fields.size();
|
||||
|
||||
int nonNullFieldCount = 0;
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
SimpleFieldWriter field = fields.get(i);
|
||||
if (field.numIndexTerms > 0) {
|
||||
nonNullFieldCount++;
|
||||
int nonNullFieldCount = 0;
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
SimpleFieldWriter field = fields.get(i);
|
||||
if (field.numIndexTerms > 0) {
|
||||
nonNullFieldCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVInt(nonNullFieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
SimpleFieldWriter field = fields.get(i);
|
||||
if (field.numIndexTerms > 0) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVInt(field.numIndexTerms);
|
||||
out.writeVLong(field.termsStart);
|
||||
out.writeVLong(field.indexStart);
|
||||
out.writeVLong(field.packedIndexStart);
|
||||
out.writeVLong(field.packedOffsetsStart);
|
||||
out.writeVInt(nonNullFieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
SimpleFieldWriter field = fields.get(i);
|
||||
if (field.numIndexTerms > 0) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVInt(field.numIndexTerms);
|
||||
out.writeVLong(field.termsStart);
|
||||
out.writeVLong(field.indexStart);
|
||||
out.writeVLong(field.packedIndexStart);
|
||||
out.writeVLong(field.packedOffsetsStart);
|
||||
}
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(out);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(out);
|
||||
writeTrailer(dirStart);
|
||||
CodecUtil.writeFooter(out);
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(out);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(out);
|
||||
}
|
||||
out = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,6 +63,10 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
|
||||
version = readHeader(in);
|
||||
|
||||
if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM) {
|
||||
CodecUtil.checksumEntireFile(in);
|
||||
}
|
||||
|
||||
seekDir(in, dirOffset);
|
||||
|
||||
// Read directory
|
||||
|
@ -190,7 +194,10 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
public void close() throws IOException {}
|
||||
|
||||
private void seekDir(IndexInput input, long dirOffset) throws IOException {
|
||||
if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
|
||||
if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM) {
|
||||
input.seek(input.length() - CodecUtil.footerLength() - 8);
|
||||
dirOffset = input.readLong();
|
||||
} else if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
|
||||
input.seek(input.length() - 8);
|
||||
dirOffset = input.readLong();
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ import org.apache.lucene.util.fst.Util;
|
|||
*
|
||||
* @lucene.experimental */
|
||||
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
protected final IndexOutput out;
|
||||
protected IndexOutput out;
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tiv";
|
||||
|
@ -53,7 +53,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
final static String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
|
||||
final static int VERSION_START = 0;
|
||||
final static int VERSION_APPEND_ONLY = 1;
|
||||
final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
|
||||
final static int VERSION_CHECKSUM = 2;
|
||||
final static int VERSION_CURRENT = VERSION_CHECKSUM;
|
||||
|
||||
private final List<FSTFieldWriter> fields = new ArrayList<>();
|
||||
|
||||
|
@ -290,30 +291,34 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
final int fieldCount = fields.size();
|
||||
if (out != null) {
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
final int fieldCount = fields.size();
|
||||
|
||||
int nonNullFieldCount = 0;
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
nonNullFieldCount++;
|
||||
int nonNullFieldCount = 0;
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
nonNullFieldCount++;
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVInt(nonNullFieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.indexStart);
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
CodecUtil.writeFooter(out);
|
||||
} finally {
|
||||
out.close();
|
||||
out = null;
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVInt(nonNullFieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.indexStart);
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeTrailer(long dirStart) throws IOException {
|
||||
|
|
|
@ -39,8 +39,8 @@ import org.apache.lucene.index.SegmentReadState;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -66,7 +66,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
|
|||
* </p>
|
||||
* <ul>
|
||||
* <li>BloomFilter (.blm) --> Header, DelegatePostingsFormatName,
|
||||
* NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
|
||||
* NumFilteredFields, Filter<sup>NumFilteredFields</sup>, Footer</li>
|
||||
* <li>Filter --> FieldNumber, FuzzySet</li>
|
||||
* <li>FuzzySet -->See {@link FuzzySet#serialize(DataOutput)}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
|
@ -75,13 +75,16 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
|
|||
* <li>NumFilteredFields --> {@link DataOutput#writeInt Uint32}</li>
|
||||
* <li>FieldNumber --> {@link DataOutput#writeInt Uint32} The number of the
|
||||
* field in this segment</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
||||
|
||||
public static final String BLOOM_CODEC_NAME = "BloomFilter";
|
||||
public static final int BLOOM_CODEC_VERSION = 1;
|
||||
public static final int VERSION_START = 1;
|
||||
public static final int VERSION_CHECKSUM = 2;
|
||||
public static final int VERSION_CURRENT = VERSION_CHECKSUM;
|
||||
|
||||
/** Extension of Bloom Filters file */
|
||||
static final String BLOOM_EXTENSION = "blm";
|
||||
|
@ -157,12 +160,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
|
||||
String bloomFileName = IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
|
||||
IndexInput bloomIn = null;
|
||||
ChecksumIndexInput bloomIn = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
bloomIn = state.directory.openInput(bloomFileName, state.context);
|
||||
CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
|
||||
BLOOM_CODEC_VERSION);
|
||||
bloomIn = state.directory.openChecksumInput(bloomFileName, state.context);
|
||||
int version = CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT);
|
||||
// // Load the hash function used in the BloomFilter
|
||||
// hashFunction = HashFunction.forName(bloomIn.readString());
|
||||
// Load the delegate postings format
|
||||
|
@ -178,6 +180,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
|
||||
bloomsByFieldName.put(fieldInfo.name, bloom);
|
||||
}
|
||||
if (version >= VERSION_CHECKSUM) {
|
||||
CodecUtil.checkFooter(bloomIn);
|
||||
} else {
|
||||
CodecUtil.checkEOF(bloomIn);
|
||||
}
|
||||
IOUtils.close(bloomIn);
|
||||
success = true;
|
||||
} finally {
|
||||
|
@ -390,6 +397,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
return sizeInBytes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
delegateFieldsProducer.checkIntegrity();
|
||||
}
|
||||
}
|
||||
|
||||
class BloomFilteredFieldsConsumer extends FieldsConsumer {
|
||||
|
@ -466,10 +478,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
|
||||
IndexOutput bloomOutput = null;
|
||||
try {
|
||||
bloomOutput = state.directory
|
||||
.createOutput(bloomFileName, state.context);
|
||||
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
|
||||
BLOOM_CODEC_VERSION);
|
||||
bloomOutput = state.directory.createOutput(bloomFileName, state.context);
|
||||
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT);
|
||||
// remember the name of the postings format we will delegate to
|
||||
bloomOutput.writeString(delegatePostingsFormat.getName());
|
||||
|
||||
|
@ -481,6 +491,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
bloomOutput.writeInt(fieldInfo.number);
|
||||
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
|
||||
}
|
||||
CodecUtil.writeFooter(bloomOutput);
|
||||
} finally {
|
||||
IOUtils.close(bloomOutput);
|
||||
}
|
||||
|
|
|
@ -1,171 +0,0 @@
|
|||
package org.apache.lucene.codecs.intblock;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Naive int block API that writes vInts. This is
|
||||
* expected to give poor performance; it's really only for
|
||||
* testing the pluggability. One should typically use pfor instead. */
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.sep.IntIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/** Abstract base class that reads fixed-size blocks of ints
|
||||
* from an IndexInput. While this is a simple approach, a
|
||||
* more performant approach would directly create an impl
|
||||
* of IntIndexInput inside Directory. Wrapping a generic
|
||||
* IndexInput will likely cost performance.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class FixedIntBlockIndexInput extends IntIndexInput {
|
||||
|
||||
private final IndexInput in;
|
||||
protected final int blockSize;
|
||||
|
||||
public FixedIntBlockIndexInput(final IndexInput in) throws IOException {
|
||||
this.in = in;
|
||||
blockSize = in.readVInt();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntIndexInput.Reader reader() throws IOException {
|
||||
final int[] buffer = new int[blockSize];
|
||||
final IndexInput clone = in.clone();
|
||||
// TODO: can this be simplified?
|
||||
return new Reader(clone, buffer, this.getBlockReader(clone, buffer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntIndexInput.Index index() {
|
||||
return new Index();
|
||||
}
|
||||
|
||||
protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException;
|
||||
|
||||
/**
|
||||
* Interface for fixed-size block decoders.
|
||||
* <p>
|
||||
* Implementations should decode into the buffer in {@link #readBlock}.
|
||||
*/
|
||||
public interface BlockReader {
|
||||
public void readBlock() throws IOException;
|
||||
}
|
||||
|
||||
private static class Reader extends IntIndexInput.Reader {
|
||||
private final IndexInput in;
|
||||
private final BlockReader blockReader;
|
||||
private final int blockSize;
|
||||
private final int[] pending;
|
||||
|
||||
private int upto;
|
||||
private boolean seekPending;
|
||||
private long pendingFP;
|
||||
private long lastBlockFP = -1;
|
||||
|
||||
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
|
||||
this.in = in;
|
||||
this.pending = pending;
|
||||
this.blockSize = pending.length;
|
||||
this.blockReader = blockReader;
|
||||
upto = blockSize;
|
||||
}
|
||||
|
||||
void seek(final long fp, final int upto) {
|
||||
assert upto < blockSize;
|
||||
if (seekPending || fp != lastBlockFP) {
|
||||
pendingFP = fp;
|
||||
seekPending = true;
|
||||
}
|
||||
this.upto = upto;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() throws IOException {
|
||||
if (seekPending) {
|
||||
// Seek & load new block
|
||||
in.seek(pendingFP);
|
||||
lastBlockFP = pendingFP;
|
||||
blockReader.readBlock();
|
||||
seekPending = false;
|
||||
} else if (upto == blockSize) {
|
||||
// Load new block
|
||||
lastBlockFP = in.getFilePointer();
|
||||
blockReader.readBlock();
|
||||
upto = 0;
|
||||
}
|
||||
return pending[upto++];
|
||||
}
|
||||
}
|
||||
|
||||
private class Index extends IntIndexInput.Index {
|
||||
private long fp;
|
||||
private int upto;
|
||||
|
||||
@Override
|
||||
public void read(final DataInput indexIn, final boolean absolute) throws IOException {
|
||||
if (absolute) {
|
||||
upto = indexIn.readVInt();
|
||||
fp = indexIn.readVLong();
|
||||
} else {
|
||||
final int uptoDelta = indexIn.readVInt();
|
||||
if ((uptoDelta & 1) == 1) {
|
||||
// same block
|
||||
upto += uptoDelta >>> 1;
|
||||
} else {
|
||||
// new block
|
||||
upto = uptoDelta >>> 1;
|
||||
fp += indexIn.readVLong();
|
||||
}
|
||||
}
|
||||
assert upto < blockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(final IntIndexInput.Reader other) throws IOException {
|
||||
((Reader) other).seek(fp, upto);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyFrom(final IntIndexInput.Index other) {
|
||||
final Index idx = (Index) other;
|
||||
fp = idx.fp;
|
||||
upto = idx.upto;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Index clone() {
|
||||
Index other = new Index();
|
||||
other.fp = fp;
|
||||
other.upto = upto;
|
||||
return other;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "fp=" + fp + " upto=" + upto;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,128 +0,0 @@
|
|||
package org.apache.lucene.codecs.intblock;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Naive int block API that writes vInts. This is
|
||||
* expected to give poor performance; it's really only for
|
||||
* testing the pluggability. One should typically use pfor instead. */
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.sep.IntIndexOutput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
/** Abstract base class that writes fixed-size blocks of ints
|
||||
* to an IndexOutput. While this is a simple approach, a
|
||||
* more performant approach would directly create an impl
|
||||
* of IntIndexOutput inside Directory. Wrapping a generic
|
||||
* IndexOutput will likely cost performance.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class FixedIntBlockIndexOutput extends IntIndexOutput {
|
||||
|
||||
protected final IndexOutput out;
|
||||
private final int blockSize;
|
||||
protected final int[] buffer;
|
||||
private int upto;
|
||||
|
||||
protected FixedIntBlockIndexOutput(IndexOutput out, int fixedBlockSize) throws IOException {
|
||||
blockSize = fixedBlockSize;
|
||||
this.out = out;
|
||||
out.writeVInt(blockSize);
|
||||
buffer = new int[blockSize];
|
||||
}
|
||||
|
||||
protected abstract void flushBlock() throws IOException;
|
||||
|
||||
@Override
|
||||
public IntIndexOutput.Index index() {
|
||||
return new Index();
|
||||
}
|
||||
|
||||
private class Index extends IntIndexOutput.Index {
|
||||
long fp;
|
||||
int upto;
|
||||
long lastFP;
|
||||
int lastUpto;
|
||||
|
||||
@Override
|
||||
public void mark() throws IOException {
|
||||
fp = out.getFilePointer();
|
||||
upto = FixedIntBlockIndexOutput.this.upto;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyFrom(IntIndexOutput.Index other, boolean copyLast) throws IOException {
|
||||
Index idx = (Index) other;
|
||||
fp = idx.fp;
|
||||
upto = idx.upto;
|
||||
if (copyLast) {
|
||||
lastFP = fp;
|
||||
lastUpto = upto;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(DataOutput indexOut, boolean absolute) throws IOException {
|
||||
if (absolute) {
|
||||
indexOut.writeVInt(upto);
|
||||
indexOut.writeVLong(fp);
|
||||
} else if (fp == lastFP) {
|
||||
// same block
|
||||
assert upto >= lastUpto;
|
||||
int uptoDelta = upto - lastUpto;
|
||||
indexOut.writeVInt(uptoDelta << 1 | 1);
|
||||
} else {
|
||||
// new block
|
||||
indexOut.writeVInt(upto << 1);
|
||||
indexOut.writeVLong(fp - lastFP);
|
||||
}
|
||||
lastUpto = upto;
|
||||
lastFP = fp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "fp=" + fp + " upto=" + upto;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(int v) throws IOException {
|
||||
buffer[upto++] = v;
|
||||
if (upto == blockSize) {
|
||||
flushBlock();
|
||||
upto = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
if (upto > 0) {
|
||||
// NOTE: entries in the block after current upto are
|
||||
// invalid
|
||||
flushBlock();
|
||||
}
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue