Merging with trunk.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr5914@1584603 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2014-04-04 10:27:05 +00:00
commit bc43bebedf
491 changed files with 6662 additions and 5337 deletions

View File

@ -15,6 +15,7 @@
<orderEntry type="library" scope="TEST" name="Solrj library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr test framework library" level="project" />
<orderEntry type="library" scope="TEST" name="ICU library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-core-test-files" />
@ -29,5 +30,7 @@
<orderEntry type="module" scope="TEST" module-name="misc" />
<orderEntry type="module" scope="TEST" module-name="join" />
<orderEntry type="module" scope="TEST" module-name="expressions" />
<orderEntry type="module" scope="TEST" module-name="icu" />
<orderEntry type="module" scope="TEST" module-name="analysis-extras" />
</component>
</module>

View File

@ -55,6 +55,10 @@ Documentation
* LUCENE-5392: Add/improve analysis package documentation to reflect
analysis API changes. (Benson Margulies via Robert Muir - pull request #17)
Other
* LUCENE-5563: Removed the sep layout, which has fallen behind on features and doesn't
perform as well as other options. (Robert Muir)
======================= Lucene 4.8.0 =======================
@ -135,6 +139,16 @@ New Features
resort the hits from a first pass search using a Sort or an
Expression. (Simon Willnauer, Robert Muir, Mike McCandless)
* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
the specified length. (Ahmet Arslan via Robert Muir)
* LUCENE-2446: Added checksums to lucene index files. As of 4.8, the last 8
bytes of each file contain a zlib-crc32 checksum. Small metadata files are
verified on load. Larger files can be checked on demand via
AtomicReader.checkIntegrity. You can configure this to happen automatically
before merges by enabling IndexWriterConfig.setCheckIntegrityAtMerge.
(Robert Muir)
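A minimal sketch of the two entry points named in this entry, assuming the Lucene 4.8 API; the wrapper class and method names below are illustrative, not part of the patch:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class ChecksumSketch {
  // On-demand verification: walk the segment readers and checksum the larger files.
  static void verifyIndex(Directory dir) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      for (AtomicReaderContext ctx : reader.leaves()) {
        ctx.reader().checkIntegrity(); // throws CorruptIndexException on a mismatch
      }
    }
  }

  // Opt in to verifying segments automatically before they are merged.
  static IndexWriterConfig mergeCheckedConfig(Analyzer analyzer) {
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    iwc.setCheckIntegrityAtMerge(true);
    return iwc;
  }
}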
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
@ -210,8 +224,18 @@ Bug fixes
* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
* LUCENE-5555: Fix SortedInputIterator to correctly encode/decode contexts in presence of payload (Areek Zillur)
* LUCENE-5559: Add missing argument checks to tokenfilters taking
numeric arguments. (Ahmet Arslan via Robert Muir)
* LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
Test Framework
* LUCENE-5567: When a suite fails with zombie threads, the failure marker and count
are not propagated properly. (Dawid Weiss)
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
* LUCENE-5501: Added random out-of-order collection testing (when the collector

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.br;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@ -64,7 +65,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ckb;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -61,7 +62,7 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -32,6 +32,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
import java.nio.charset.StandardCharsets;
/**
* {@link Analyzer} for Czech language.
@ -60,7 +61,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.da;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +64,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.de;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -68,7 +69,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.es;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -62,7 +63,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fi;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +64,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -36,6 +36,7 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/**
@ -79,7 +80,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.gl;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -61,7 +62,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hu;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +64,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -50,6 +50,7 @@ import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
@ -672,7 +673,7 @@ public class Dictionary {
int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(IOUtils.CHARSET_UTF_8));
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
} else {
String text = line.substring(0, flagSep);
CharSequence cleansed = cleanInput(text, sb);
@ -681,10 +682,10 @@ public class Dictionary {
sb.append(cleansed);
}
sb.append(line.substring(flagSep));
writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
} else {
writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
writer.write(line.getBytes(StandardCharsets.UTF_8));
}
}
}

View File

@ -21,8 +21,7 @@ import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import org.apache.lucene.util.IOUtils;
import java.nio.charset.StandardCharsets;
// many hunspell dictionaries use this encoding, yet java does not have it?!?!
final class ISO8859_14Decoder extends CharsetDecoder {
@ -43,7 +42,7 @@ final class ISO8859_14Decoder extends CharsetDecoder {
};
ISO8859_14Decoder() {
super(IOUtils.CHARSET_UTF_8, 1f, 1f);
super(StandardCharsets.ISO_8859_1 /* fake with similar properties */, 1f, 1f);
}
@Override

View File

@ -19,13 +19,13 @@ package org.apache.lucene.analysis.it;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -72,7 +72,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.lv;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -61,7 +62,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -32,7 +32,7 @@ public final class LengthFilter extends FilteringTokenFilter {
private final int min;
private final int max;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
@ -46,6 +46,12 @@ public final class LengthFilter extends FilteringTokenFilter {
*/
public LengthFilter(Version version, TokenStream in, int min, int max) {
super(version, in);
if (min < 0) {
throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
}
if (min > max) {
throw new IllegalArgumentException("maximum length must not be greater than minimum length");
}
this.min = min;
this.max = max;
}

View File

@ -61,6 +61,9 @@ public final class LimitTokenCountFilter extends TokenFilter {
*/
public LimitTokenCountFilter(TokenStream in, int maxTokenCount, boolean consumeAllTokens) {
super(in);
if (maxTokenCount < 1) {
throw new IllegalArgumentException("maxTokenCount must be greater than zero");
}
this.maxTokenCount = maxTokenCount;
this.consumeAllTokens = consumeAllTokens;
}

View File

@ -67,6 +67,9 @@ public final class LimitTokenPositionFilter extends TokenFilter {
*/
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
super(in);
if (maxTokenPosition < 1) {
throw new IllegalArgumentException("maxTokenPosition must be greater than zero");
}
this.maxTokenPosition = maxTokenPosition;
this.consumeAllTokens = consumeAllTokens;
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import java.io.IOException;
/**
* A token filter that truncates terms to a specific length.
* Fixed-prefix truncation, as a stemming method, produces good results for the Turkish language.
* It is reported that F5, using the first 5 characters, produced the best results in
* <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
* Information Retrieval on Turkish Texts</a>.
*/
public final class TruncateTokenFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final int length;
public TruncateTokenFilter(TokenStream input, int length) {
super(input);
if (length < 1)
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
this.length = length;
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
termAttribute.setLength(length);
return true;
} else {
return false;
}
}
}
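A minimal consumption sketch for the new filter, assuming the trunk-style WhitespaceTokenizer(Version) constructor and Tokenizer.setReader(...) used elsewhere in this diff; the input text and the F5 prefix length are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TruncateDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT);
    tokenizer.setReader(new StringReader("internationalization localization"));
    TokenStream stream = new TruncateTokenFilter(tokenizer, 5); // F5-style prefix truncation
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // prints "inter", then "local"
    }
    stream.end();
    stream.close();
  }
}

Note that keyword-marked tokens (KeywordAttribute.isKeyword() == true) pass through untruncated, which is what lets the KeywordRepeatFilter/RemoveDuplicatesTokenFilter combination in the factory's example field type keep the original surface form alongside the truncated one.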

View File

@ -0,0 +1,59 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following field type is recommended for "<i>diacritics-insensitive search</i>" in Turkish.
* <pre class="prettyprint">
* &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.ApostropheFilterFactory"/&gt;
* &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
* &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
* &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class TruncateTokenFilterFactory extends TokenFilterFactory {
public static final String PREFIX_LENGTH_KEY = "prefixLength";
private final byte prefixLength;
public TruncateTokenFilterFactory(Map<String, String> args) {
super(args);
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
if (prefixLength < 1)
throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameter(s): " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
return new TruncateTokenFilter(input, prefixLength);
}
}

View File

@ -31,16 +31,14 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.FST;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
/**
* {@link Analyzer} for Dutch language.
@ -75,7 +73,7 @@ public final class DutchAnalyzer extends Analyzer {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.no;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +64,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.payloads;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.BytesRef;
@ -28,7 +29,7 @@ import org.apache.lucene.util.BytesRef;
*
**/
public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
protected Charset charset = Charset.forName("UTF-8");
protected Charset charset = StandardCharsets.UTF_8;
public IdentityEncoder() {
}

View File

@ -45,8 +45,8 @@ public class TypeAsPayloadTokenFilter extends TokenFilter {
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String type = typeAtt.type();
if (type != null && type.equals("") == false) {
payloadAtt.setPayload(new BytesRef(type.getBytes("UTF-8")));
if (type != null && !type.isEmpty()) {
payloadAtt.setPayload(new BytesRef(type));
}
return true;
} else {

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.pt;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -62,7 +63,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@ -247,7 +248,7 @@ public abstract class RSLPStemmerBase {
// TODO: this parser is ugly, but works. use a jflex grammar instead.
try {
InputStream is = clazz.getResourceAsStream(resource);
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
Map<String,Step> steps = new HashMap<>();
String step;
while ((step = readLine(r)) != null) {

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -53,7 +54,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -31,6 +31,12 @@ public class TokenRangeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
private int count;
public TokenRangeSinkFilter(int lower, int upper) {
if (lower < 1) {
throw new IllegalArgumentException("lower must be greater than zero");
}
if (lower > upper) {
throw new IllegalArgumentException("lower must not be greater than upper");
}
this.lower = lower;
this.upper = upper;
}

View File

@ -84,6 +84,9 @@ public final class ClassicTokenizer extends Tokenizer {
/** Set the max allowed token length. Any token longer
* than this is skipped. */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
}
this.maxTokenLength = length;
}

View File

@ -98,6 +98,9 @@ public final class StandardTokenizer extends Tokenizer {
/** Set the max allowed token length. Any token longer
* than this is skipped. */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
}
this.maxTokenLength = length;
}

View File

@ -84,6 +84,9 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
/** Set the max allowed token length. Any token longer
* than this is skipped. */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
}
this.maxTokenLength = length;
}
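The same guard now protects ClassicTokenizer, StandardTokenizer and UAX29URLEmailTokenizer. A small usage sketch, assuming the trunk-style StandardTokenizer(Version) constructor and setReader(...) used elsewhere in this diff; the input text and the limit of 10 are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class MaxTokenLengthDemo {
  public static void main(String[] args) throws Exception {
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT);
    tokenizer.setReader(new StringReader("short supercalifragilistic"));
    tokenizer.setMaxTokenLength(10);   // tokens longer than 10 chars are skipped
    // tokenizer.setMaxTokenLength(0); // would now throw IllegalArgumentException
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // prints only "short"
    }
    tokenizer.end();
    tokenizer.close();
  }
}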

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.sv;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +64,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -24,6 +24,7 @@ import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Iterator;
@ -157,8 +158,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
/**
* Load synonyms with the given {@link SynonymMap.Parser} class.
*/
private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

View File

@ -27,6 +27,7 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@ -252,7 +253,7 @@ public abstract class AbstractAnalysisFactory {
* Returns the resource's lines (with content treated as UTF-8)
*/
protected final List<String> getLines(ResourceLoader loader, String resource) throws IOException {
return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
return WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
}
/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
@ -272,7 +273,7 @@ public abstract class AbstractAnalysisFactory {
Reader reader = null;
try {
stream = loader.openResource(file.trim());
CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
reader = new InputStreamReader(stream, decoder);
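The decoder pattern used here, and in SynonymFilterFactory above, configures CodingErrorAction.REPORT for both malformed input and unmappable characters, so resource loading fails fast instead of silently substituting U+FFFD. A standalone, JDK-only sketch of the same idiom; the helper class and method names are illustrative:

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public final class StrictUtf8 {
  // Returns a reader that throws MalformedInputException on invalid UTF-8
  // instead of quietly replacing bad bytes with the replacement character.
  public static BufferedReader open(InputStream in) {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    return new BufferedReader(new InputStreamReader(in, decoder));
  }
}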

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.IOUtils;
@ -97,7 +98,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
final String comment) throws IOException {
Reader reader = null;
try {
reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase));
} finally {
IOUtils.close(reader);
@ -122,7 +123,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
Version matchVersion) throws IOException {
Reader reader = null;
try {
reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
reader = IOUtils.getDecodingReader(stopwords, StandardCharsets.UTF_8);
return WordlistLoader.getWordSet(reader, matchVersion);
} finally {
IOUtils.close(reader);

View File

@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory

View File

@ -23,6 +23,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@ -78,7 +79,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, StandardCharsets.UTF_8));
StringBuilder builder = new StringBuilder();
int ch = -1;
while ((ch = reader.read()) != -1){
@ -95,7 +96,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testMSWord14GeneratedHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, StandardCharsets.UTF_8));
String gold = "This is a test";
StringBuilder builder = new StringBuilder();
int ch = 0;

View File

@ -15,6 +15,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -269,7 +270,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
String luceneResourcesWikiPage;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("LuceneResourcesWikiPage.html"), "UTF-8");
("LuceneResourcesWikiPage.html"), StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
@ -289,7 +290,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
try {
List<String> urlList = new ArrayList<>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
@ -313,7 +314,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
String randomTextWithEmails;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("random.text.with.email.addresses.txt"), "UTF-8");
("random.text.with.email.addresses.txt"), StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
@ -334,7 +335,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
List<String> emailList = new ArrayList<>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream
("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
("email.addresses.from.random.text.with.email.addresses.txt"), StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
@ -383,7 +384,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
String randomTextWithURLs;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("random.text.with.urls.txt"), "UTF-8");
("random.text.with.urls.txt"), StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
@ -404,7 +405,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
List<String> urlList = new ArrayList<>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream
("urls.from.random.text.with.urls.txt"), "UTF-8"));
("urls.from.random.text.with.urls.txt"), StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -157,7 +158,7 @@ public class TestAllDictionaries extends LuceneTestCase {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
@ -186,7 +187,7 @@ public class TestAllDictionaries extends LuceneTestCase {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -173,7 +174,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
@ -202,7 +203,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);

View File

@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import org.apache.lucene.util.BytesRef;
@ -232,10 +233,10 @@ public class TestDictionary extends LuceneTestCase {
}
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\tUTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\t UTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(IOUtils.CHARSET_UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(IOUtils.CHARSET_UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
assertEquals("UTF-8", Dictionary.getDictionaryEncoding(new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
}
public void testFlagWithCrazyWhitespace() throws Exception {

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.junit.Test;
public class TestLengthFilter extends BaseTokenStreamTestCase {
@ -50,4 +51,11 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
checkOneTerm(a, "", "");
}
/**
* checking the validity of constructor arguments
*/
@Test(expected = IllegalArgumentException.class)
public void testIllegalArguments() throws Exception {
new LengthFilter(TEST_VERSION_CURRENT, whitespaceMockTokenizer("accept only valid arguments"), -4, -1);
}
}

View File

@ -1,11 +1,12 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@ -31,21 +32,36 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer)stream).setReader(reader);
stream = tokenFilterFactory("Length",
"min", "4",
"max", "10").create(stream);
LengthFilterFactory.MIN_KEY, "4",
LengthFilterFactory.MAX_KEY, "10").create(stream);
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("Length",
"min", "4",
"max", "5",
tokenFilterFactory("Length",
LengthFilterFactory.MIN_KEY, "4",
LengthFilterFactory.MAX_KEY, "5",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
/** Test that invalid arguments result in exception */
public void testInvalidArguments() throws Exception {
try {
Reader reader = new StringReader("foo foobar super-duper-trooper");
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer)stream).setReader(reader);
tokenFilterFactory("Length",
LengthFilterFactory.MIN_KEY, "5",
LengthFilterFactory.MAX_KEY, "4").create(stream);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("maximum length must not be greater than minimum length"));
}
}
}

View File

@ -0,0 +1,40 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
public class TestLimitTokenCountFilter extends BaseTokenStreamTestCase {
public void test() throws Exception {
for (final boolean consumeAll : new boolean[]{true, false}) {
MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = new LimitTokenCountFilter(tokenizer, 3, consumeAll);
assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
}
}
@Test(expected = IllegalArgumentException.class)
public void testIllegalArguments() throws Exception {
new LimitTokenCountFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
}
}

View File

@ -1,11 +1,12 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@ -16,25 +17,28 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import java.io.Reader;
import java.io.StringReader;
public class TestLimitTokenCountFilterFactory extends BaseTokenStreamFactoryTestCase {
public void test() throws Exception {
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(reader);
// LimitTokenCountFilter doesn't consume the entire stream that it wraps
tokenizer.setEnableChecks(false);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("LimitTokenCount",
"maxTokenCount", "3").create(stream);
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
for (final boolean consumeAll : new boolean[]{true, false}) {
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(reader);
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("LimitTokenCount",
LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
LimitTokenCountFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
).create(stream);
assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
}
}
public void testRequired() throws Exception {
@ -44,15 +48,17 @@ public class TestLimitTokenCountFilterFactory extends BaseTokenStreamFactoryTest
fail();
} catch (IllegalArgumentException e) {
assertTrue("exception doesn't mention param: " + e.getMessage(),
0 < e.getMessage().indexOf(LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY));
0 < e.getMessage().indexOf(LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY));
}
}
/** Test that bogus arguments result in exception */
/**
* Test that bogus arguments result in exception
*/
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("LimitTokenCount",
"maxTokenCount", "3",
tokenFilterFactory("LimitTokenCount",
LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {

View File

@ -16,10 +16,6 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
@ -27,11 +23,15 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
public void testMaxPosition2() throws IOException {
for (final boolean consumeAll : new boolean[] { true, false }) {
for (final boolean consumeAll : new boolean[]{true, false}) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
@ -42,43 +42,50 @@ public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
}
};
// dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
assertTokenStreamContents(a.tokenStream("dummy", "1 2 3 4 5"),
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
// don't use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case it's correct)!
assertTokenStreamContents(a.tokenStream("dummy", "1 2 3 4 5"),
new String[]{"1", "2"}, new int[]{0, 3}, new int[]{1, 4}, consumeAll ? 16 : null);
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
new String[]{"1", "2"}, new int[]{0, 2}, new int[]{1, 3}, consumeAll ? 9 : null);
// less than the limit, ensure we behave correctly
assertTokenStreamContents(a.tokenStream("dummy", "1 "),
new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
new String[]{"1"}, new int[]{0}, new int[]{1}, consumeAll ? 3 : null);
// equal to limit
assertTokenStreamContents(a.tokenStream("dummy", "1 2 "),
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
assertTokenStreamContents(a.tokenStream("dummy", "1 2 "),
new String[]{"1", "2"}, new int[]{0, 3}, new int[]{1, 4}, consumeAll ? 6 : null);
}
}
public void testMaxPosition3WithSynomyms() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("one"), new CharsRef("first"), true);
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
CharsRef multiWordCharsRef = new CharsRef();
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
builder.add(new CharsRef("one"), multiWordCharsRef, true);
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
builder.add(new CharsRef("two"), multiWordCharsRef, true);
SynonymMap synonymMap = builder.build();
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
assertTokenStreamContents(stream,
new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
for (final boolean consumeAll : new boolean[]{true, false}) {
MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
// if we are consuming all tokens, we can use the checks, otherwise we can't
tokenizer.setEnableChecks(consumeAll);
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("one"), new CharsRef("first"), true);
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
CharsRef multiWordCharsRef = new CharsRef();
SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
builder.add(new CharsRef("one"), multiWordCharsRef, true);
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
builder.add(new CharsRef("two"), multiWordCharsRef, true);
SynonymMap synonymMap = builder.build();
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
stream = new LimitTokenPositionFilter(stream, 3, consumeAll);
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
assertTokenStreamContents(stream,
new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
}
}
@Test(expected = IllegalArgumentException.class)
public void testIllegalArguments() throws Exception {
new LimitTokenPositionFilter(whitespaceMockTokenizer("one two three four five"), 0);
}
}

View File

@ -16,26 +16,30 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import java.io.Reader;
import java.io.StringReader;
public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testMaxPosition1() throws Exception {
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
tokenizer.setEnableChecks(false);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("LimitTokenPosition",
"maxTokenPosition", "1").create(stream);
assertTokenStreamContents(stream, new String[] { "A1" });
for (final boolean consumeAll : new boolean[]{true, false}) {
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
// if we are consuming all tokens, we can use the checks, otherwise we can't
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("LimitTokenPosition",
LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
).create(stream);
assertTokenStreamContents(stream, new String[]{"A1"});
}
}
public void testMissingParam() throws Exception {
try {
tokenFilterFactory("LimitTokenPosition");
@ -47,34 +51,31 @@ public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamFactoryT
}
public void testMaxPosition1WithShingles() throws Exception {
Reader reader = new StringReader("one two three four five");
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
tokenizer.setEnableChecks(false);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("Shingle",
"minShingleSize", "2",
"maxShingleSize", "3",
"outputUnigrams", "true").create(stream);
stream = tokenFilterFactory("LimitTokenPosition",
"maxTokenPosition", "1").create(stream);
assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
for (final boolean consumeAll : new boolean[]{true, false}) {
Reader reader = new StringReader("one two three four five");
MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
// if we are consuming all tokens, we can use the checks, otherwise we can't
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("Shingle",
"minShingleSize", "2",
"maxShingleSize", "3",
"outputUnigrams", "true").create(stream);
stream = tokenFilterFactory("LimitTokenPosition",
LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
).create(stream);
assertTokenStreamContents(stream, new String[]{"one", "one two", "one two three"});
}
}
public void testConsumeAllTokens() throws Exception {
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("LimitTokenPosition",
"maxTokenPosition", "3",
"consumeAllTokens", "true").create(stream);
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
}
/** Test that bogus arguments result in exception */
/**
* Test that bogus arguments result in exception
*/
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("LimitTokenPosition",
"maxTokenPosition", "3",
tokenFilterFactory("LimitTokenPosition",
"maxTokenPosition", "3",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {

View File

@ -0,0 +1,39 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
/**
* Test the truncate token filter.
*/
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
public void testTruncating() throws Exception {
TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
stream = new TruncateTokenFilter(stream, 5);
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
}
@Test(expected = IllegalArgumentException.class)
public void testNonPositiveLength() throws Exception {
new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"), -48);
}
}
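
For readers unfamiliar with the filter added by LUCENE-5558, here is a minimal consumption sketch. It is illustrative only, not part of this patch; whitespaceMockTokenizer and the standard reset/incrementToken/end/close contract are taken from the test style above, and the input text is arbitrary.

  // Sketch: consuming TruncateTokenFilter directly (assumes a test-framework context,
  // i.e. whitespaceMockTokenizer(String) as used in the tests above).
  void printTruncatedTerms() throws IOException {
    TokenStream ts = new TruncateTokenFilter(
        whitespaceMockTokenizer("internationalization i18n"), 5);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString()); // "inter", then "i18n" (already within the limit)
    }
    ts.end();
    ts.close();
  }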

View File

@ -0,0 +1,73 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import java.io.Reader;
import java.io.StringReader;
/**
* Simple tests to ensure the simple truncation filter factory is working.
*/
public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
/**
* Ensure the filter actually truncates text.
*/
public void testTruncating() throws Exception {
Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer) stream).setReader(reader);
stream = tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"});
}
/**
* Test that bogus arguments result in exception
*/
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
}
}
/**
* Test that negative prefix length result in exception
*/
public void testNonPositivePrefixLengthArgument() throws Exception {
try {
tokenFilterFactory("Truncate",
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5"));
}
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
@ -37,15 +38,15 @@ public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
filter.reset();
assertTermEquals("The", filter, termAtt, payAtt, null);
assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("fox", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
assertTermEquals("jumped", filter, termAtt, payAtt, "VB".getBytes("UTF-8"));
assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("fox", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
assertTermEquals("jumped", filter, termAtt, payAtt, "VB".getBytes(StandardCharsets.UTF_8));
assertTermEquals("over", filter, termAtt, payAtt, null);
assertTermEquals("the", filter, termAtt, payAtt, null);
assertTermEquals("lazy", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
assertTermEquals("lazy", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
@ -59,15 +60,15 @@ public class DelimitedPayloadTokenFilterTest extends BaseTokenStreamTestCase {
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
filter.reset();
assertTermEquals("The", filter, null);
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("red", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("fox", filter, "NN".getBytes("UTF-8"));
assertTermEquals("jumped", filter, "VB".getBytes("UTF-8"));
assertTermEquals("quick", filter, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("red", filter, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("fox", filter, "NN".getBytes(StandardCharsets.UTF_8));
assertTermEquals("jumped", filter, "VB".getBytes(StandardCharsets.UTF_8));
assertTermEquals("over", filter, null);
assertTermEquals("the", filter, null);
assertTermEquals("lazy", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("brown", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("dogs", filter, "NN".getBytes("UTF-8"));
assertTermEquals("lazy", filter, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("brown", filter, "JJ".getBytes(StandardCharsets.UTF_8));
assertTermEquals("dogs", filter, "NN".getBytes(StandardCharsets.UTF_8));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
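
Most of the churn in this patch is the mechanical change visible here: charset names passed as Strings are replaced by java.nio.charset.StandardCharsets constants. A small sketch of the difference (illustrative, not from this commit; method names are hypothetical):

  // String charset names are looked up at runtime, can be mistyped, and force a checked
  // UnsupportedEncodingException; the Charset constants (Java 7+) avoid all three.
  static byte[] oldStyle(String s) throws UnsupportedEncodingException {
    return s.getBytes("UTF-8");                 // name resolved at runtime
  }
  static byte[] newStyle(String s) {
    return s.getBytes(StandardCharsets.UTF_8);  // constant, no checked exception
  }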

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
@ -41,8 +42,8 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
while (nptf.incrementToken()) {
assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
String type = new String(payloadAtt.getPayload().bytes, "UTF-8");
assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()) == true);
String type = payloadAtt.getPayload().utf8ToString();
assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()));
count++;
}

View File

@ -1,11 +1,12 @@
package org.apache.lucene.analysis.sinks;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@ -21,6 +22,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.junit.Test;
public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
@ -29,20 +31,25 @@ public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
String test = "The quick red fox jumped over the lazy brown dogs";
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(whitespaceMockTokenizer(test));
TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
int count = 0;
tee.reset();
while(tee.incrementToken()) {
count++;
}
int sinkCount = 0;
rangeToks.reset();
while (rangeToks.incrementToken()) {
sinkCount++;
}
assertTrue(count + " does not equal: " + 10, count == 10);
assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
}
@Test(expected = IllegalArgumentException.class)
public void testIllegalArguments() throws Exception {
new TokenRangeSinkFilter(4, 2);
}
}

View File

@ -172,4 +172,13 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
public void testIllegalArguments() throws Exception {
try {
tokenizerFactory("UAX29URLEmail", "maxTokenLength", "-1").create();
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("maxTokenLength must be greater than zero"));
}
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
/** Fake resource loader for tests: works if you want to fake reading a single file */
public class StringMockResourceLoader implements ResourceLoader {
@ -50,6 +51,6 @@ public class StringMockResourceLoader implements ResourceLoader {
@Override
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
}
}

View File

@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
@ -49,7 +50,7 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
private void assertClasspathDelegation(ResourceLoader rl) throws Exception {
// try a stopwords file from classpath
CharArraySet set = WordlistLoader.getSnowballWordSet(
new InputStreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), IOUtils.CHARSET_UTF_8),
new InputStreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), StandardCharsets.UTF_8),
TEST_VERSION_CURRENT
);
assertTrue(set.contains("you"));
@ -64,7 +65,7 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
final File base = TestUtil.createTempDir("fsResourceLoaderBase").getAbsoluteFile();
try {
base.mkdirs();
Writer os = new OutputStreamWriter(new FileOutputStream(new File(base, "template.txt")), IOUtils.CHARSET_UTF_8);
Writer os = new OutputStreamWriter(new FileOutputStream(new File(base, "template.txt")), StandardCharsets.UTF_8);
try {
os.write("foobar\n");
} finally {
@ -72,28 +73,28 @@ public class TestFilesystemResourceLoader extends LuceneTestCase {
}
ResourceLoader rl = new FilesystemResourceLoader(base);
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), IOUtils.CHARSET_UTF_8).get(0));
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0));
// Same with full path name:
String fullPath = new File(base, "template.txt").toString();
assertEquals("foobar",
WordlistLoader.getLines(rl.openResource(fullPath), IOUtils.CHARSET_UTF_8).get(0));
WordlistLoader.getLines(rl.openResource(fullPath), StandardCharsets.UTF_8).get(0));
assertClasspathDelegation(rl);
assertNotFound(rl);
// now use RL without base dir:
rl = new FilesystemResourceLoader();
assertEquals("foobar",
WordlistLoader.getLines(rl.openResource(new File(base, "template.txt").toString()), IOUtils.CHARSET_UTF_8).get(0));
WordlistLoader.getLines(rl.openResource(new File(base, "template.txt").toString()), StandardCharsets.UTF_8).get(0));
assertClasspathDelegation(rl);
assertNotFound(rl);
} finally {
TestUtil.rmDir(base);
TestUtil.rm(base);
}
}
public void testDelegation() throws Exception {
ResourceLoader rl = new FilesystemResourceLoader(null, new StringMockResourceLoader("foobar\n"));
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), IOUtils.CHARSET_UTF_8).get(0));
assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0));
}
}

View File

@ -25,6 +25,7 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.util.Date;
import java.util.Locale;
@ -118,7 +119,7 @@ public class GenerateJflexTLDMacros {
connection.connect();
tldFileLastModified = connection.getLastModified();
BufferedReader reader = new BufferedReader
(new InputStreamReader(connection.getInputStream(), "US-ASCII"));
(new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII));
try {
String line;
while (null != (line = reader.readLine())) {
@ -150,7 +151,7 @@ public class GenerateJflexTLDMacros {
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
final Writer writer = new OutputStreamWriter
(new FileOutputStream(outputFile), "UTF-8");
(new FileOutputStream(outputFile), StandardCharsets.UTF_8);
try {
writer.write(APACHE_LICENSE);
writer.write("// Generated from IANA Root Zone Database <");

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.icu.segmentation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -132,7 +132,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
StringBuilder rules = new StringBuilder();
InputStream rulesStream = loader.openResource(filename);
BufferedReader reader = new BufferedReader
(IOUtils.getDecodingReader(rulesStream, IOUtils.CHARSET_UTF_8));
(IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8));
String line = null;
while ((line = reader.readLine()) != null) {
if ( ! line.startsWith("#"))

View File

@ -35,6 +35,7 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -106,7 +107,7 @@ public class GenerateUTR30DataFiles {
private static void expandDataFileRules(File file) throws IOException {
final FileInputStream stream = new FileInputStream(file);
final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
final InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
final BufferedReader bufferedReader = new BufferedReader(reader);
StringBuilder builder = new StringBuilder();
String line;
@ -154,7 +155,7 @@ public class GenerateUTR30DataFiles {
if (modified) {
System.err.println("Expanding rules in and overwriting " + file.getName());
final FileOutputStream out = new FileOutputStream(file, false);
Writer writer = new OutputStreamWriter(out, "UTF-8");
Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);
try {
writer.write(builder.toString());
} finally {
@ -178,8 +179,8 @@ public class GenerateUTR30DataFiles {
System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... ");
URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
BufferedReader reader = new BufferedReader
(new InputStreamReader(connection.getInputStream(), "UTF-8"));
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), "UTF-8");
(new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8));
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), StandardCharsets.UTF_8);
try {
String line;

View File

@ -25,6 +25,7 @@ import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import com.ibm.icu.text.RuleBasedBreakIterator;
@ -37,7 +38,7 @@ public class RBBIRuleCompiler {
static String getRules(File ruleFile) throws IOException {
StringBuilder rules = new StringBuilder();
InputStream in = new FileInputStream(ruleFile);
BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
BufferedReader cin = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
String line = null;
while ((line = cin.readLine()) != null) {
if (!line.startsWith("#"))

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.ja;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.util.ResourceLoader;
@ -52,6 +53,6 @@ class StringMockResourceLoader implements ResourceLoader {
@Override
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
}
}

View File

@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@ -34,7 +35,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.LuceneTestCase.Slow;
@ -49,7 +49,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
}
try {
try {
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
return new UserDictionary(reader);
} finally {
is.close();
@ -571,7 +571,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
/*
public void testWikipedia() throws Exception {
final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
final Reader r = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
final long startTimeNS = System.nanoTime();
boolean done = false;
@ -618,7 +618,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private void doTestBocchan(int numIterations) throws Exception {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getResourceAsStream("bocchan.utf-8"), "UTF-8"));
this.getClass().getResourceAsStream("bocchan.utf-8"), StandardCharsets.UTF_8));
String line = reader.readLine();
reader.close();

View File

@ -22,13 +22,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.util.IOUtils;
public class TestSearchMode extends BaseTokenStreamTestCase {
private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
@ -47,7 +46,7 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
}
try {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
LineNumberReader reader = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
String line = null;
while ((line = reader.readLine()) != null) {
// Remove comments

View File

@ -24,6 +24,7 @@ import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
public class ConnectionCostsBuilder {
@ -32,7 +33,7 @@ public class ConnectionCostsBuilder {
public static ConnectionCostsWriter build(String filename) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
Charset cs = Charset.forName("US-ASCII");
Charset cs = StandardCharsets.US_ASCII;
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

View File

@ -21,10 +21,9 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import org.apache.lucene.util.IOUtils;
/**
* Manages analysis data configuration for SmartChineseAnalyzer
* <p>
@ -80,7 +79,7 @@ public class AnalyzerProfile {
Properties prop = new Properties();
try {
FileInputStream input = new FileInputStream(propFile);
prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8));
prop.load(new InputStreamReader(input, StandardCharsets.UTF_8));
String dir = prop.getProperty("analysis.data.dir", "");
input.close();
return dir;

View File

@ -18,18 +18,16 @@
package org.apache.lucene.analysis.cn.smart;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
@ -90,7 +88,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
// make sure it is unmodifiable as we expose it in the outer class
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils
.getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE,
IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT,
StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT,
Version.LUCENE_CURRENT));
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.pl;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -76,7 +77,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -65,10 +65,10 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@ -139,7 +139,7 @@ public class TestCompile extends LuceneTestCase {
private static void assertTrie(Trie trie, String file, boolean usefull,
boolean storeorig) throws Exception {
LineNumberReader in = new LineNumberReader(new BufferedReader(
new InputStreamReader(new FileInputStream(file), IOUtils.CHARSET_UTF_8)));
new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8)));
for (String line = in.readLine(); line != null; line = in.readLine()) {
try {

View File

@ -18,8 +18,8 @@ package org.apache.lucene.benchmark.byTask;
*/
import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
import org.apache.lucene.benchmark.byTask.utils.Config;
@ -107,7 +107,7 @@ public class Benchmark {
Benchmark benchmark = null;
try {
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, StandardCharsets.UTF_8));
} catch (Exception e) {
e.printStackTrace();
System.exit(1);

View File

@ -18,7 +18,6 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
import java.io.BufferedReader;
import java.io.File;
@ -26,6 +25,7 @@ import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@ -206,7 +206,7 @@ public class DirContentSource extends ContentSource {
name = f.getCanonicalPath()+"_"+iteration;
}
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
String line = null;
//First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine();

View File

@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.Closeable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
@ -318,7 +319,7 @@ public class DocMaker implements Closeable {
if (storeBytes) {
Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED);
bytesField.setBytesValue(bdy.getBytes("UTF-8"));
bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8));
doc.add(bytesField);
}
}

View File

@ -20,18 +20,15 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@ -182,10 +179,7 @@ public class EnwikiContentSource extends ContentSource {
if (localFileIS != null) { // null means fileIS was closed on us
try {
// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
reader.parse(new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8)));
} catch (IOException ioe) {
synchronized(EnwikiContentSource.this) {
if (localFileIS != is) {

View File

@ -9,6 +9,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@ -62,12 +63,12 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
Reader reader = null;
// note: we use a decoding reader, so if your queries are screwed up you know
if (file.exists()) {
reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
reader = IOUtils.getDecodingReader(file, StandardCharsets.UTF_8);
} else {
//see if we can find it as a resource
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
if (asStream != null) {
reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
reader = IOUtils.getDecodingReader(asStream, StandardCharsets.UTF_8);
}
}
if (reader != null) {

View File

@ -29,6 +29,7 @@ import java.util.Properties;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;
/**
* A {@link ContentSource} reading one line at a time as a
@ -277,7 +278,7 @@ public class LineDocSource extends ContentSource {
}
file = new File(fileName).getAbsoluteFile();
if (encoding == null) {
encoding = "UTF-8";
encoding = IOUtils.UTF_8;
}
}

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@ -30,7 +31,6 @@ import java.util.Date;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
/**
* A {@link ContentSource} reading from the Reuters collection.
@ -114,7 +114,7 @@ public class ReutersContentSource extends ContentSource {
name = f.getCanonicalPath() + "_" + iteration;
}
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
try {
// First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine();

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@ -320,7 +321,7 @@ public class TrecContentSource extends ContentSource {
}
// encoding
if (encoding == null) {
encoding = "ISO-8859-1";
encoding = StandardCharsets.ISO_8859_1.name();
}
// iteration exclusion in doc name
excludeDocnameIteration = config.get("content.source.excludeIteration", false);

View File

@ -20,19 +20,18 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.NoDeletionPolicy;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NoMergeScheduler;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.util.Version;
import java.io.BufferedOutputStream;
@ -130,7 +129,7 @@ public class CreateIndexTask extends PerfTask {
if (defaultCodec != null) {
try {
Class<? extends Codec> clazz = Class.forName(defaultCodec).asSubclass(Codec.class);
Codec.setDefault(clazz.newInstance());
iwConf.setCodec(clazz.newInstance());
} catch (Exception e) {
throw new RuntimeException("Couldn't instantiate Codec: " + defaultCodec, e);
}

View File

@ -5,6 +5,7 @@ import java.io.File;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@ -41,7 +42,7 @@ public class WriteEnwikiLineDocTask extends WriteLineDocTask {
public WriteEnwikiLineDocTask(PerfRunData runData) throws Exception {
super(runData);
OutputStream out = StreamUtils.outputStream(categoriesLineFile(new File(fname)));
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8), StreamUtils.BUFFER_SIZE));
writeHeader(categoryLineFileOut);
}

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
@ -101,7 +102,7 @@ public class WriteLineDocTask extends PerfTask {
throw new IllegalArgumentException("line.file.out must be set");
}
OutputStream out = StreamUtils.outputStream(new File(fname));
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8), StreamUtils.BUFFER_SIZE));
docMaker = runData.getDocMaker();
// init fields

View File

@ -31,6 +31,7 @@ import java.io.File;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
@ -53,7 +54,7 @@ public class QueryDriver {
File topicsFile = new File(args[0]);
File qrelsFile = new File(args[1]);
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene");
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], IOUtils.UTF_8 /* huh, no nio.Charset ctor? */), "lucene");
FSDirectory dir = FSDirectory.open(new File(args[3]));
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
IndexReader reader = DirectoryReader.open(dir);
@ -66,10 +67,10 @@ public class QueryDriver {
// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8)));
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, StandardCharsets.UTF_8)));
// prepare judge, with trec utilities that read from a QRels file
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8)));
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, StandardCharsets.UTF_8)));
// validate topics & judgments match each other
judge.validateData(qqs, logger);

View File

@ -21,16 +21,13 @@ import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;
/**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
@ -78,7 +75,7 @@ public class ExtractReuters {
*/
protected void extractFile(File sgmFile) {
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), StandardCharsets.UTF_8));
StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024);
@ -112,7 +109,7 @@ public class ExtractReuters {
File outFile = new File(outputDir, sgmFile.getName() + "-"
+ (docNumber++) + ".txt");
// System.out.println("Writing " + outFile);
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8);
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), StandardCharsets.UTF_8);
writer.write(out);
writer.close();
outBuffer.setLength(0);

View File

@ -22,6 +22,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
@ -30,7 +31,6 @@ import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.IOUtils;
/**
* Extract the downloaded Wikipedia dump into separate files for indexing.
@ -86,7 +86,7 @@ public class ExtractWikipedia {
contents.append("\n");
try {
Writer writer = new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8);
Writer writer = new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8);
writer.write(contents.toString());
writer.close();
} catch (IOException ioe) {

View File

@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.Collator;
import java.util.List;
import java.util.Locale;
@ -406,7 +407,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
BufferedReader r = new BufferedReader(
new InputStreamReader(
new FileInputStream(lineFile), "UTF-8"));
new FileInputStream(lineFile), StandardCharsets.UTF_8));
int numLines = 0;
String line;
while((line = r.readLine()) != null) {

View File

@ -23,6 +23,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import org.apache.lucene.benchmark.byTask.feeds.AbstractQueryMaker;
@ -121,7 +122,7 @@ public class TestPerfTasksParse extends LuceneTestCase {
public boolean accept(File pathname) { return pathname.isFile() && pathname.getName().endsWith(".alg"); }
})) {
try {
Config config = new Config(new InputStreamReader(new FileInputStream(algFile), "UTF-8"));
Config config = new Config(new InputStreamReader(new FileInputStream(algFile), StandardCharsets.UTF_8));
String contentSource = config.get("content.source", null);
if (contentSource != null) { Class.forName(contentSource); }
config.set("work.dir", TestUtil.createTempDir(LuceneTestCase.getTestClass().getSimpleName()).getAbsolutePath());

View File

@ -36,6 +36,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.IOUtils;
/** Tests the functionality of {@link DocMaker}. */
public class DocMakerTest extends BenchmarkTestCase {
@ -166,7 +167,7 @@ public class DocMakerTest extends BenchmarkTestCase {
// DocMaker did not close its ContentSource if resetInputs was called twice,
// leading to a file handle leak.
File f = new File(getWorkDir(), "docMakerLeak.txt");
PrintStream ps = new PrintStream(f, "UTF-8");
PrintStream ps = new PrintStream(f, IOUtils.UTF_8);
ps.println("one title\t" + System.currentTimeMillis() + "\tsome content");
ps.close();

View File

@ -18,15 +18,13 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@ -43,7 +41,7 @@ public class EnwikiContentSourceTest extends LuceneTestCase {
@Override
protected InputStream openInputStream() throws IOException {
return new ByteArrayInputStream(docs.getBytes(IOUtils.CHARSET_UTF_8));
return new ByteArrayInputStream(docs.getBytes(StandardCharsets.UTF_8));
}
}

View File

@ -23,6 +23,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
@ -53,7 +54,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
OutputStream out = new FileOutputStream(file);
out = csFactory.createCompressorOutputStream("bzip2", out);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
writeDocsToFile(writer, addHeader, null);
writer.close();
}
@ -90,14 +91,14 @@ public class LineDocSourceTest extends BenchmarkTestCase {
private void createRegularLineFile(File file, boolean addHeader) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
writeDocsToFile(writer, addHeader, null);
writer.close();
}
private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
Properties p = new Properties();
for (String f : extraFields) {
p.setProperty(f, f);
@ -209,7 +210,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
for (int i = 0; i < testCases.length; i++) {
File file = new File(getWorkDir(), "one-line");
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8));
writer.write(testCases[i]);
writer.newLine();
writer.close();

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
@ -73,7 +74,7 @@ public class WriteEnwikiLineDocTaskTest extends BenchmarkTestCase {
private void doReadTest(int n, File file, String expTitle, String expDate, String expBody) throws Exception {
InputStream in = new FileInputStream(file);
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
try {
String line = br.readLine();
WriteLineDocTaskTest.assertHeaderLine(line);

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
@ -168,7 +169,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
default:
assertFalse("Unknown file type!",true); //fail, should not happen
}
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
try {
String line = br.readLine();
assertHeaderLine(line);
@ -274,7 +275,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
wldt.doLogic();
wldt.close();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
try {
String line = br.readLine();
assertHeaderLine(line);
@ -292,7 +293,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
wldt.doLogic();
wldt.close();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
try {
String line = br.readLine();
assertHeaderLine(line);
@ -310,7 +311,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
wldt.doLogic();
wldt.close();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
try {
String line = br.readLine();
assertHeaderLine(line);
@ -345,7 +346,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
wldt.close();
Set<String> ids = new HashSet<>();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
try {
String line = br.readLine();
assertHeaderLine(line); // header line is written once, no matter how many threads there are

View File

@ -26,10 +26,10 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.junit.After;
import org.junit.Before;
@ -87,7 +87,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
private File rawTextFile(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8));
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8));
w.write(TEXT);
w.newLine();
w.close();
@ -116,7 +116,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
}
private void writeText(OutputStream os) throws IOException {
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, IOUtils.CHARSET_UTF_8));
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
w.write(TEXT);
w.newLine();
w.close();
@ -124,7 +124,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
private void assertReadText(File f) throws Exception {
InputStream ir = StreamUtils.inputStream(f);
InputStreamReader in = new InputStreamReader(ir, IOUtils.CHARSET_UTF_8);
InputStreamReader in = new InputStreamReader(ir, StandardCharsets.UTF_8);
BufferedReader r = new BufferedReader(in);
String line = r.readLine();
assertEquals("Wrong text found in "+f.getName(), TEXT, line);
@ -136,14 +136,14 @@ public class StreamUtilsTest extends BenchmarkTestCase {
public void setUp() throws Exception {
super.setUp();
testDir = new File(getWorkDir(),"ContentSourceTest");
TestUtil.rmDir(testDir);
TestUtil.rm(testDir);
assertTrue(testDir.mkdirs());
}
@Override
@After
public void tearDown() throws Exception {
TestUtil.rmDir(testDir);
TestUtil.rm(testDir);
super.tearDown();
}

View File

@ -34,6 +34,7 @@ import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
* Test that quality run does its job.
@ -62,11 +63,11 @@ public class TestQualityRun extends BenchmarkTestCase {
// prepare topics
InputStream topics = getClass().getResourceAsStream("trecTopics.txt");
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new InputStreamReader(topics, "UTF-8")));
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new InputStreamReader(topics, StandardCharsets.UTF_8)));
// prepare judge
InputStream qrels = getClass().getResourceAsStream("trecQRels.txt");
Judge judge = new TrecJudge(new BufferedReader(new InputStreamReader(qrels, "UTF-8")));
Judge judge = new TrecJudge(new BufferedReader(new InputStreamReader(qrels, StandardCharsets.UTF_8)));
// validate topics & judgments match each other
judge.validateData(qqs, logger);
@ -147,7 +148,7 @@ public class TestQualityRun extends BenchmarkTestCase {
InputStream topicsFile = getClass().getResourceAsStream("trecTopics.txt");
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(
new BufferedReader(new InputStreamReader(topicsFile, "UTF-8")));
new BufferedReader(new InputStreamReader(topicsFile, StandardCharsets.UTF_8)));
assertEquals(20, qqs.length);

View File

@ -177,7 +177,10 @@ public class BlockTermsReader extends FieldsProducer {
}
private void seekDir(IndexInput input, long dirOffset) throws IOException {
if (version >= BlockTermsWriter.VERSION_APPEND_ONLY) {
if (version >= BlockTermsWriter.VERSION_CHECKSUM) {
input.seek(input.length() - CodecUtil.footerLength() - 8);
dirOffset = input.readLong();
} else if (version >= BlockTermsWriter.VERSION_APPEND_ONLY) {
input.seek(input.length() - 8);
dirOffset = input.readLong();
}
@ -863,4 +866,14 @@ public class BlockTermsReader extends FieldsProducer {
sizeInBytes += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
return sizeInBytes;
}
@Override
public void checkIntegrity() throws IOException {
// verify terms
if (version >= BlockTermsWriter.VERSION_CHECKSUM) {
CodecUtil.checksumEntireFile(in);
}
// verify postings
postingsReader.checkIntegrity();
}
}
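
The VERSION_CHECKSUM and checkIntegrity changes in this and the following files all follow the same footer protocol. A minimal sketch of that protocol, using the CodecUtil helpers shown above (the file name, format name and version numbers below are hypothetical, and this is not code from the patch):

  void writeWithFooter(Directory dir) throws IOException {
    IndexOutput out = dir.createOutput("_0.example", IOContext.DEFAULT);
    boolean success = false;
    try {
      CodecUtil.writeHeader(out, "ExampleFormat", 0);
      // ... write the file's payload here ...
      CodecUtil.writeFooter(out);            // appends the checksum trailer
      success = true;
    } finally {
      if (success) {
        IOUtils.close(out);
      } else {
        IOUtils.closeWhileHandlingException(out);
      }
    }
  }

  void checkIntegrity(Directory dir) throws IOException {
    IndexInput in = dir.openInput("_0.example", IOContext.READONCE);
    try {
      CodecUtil.checkHeader(in, "ExampleFormat", 0, 0);
      CodecUtil.checksumEntireFile(in);      // reads to EOF and verifies the trailer
    } finally {
      in.close();
    }
  }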

View File

@ -63,12 +63,13 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable {
public static final int VERSION_START = 0;
public static final int VERSION_APPEND_ONLY = 1;
public static final int VERSION_META_ARRAY = 2;
public static final int VERSION_CURRENT = VERSION_META_ARRAY;
public static final int VERSION_CHECKSUM = 3;
public static final int VERSION_CURRENT = VERSION_CHECKSUM;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tib";
protected final IndexOutput out;
protected IndexOutput out;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
FieldInfo currentField;
@ -176,26 +177,30 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable {
}
public void close() throws IOException {
try {
final long dirStart = out.getFilePointer();
out.writeVInt(fields.size());
for(FieldMetaData field : fields) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
out.writeVLong(field.termsStartPointer);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
if (VERSION_CURRENT >= VERSION_META_ARRAY) {
out.writeVInt(field.longsSize);
if (out != null) {
try {
final long dirStart = out.getFilePointer();
out.writeVInt(fields.size());
for(FieldMetaData field : fields) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
out.writeVLong(field.termsStartPointer);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
if (VERSION_CURRENT >= VERSION_META_ARRAY) {
out.writeVInt(field.longsSize);
}
}
writeTrailer(dirStart);
CodecUtil.writeFooter(out);
} finally {
IOUtils.close(out, postingsWriter, termsIndexWriter);
out = null;
}
writeTrailer(dirStart);
} finally {
IOUtils.close(out, postingsWriter, termsIndexWriter);
}
}

View File

@ -66,6 +66,8 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
// start of the field info data
private long dirOffset;
private int version;
public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
throws IOException {
@ -78,6 +80,11 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
try {
readHeader(in);
if (version >= FixedGapTermsIndexWriter.VERSION_CHECKSUM) {
CodecUtil.checksumEntireFile(in);
}
indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
@ -124,7 +131,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
}
private void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
}
@ -273,7 +280,11 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
public void close() throws IOException {}
private void seekDir(IndexInput input, long dirOffset) throws IOException {
input.seek(input.length() - 8);
if (version >= FixedGapTermsIndexWriter.VERSION_CHECKSUM) {
input.seek(input.length() - CodecUtil.footerLength() - 8);
} else {
input.seek(input.length() - 8);
}
dirOffset = input.readLong();
input.seek(dirOffset);
}

View File

@ -26,7 +26,6 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
@ -43,7 +42,7 @@ import java.io.IOException;
*
* @lucene.experimental */
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
protected final IndexOutput out;
protected IndexOutput out;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tii";
@ -52,7 +51,8 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
final static int VERSION_START = 0;
final static int VERSION_APPEND_ONLY = 1;
final static int VERSION_MONOTONIC_ADDRESSING = 2;
final static int VERSION_CURRENT = VERSION_MONOTONIC_ADDRESSING;
final static int VERSION_CHECKSUM = 3;
final static int VERSION_CURRENT = VERSION_CHECKSUM;
final static int BLOCKSIZE = 4096;
final private int termIndexInterval;
@ -207,38 +207,42 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
@Override
public void close() throws IOException {
boolean success = false;
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
nonNullFieldCount++;
if (out != null) {
boolean success = false;
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
nonNullFieldCount++;
}
}
}
out.writeVInt(nonNullFieldCount);
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
out.writeVInt(field.fieldInfo.number);
out.writeVInt(field.numIndexTerms);
out.writeVLong(field.termsStart);
out.writeVLong(field.indexStart);
out.writeVLong(field.packedIndexStart);
out.writeVLong(field.packedOffsetsStart);
out.writeVInt(nonNullFieldCount);
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
out.writeVInt(field.fieldInfo.number);
out.writeVInt(field.numIndexTerms);
out.writeVLong(field.termsStart);
out.writeVLong(field.indexStart);
out.writeVLong(field.packedIndexStart);
out.writeVLong(field.packedOffsetsStart);
}
}
}
writeTrailer(dirStart);
success = true;
} finally {
if (success) {
IOUtils.close(out);
} else {
IOUtils.closeWhileHandlingException(out);
writeTrailer(dirStart);
CodecUtil.writeFooter(out);
success = true;
} finally {
if (success) {
IOUtils.close(out);
} else {
IOUtils.closeWhileHandlingException(out);
}
out = null;
}
}
}
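
The rewritten close() above also makes the writer safe to close more than once: the output is nulled after the first call, so a second close() is a no-op. A minimal sketch of the same pattern, with a stand-in for writeTrailer (the class name and trailer write are illustrative, not the actual FixedGapTermsIndexWriter code):

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;

class FooterOnCloseSketch {
  private IndexOutput out; // assumed to be opened elsewhere

  public void close() throws IOException {
    if (out == null) {
      return; // already closed
    }
    boolean success = false;
    try {
      long dirStart = out.getFilePointer();
      out.writeLong(dirStart);      // stand-in for writeTrailer(dirStart)
      CodecUtil.writeFooter(out);   // appends the zlib-crc32 checksum footer
      success = true;
    } finally {
      if (success) {
        IOUtils.close(out);
      } else {
        IOUtils.closeWhileHandlingException(out);
      }
      out = null;
    }
  }
}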

View File

@ -62,6 +62,10 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
try {
version = readHeader(in);
if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM) {
CodecUtil.checksumEntireFile(in);
}
seekDir(in, dirOffset);
@ -190,7 +194,10 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
public void close() throws IOException {}
private void seekDir(IndexInput input, long dirOffset) throws IOException {
if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM) {
input.seek(input.length() - CodecUtil.footerLength() - 8);
dirOffset = input.readLong();
} else if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
input.seek(input.length() - 8);
dirOffset = input.readLong();
}
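
The reader-side counterpart, shown above for VariableGapTermsIndexReader, verifies the entire file eagerly at open time when the version carries a checksum. A hedged sketch of that open sequence; the helper name and parameters are illustrative, only the CodecUtil calls are the real API:

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexInput;

class EagerVerifySketch {
  static int openAndVerify(IndexInput in, String codecName, int versionStart,
                           int versionCurrent, int versionChecksum) throws IOException {
    int version = CodecUtil.checkHeader(in, codecName, versionStart, versionCurrent);
    if (version >= versionChecksum) {
      // Small metadata files are cheap to verify up front: read the whole file
      // once and compare against the stored crc32 in the footer.
      CodecUtil.checksumEntireFile(in);
    }
    return version;
  }
}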

View File

@ -45,7 +45,7 @@ import org.apache.lucene.util.fst.Util;
*
* @lucene.experimental */
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
protected final IndexOutput out;
protected IndexOutput out;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tiv";
@ -53,7 +53,8 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final static String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
final static int VERSION_START = 0;
final static int VERSION_APPEND_ONLY = 1;
final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
final static int VERSION_CHECKSUM = 2;
final static int VERSION_CURRENT = VERSION_CHECKSUM;
private final List<FSTFieldWriter> fields = new ArrayList<>();
@ -290,30 +291,34 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
@Override
public void close() throws IOException {
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
nonNullFieldCount++;
if (out != null) {
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
nonNullFieldCount++;
}
}
out.writeVInt(nonNullFieldCount);
for(int i=0;i<fieldCount;i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.indexStart);
}
}
writeTrailer(dirStart);
CodecUtil.writeFooter(out);
} finally {
out.close();
out = null;
}
}
out.writeVInt(nonNullFieldCount);
for(int i=0;i<fieldCount;i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.indexStart);
}
}
writeTrailer(dirStart);
} finally {
out.close();
}
}
private void writeTrailer(long dirStart) throws IOException {

View File

@ -39,8 +39,8 @@ import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@ -66,7 +66,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
* </p>
* <ul>
* <li>BloomFilter (.blm) --&gt; Header, DelegatePostingsFormatName,
* NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
* NumFilteredFields, Filter<sup>NumFilteredFields</sup>, Footer</li>
* <li>Filter --&gt; FieldNumber, FuzzySet</li>
* <li>FuzzySet --&gt;See {@link FuzzySet#serialize(DataOutput)}</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
@ -75,13 +75,16 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
* <li>NumFilteredFields --&gt; {@link DataOutput#writeInt Uint32}</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeInt Uint32} The number of the
* field in this segment</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* @lucene.experimental
*/
public final class BloomFilteringPostingsFormat extends PostingsFormat {
public static final String BLOOM_CODEC_NAME = "BloomFilter";
public static final int BLOOM_CODEC_VERSION = 1;
public static final int VERSION_START = 1;
public static final int VERSION_CHECKSUM = 2;
public static final int VERSION_CURRENT = VERSION_CHECKSUM;
/** Extension of Bloom Filters file */
static final String BLOOM_EXTENSION = "blm";
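
For illustration, a write-side sketch that follows the .blm layout documented above (Header, DelegatePostingsFormatName, NumFilteredFields, then FieldNumber plus FuzzySet per field, Footer). This is not the actual BloomFilteringPostingsFormat code; writeOneFilter is a hypothetical placeholder for FuzzySet serialization:

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexOutput;

class BlmLayoutSketch {
  static void writeBlm(IndexOutput out, String delegateName, int[] fieldNumbers) throws IOException {
    CodecUtil.writeHeader(out, "BloomFilter", 2);   // VERSION_CURRENT == VERSION_CHECKSUM == 2
    out.writeString(delegateName);                  // DelegatePostingsFormatName
    out.writeInt(fieldNumbers.length);              // NumFilteredFields
    for (int fieldNumber : fieldNumbers) {
      out.writeInt(fieldNumber);                    // FieldNumber
      writeOneFilter(out, fieldNumber);             // FuzzySet (see FuzzySet#serialize)
    }
    CodecUtil.writeFooter(out);                     // Footer
  }

  private static void writeOneFilter(IndexOutput out, int fieldNumber) throws IOException {
    // Placeholder: the real code serializes a FuzzySet here.
    out.writeVInt(0);
  }
}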
@ -157,12 +160,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
IndexInput bloomIn = null;
ChecksumIndexInput bloomIn = null;
boolean success = false;
try {
bloomIn = state.directory.openInput(bloomFileName, state.context);
CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
BLOOM_CODEC_VERSION);
bloomIn = state.directory.openChecksumInput(bloomFileName, state.context);
int version = CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT);
// // Load the hash function used in the BloomFilter
// hashFunction = HashFunction.forName(bloomIn.readString());
// Load the delegate postings format
@ -178,6 +180,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
if (version >= VERSION_CHECKSUM) {
CodecUtil.checkFooter(bloomIn);
} else {
CodecUtil.checkEOF(bloomIn);
}
IOUtils.close(bloomIn);
success = true;
} finally {
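
The checksum machinery used in this change can be exercised end to end in a few lines: write a header, payload, and footer, then read back through a ChecksumIndexInput so the crc is accumulated as bytes are consumed. A self-contained round trip using only the public CodecUtil and Directory APIs; the file name and codec name are made up:

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;

public class ChecksumRoundTrip {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();

    // Write: header, payload, footer (the footer carries the zlib-crc32 checksum).
    IndexOutput out = dir.createOutput("demo.dat", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "DemoCodec", 0);
    out.writeVInt(42);
    CodecUtil.writeFooter(out);
    out.close();

    // Read back; checkFooter throws if the stored checksum does not match.
    ChecksumIndexInput in = dir.openChecksumInput("demo.dat", IOContext.DEFAULT);
    int version = CodecUtil.checkHeader(in, "DemoCodec", 0, 0);
    int payload = in.readVInt();
    CodecUtil.checkFooter(in);
    in.close();

    System.out.println("version=" + version + " payload=" + payload);
    dir.close();
  }
}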
@ -390,6 +397,11 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
}
return sizeInBytes;
}
@Override
public void checkIntegrity() throws IOException {
delegateFieldsProducer.checkIntegrity();
}
}
class BloomFilteredFieldsConsumer extends FieldsConsumer {
@ -466,10 +478,8 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
IndexOutput bloomOutput = null;
try {
bloomOutput = state.directory
.createOutput(bloomFileName, state.context);
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
BLOOM_CODEC_VERSION);
bloomOutput = state.directory.createOutput(bloomFileName, state.context);
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT);
// remember the name of the postings format we will delegate to
bloomOutput.writeString(delegatePostingsFormat.getName());
@ -481,6 +491,7 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
bloomOutput.writeInt(fieldInfo.number);
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
}
CodecUtil.writeFooter(bloomOutput);
} finally {
IOUtils.close(bloomOutput);
}

View File

@ -1,171 +0,0 @@
package org.apache.lucene.codecs.intblock;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Naive int block API that writes vInts. This is
* expected to give poor performance; it's really only for
* testing the pluggability. One should typically use pfor instead. */
import java.io.IOException;
import org.apache.lucene.codecs.sep.IntIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
/** Abstract base class that reads fixed-size blocks of ints
* from an IndexInput. While this is a simple approach, a
* more performant approach would directly create an impl
* of IntIndexInput inside Directory. Wrapping a generic
* IndexInput will likely cost performance.
*
* @lucene.experimental
*/
public abstract class FixedIntBlockIndexInput extends IntIndexInput {
private final IndexInput in;
protected final int blockSize;
public FixedIntBlockIndexInput(final IndexInput in) throws IOException {
this.in = in;
blockSize = in.readVInt();
}
@Override
public IntIndexInput.Reader reader() throws IOException {
final int[] buffer = new int[blockSize];
final IndexInput clone = in.clone();
// TODO: can this be simplified?
return new Reader(clone, buffer, this.getBlockReader(clone, buffer));
}
@Override
public void close() throws IOException {
in.close();
}
@Override
public IntIndexInput.Index index() {
return new Index();
}
protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException;
/**
* Interface for fixed-size block decoders.
* <p>
* Implementations should decode into the buffer in {@link #readBlock}.
*/
public interface BlockReader {
public void readBlock() throws IOException;
}
private static class Reader extends IntIndexInput.Reader {
private final IndexInput in;
private final BlockReader blockReader;
private final int blockSize;
private final int[] pending;
private int upto;
private boolean seekPending;
private long pendingFP;
private long lastBlockFP = -1;
public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) {
this.in = in;
this.pending = pending;
this.blockSize = pending.length;
this.blockReader = blockReader;
upto = blockSize;
}
void seek(final long fp, final int upto) {
assert upto < blockSize;
if (seekPending || fp != lastBlockFP) {
pendingFP = fp;
seekPending = true;
}
this.upto = upto;
}
@Override
public int next() throws IOException {
if (seekPending) {
// Seek & load new block
in.seek(pendingFP);
lastBlockFP = pendingFP;
blockReader.readBlock();
seekPending = false;
} else if (upto == blockSize) {
// Load new block
lastBlockFP = in.getFilePointer();
blockReader.readBlock();
upto = 0;
}
return pending[upto++];
}
}
private class Index extends IntIndexInput.Index {
private long fp;
private int upto;
@Override
public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
upto = indexIn.readVInt();
fp = indexIn.readVLong();
} else {
final int uptoDelta = indexIn.readVInt();
if ((uptoDelta & 1) == 1) {
// same block
upto += uptoDelta >>> 1;
} else {
// new block
upto = uptoDelta >>> 1;
fp += indexIn.readVLong();
}
}
assert upto < blockSize;
}
@Override
public void seek(final IntIndexInput.Reader other) throws IOException {
((Reader) other).seek(fp, upto);
}
@Override
public void copyFrom(final IntIndexInput.Index other) {
final Index idx = (Index) other;
fp = idx.fp;
upto = idx.upto;
}
@Override
public Index clone() {
Index other = new Index();
other.fp = fp;
other.upto = upto;
return other;
}
@Override
public String toString() {
return "fp=" + fp + " upto=" + upto;
}
}
}
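
For context on the class removed above: getBlockReader was its only abstract hook, and the simplest implementation just decodes blockSize vInts into the shared buffer. A hedged sketch against the class as it existed before this commit (the subclass name is made up, and it assumes the same package as FixedIntBlockIndexInput):

import java.io.IOException;

import org.apache.lucene.store.IndexInput;

class VIntBlockIndexInput extends FixedIntBlockIndexInput {
  public VIntBlockIndexInput(IndexInput in) throws IOException {
    super(in);
  }

  @Override
  protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) {
    return new BlockReader() {
      @Override
      public void readBlock() throws IOException {
        // Decode one fixed-size block of vInts into the buffer shared with the Reader.
        for (int i = 0; i < buffer.length; i++) {
          buffer[i] = in.readVInt();
        }
      }
    };
  }
}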

View File

@ -1,128 +0,0 @@
package org.apache.lucene.codecs.intblock;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Naive int block API that writes vInts. This is
* expected to give poor performance; it's really only for
* testing the pluggability. One should typically use pfor instead. */
import java.io.IOException;
import org.apache.lucene.codecs.sep.IntIndexOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
/** Abstract base class that writes fixed-size blocks of ints
* to an IndexOutput. While this is a simple approach, a
* more performant approach would directly create an impl
* of IntIndexOutput inside Directory. Wrapping a generic
* IndexInput will likely cost performance.
*
* @lucene.experimental
*/
public abstract class FixedIntBlockIndexOutput extends IntIndexOutput {
protected final IndexOutput out;
private final int blockSize;
protected final int[] buffer;
private int upto;
protected FixedIntBlockIndexOutput(IndexOutput out, int fixedBlockSize) throws IOException {
blockSize = fixedBlockSize;
this.out = out;
out.writeVInt(blockSize);
buffer = new int[blockSize];
}
protected abstract void flushBlock() throws IOException;
@Override
public IntIndexOutput.Index index() {
return new Index();
}
private class Index extends IntIndexOutput.Index {
long fp;
int upto;
long lastFP;
int lastUpto;
@Override
public void mark() throws IOException {
fp = out.getFilePointer();
upto = FixedIntBlockIndexOutput.this.upto;
}
@Override
public void copyFrom(IntIndexOutput.Index other, boolean copyLast) throws IOException {
Index idx = (Index) other;
fp = idx.fp;
upto = idx.upto;
if (copyLast) {
lastFP = fp;
lastUpto = upto;
}
}
@Override
public void write(DataOutput indexOut, boolean absolute) throws IOException {
if (absolute) {
indexOut.writeVInt(upto);
indexOut.writeVLong(fp);
} else if (fp == lastFP) {
// same block
assert upto >= lastUpto;
int uptoDelta = upto - lastUpto;
indexOut.writeVInt(uptoDelta << 1 | 1);
} else {
// new block
indexOut.writeVInt(upto << 1);
indexOut.writeVLong(fp - lastFP);
}
lastUpto = upto;
lastFP = fp;
}
@Override
public String toString() {
return "fp=" + fp + " upto=" + upto;
}
}
@Override
public void write(int v) throws IOException {
buffer[upto++] = v;
if (upto == blockSize) {
flushBlock();
upto = 0;
}
}
@Override
public void close() throws IOException {
try {
if (upto > 0) {
// NOTE: entries in the block after current upto are
// invalid
flushBlock();
}
} finally {
out.close();
}
}
}
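
The Index.write/Index.read pair in the two removed classes shares a small pointer encoding: the low bit of the leading vInt distinguishes a same-block entry (upto delta only) from a new-block entry (absolute upto followed by a vLong file-pointer delta). A standalone worked example of that encoding in plain Java, without any Lucene types:

public class IntBlockPointerEncodingDemo {

  // Encode (upto, fp) relative to the last written (lastUpto, lastFP);
  // returns a description of what would be written to the index stream.
  static String encode(int upto, long fp, int lastUpto, long lastFP) {
    if (fp == lastFP) {
      // same block: single vInt, low bit set, upper bits carry the upto delta
      return "vInt(" + (((upto - lastUpto) << 1) | 1) + ")";
    } else {
      // new block: vInt with low bit clear (absolute upto), then a vLong fp delta
      return "vInt(" + (upto << 1) + ") vLong(" + (fp - lastFP) + ")";
    }
  }

  public static void main(String[] args) {
    // Same block as the previous mark: only the upto delta is written.
    // Decodes as: (9 & 1) == 1, so upto += 9 >>> 1 = 4, fp unchanged.
    System.out.println(encode(7, 100L, 3, 100L));  // vInt(9)
    // New block: absolute upto plus the file-pointer delta.
    // Decodes as: (4 & 1) == 0, so upto = 4 >>> 1 = 2, fp += 64.
    System.out.println(encode(2, 164L, 3, 100L));  // vInt(4) vLong(64)
  }
}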

Some files were not shown because too many files have changed in this diff