mirror of https://github.com/apache/lucene.git
LUCENE-3312: Merge up to trunk HEAD. There was a really huge change (LUCENE-4199).
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3312@1359283 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
27aa2f6a28
|
@ -62,6 +62,12 @@ Build
|
||||||
* LUCENE-4115: JAR resolution/ cleanup should be done automatically for ant
|
* LUCENE-4115: JAR resolution/ cleanup should be done automatically for ant
|
||||||
clean/ eclipse/ resolve (Dawid Weiss)
|
clean/ eclipse/ resolve (Dawid Weiss)
|
||||||
|
|
||||||
|
* LUCENE-4199: Add a new target "check-forbidden-apis", that parses all
|
||||||
|
generated .class files for use of APIs that use default charset, default
|
||||||
|
locale, or default timezone and fail build if violations found. This
|
||||||
|
ensures that Lucene / Solr is independent of local configuration options.
|
||||||
|
(Uwe Schindler, Robert Muir, Dawid Weiss)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
* LUCENE-4195: Added package documentation and examples for
|
* LUCENE-4195: Added package documentation and examples for
|
||||||
|
|
|
@ -61,50 +61,50 @@
|
||||||
executable="${python.exe}" failonerror="true" logerror="true">
|
executable="${python.exe}" failonerror="true" logerror="true">
|
||||||
<arg value="htmlentity.py"/>
|
<arg value="htmlentity.py"/>
|
||||||
</exec>
|
</exec>
|
||||||
|
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
|
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
|
||||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||||
<classpath refid="jflex.classpath"/>
|
<classpath refid="jflex.classpath"/>
|
||||||
</taskdef>
|
</taskdef>
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex"
|
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
|
||||||
outdir="src/java/org/apache/lucene/analysis/wikipedia"
|
|
||||||
nobak="on"/>
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
|
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
|
||||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||||
<classpath refid="jflex.classpath"/>
|
<classpath refid="jflex.classpath"/>
|
||||||
</taskdef>
|
</taskdef>
|
||||||
|
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
|
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard"
|
|
||||||
nobak="on" />
|
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
|
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard"
|
|
||||||
nobak="on" />
|
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
|
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard/std31"
|
|
||||||
nobak="on" />
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
|
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
|
||||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||||
<classpath refid="jflex.classpath"/>
|
<classpath refid="jflex.classpath"/>
|
||||||
</taskdef>
|
</taskdef>
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
|
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard"
|
|
||||||
nobak="on" />
|
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
|
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard/std31"
|
|
||||||
nobak="on" />
|
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex"
|
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard/std34"
|
|
||||||
nobak="on" />
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
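<!-- Run JFlex on @{dir}/@{name}.jflex, then remove the inappropriate JFlex-generated InputStream constructor (it wraps the stream in an InputStreamReader without an explicit charset) -->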
|
||||||
|
<macrodef name="run-jflex">
|
||||||
|
<attribute name="dir"/>
|
||||||
|
<attribute name="name"/>
|
||||||
|
<sequential>
|
||||||
|
<jflex file="@{dir}/@{name}.jflex"
|
||||||
|
outdir="@{dir}"
|
||||||
|
nobak="on" />
|
||||||
|
<replaceregexp file="@{dir}/@{name}.java"
|
||||||
|
match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
|
||||||
|
replace="" flags="sg"/>
|
||||||
|
</sequential>
|
||||||
|
</macrodef>
|
||||||
|
|
||||||
<target name="clean-jflex">
|
<target name="clean-jflex">
|
||||||
<delete>
|
<delete>
|
||||||
|
<fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">
|
||||||
|
<containsregexp expression="generated.*by.*JFlex"/>
|
||||||
|
</fileset>
|
||||||
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
|
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
|
||||||
<containsregexp expression="generated.*by.*JFlex"/>
|
<containsregexp expression="generated.*by.*JFlex"/>
|
||||||
</fileset>
|
</fileset>
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package org.apache.lucene.analysis.br;
|
package org.apache.lucene.analysis.br;
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -21,6 +23,7 @@ package org.apache.lucene.analysis.br;
|
||||||
* A stemmer for Brazilian Portuguese words.
|
* A stemmer for Brazilian Portuguese words.
|
||||||
*/
|
*/
|
||||||
public class BrazilianStemmer {
|
public class BrazilianStemmer {
|
||||||
|
private static final Locale locale = new Locale("pt", "BR");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Changed term
|
* Changed term
|
||||||
|
@ -243,7 +246,7 @@ public class BrazilianStemmer {
|
||||||
return null ;
|
return null ;
|
||||||
}
|
}
|
||||||
|
|
||||||
value = value.toLowerCase() ;
|
value = value.toLowerCase(locale) ;
|
||||||
for (j=0 ; j < value.length() ; j++) {
|
for (j=0 ; j < value.length() ; j++) {
|
||||||
if ((value.charAt(j) == 'á') ||
|
if ((value.charAt(j) == 'á') ||
|
||||||
(value.charAt(j) == 'â') ||
|
(value.charAt(j) == 'â') ||
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
package org.apache.lucene.analysis.charfilter;
|
package org.apache.lucene.analysis.charfilter;
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
|
|
|
@ -1,4 +1,7 @@
|
||||||
package org.apache.lucene.analysis.de;
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
// This file is encoded in UTF-8
|
// This file is encoded in UTF-8
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -38,6 +41,8 @@ public class GermanStemmer
|
||||||
*/
|
*/
|
||||||
private int substCount = 0;
|
private int substCount = 0;
|
||||||
|
|
||||||
|
private static final Locale locale = new Locale("de", "DE");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stems the given term to a unique <tt>discriminator</tt>.
|
* Stems the given term to a unique <tt>discriminator</tt>.
|
||||||
*
|
*
|
||||||
|
@ -47,7 +52,7 @@ public class GermanStemmer
|
||||||
protected String stem( String term )
|
protected String stem( String term )
|
||||||
{
|
{
|
||||||
// Use lowercase for medium stemming.
|
// Use lowercase for medium stemming.
|
||||||
term = term.toLowerCase();
|
term = term.toLowerCase(locale);
|
||||||
if ( !isStemmable( term ) )
|
if ( !isStemmable( term ) )
|
||||||
return term;
|
return term;
|
||||||
// Reset the StringBuilder.
|
// Reset the StringBuilder.
|
||||||
|
|
|
@ -252,7 +252,7 @@ public class HunspellDictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
String condition = ruleArgs[4];
|
String condition = ruleArgs[4];
|
||||||
affix.setCondition(condition, String.format(conditionPattern, condition));
|
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
|
||||||
affix.setCrossProduct(crossProduct);
|
affix.setCrossProduct(crossProduct);
|
||||||
|
|
||||||
List<HunspellAffix> list = affixes.get(affix.getAppend());
|
List<HunspellAffix> list = affixes.get(affix.getAppend());
|
||||||
|
@ -376,7 +376,7 @@ public class HunspellDictionary {
|
||||||
Arrays.sort(wordForm.getFlags());
|
Arrays.sort(wordForm.getFlags());
|
||||||
entry = line.substring(0, flagSep);
|
entry = line.substring(0, flagSep);
|
||||||
if(ignoreCase) {
|
if(ignoreCase) {
|
||||||
entry = entry.toLowerCase(Locale.ENGLISH);
|
entry = entry.toLowerCase(Locale.ROOT);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunspell;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
@ -330,7 +331,7 @@ public class HunspellStemmer {
|
||||||
|
|
||||||
HunspellStemmer stemmer = new HunspellStemmer(dictionary);
|
HunspellStemmer stemmer = new HunspellStemmer(dictionary);
|
||||||
|
|
||||||
Scanner scanner = new Scanner(System.in);
|
Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
|
||||||
|
|
||||||
System.out.print("> ");
|
System.out.print("> ");
|
||||||
while (scanner.hasNextLine()) {
|
while (scanner.hasNextLine()) {
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.sinks;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
@ -37,10 +38,12 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
|
||||||
protected CharTermAttribute termAtt;
|
protected CharTermAttribute termAtt;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
|
* Uses {@link java.text.DateFormat#getDateInstance(int, Locale)
|
||||||
|
* DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as
|
||||||
|
* the {@link java.text.DateFormat} object.
|
||||||
*/
|
*/
|
||||||
public DateRecognizerSinkFilter() {
|
public DateRecognizerSinkFilter() {
|
||||||
this(DateFormat.getDateInstance());
|
this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT));
|
||||||
}
|
}
|
||||||
|
|
||||||
public DateRecognizerSinkFilter(DateFormat dateFormat) {
|
public DateRecognizerSinkFilter(DateFormat dateFormat) {
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
|
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||||
* on 9/30/11 12:10 PM from the specification file
|
* on 08.07.12 16:59 from the specification file
|
||||||
* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
||||||
*/
|
*/
|
||||||
class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
||||||
|
|
||||||
|
@ -383,15 +383,7 @@ public final void getText(CharTermAttribute t) {
|
||||||
this.zzReader = in;
|
this.zzReader = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new scanner.
|
|
||||||
* There is also java.io.Reader version of this constructor.
|
|
||||||
*
|
|
||||||
* @param in the java.io.Inputstream to read input from.
|
|
||||||
*/
|
|
||||||
ClassicTokenizerImpl(java.io.InputStream in) {
|
|
||||||
this(new java.io.InputStreamReader(in));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unpacks the compressed character translation table.
|
* Unpacks the compressed character translation table.
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
|
// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
|
||||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
|
@ -759,15 +759,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
||||||
this.zzReader = in;
|
this.zzReader = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new scanner.
|
|
||||||
* There is also java.io.Reader version of this constructor.
|
|
||||||
*
|
|
||||||
* @param in the java.io.Inputstream to read input from.
|
|
||||||
*/
|
|
||||||
public StandardTokenizerImpl(java.io.InputStream in) {
|
|
||||||
this(new java.io.InputStreamReader(in));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unpacks the compressed character translation table.
|
* Unpacks the compressed character translation table.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/18/12 12:05 PM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
@ -3844,15 +3844,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
|
||||||
this.zzReader = in;
|
this.zzReader = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new scanner.
|
|
||||||
* There is also java.io.Reader version of this constructor.
|
|
||||||
*
|
|
||||||
* @param in the java.io.Inputstream to read input from.
|
|
||||||
*/
|
|
||||||
public UAX29URLEmailTokenizerImpl(java.io.InputStream in) {
|
|
||||||
this(new java.io.InputStreamReader(in));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unpacks the compressed character translation table.
|
* Unpacks the compressed character translation table.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.wikipedia;
|
package org.apache.lucene.analysis.wikipedia;
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
* this work for additional information regarding copyright ownership.
|
* this work for additional information regarding copyright ownership.
|
||||||
|
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||||
* on 1/22/12 10:26 PM from the specification file
|
* on 08.07.12 17:00 from the specification file
|
||||||
* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||||
*/
|
*/
|
||||||
class WikipediaTokenizerImpl {
|
class WikipediaTokenizerImpl {
|
||||||
|
|
||||||
|
@ -519,15 +519,7 @@ final void reset() {
|
||||||
this.zzReader = in;
|
this.zzReader = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new scanner.
|
|
||||||
* There is also java.io.Reader version of this constructor.
|
|
||||||
*
|
|
||||||
* @param in the java.io.Inputstream to read input from.
|
|
||||||
*/
|
|
||||||
WikipediaTokenizerImpl(java.io.InputStream in) {
|
|
||||||
this(new java.io.InputStreamReader(in));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unpacks the compressed character translation table.
|
* Unpacks the compressed character translation table.
|
||||||
|
|
|
@ -79,7 +79,7 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
if (!keywordAttr.isKeyword()) {
|
if (!keywordAttr.isKeyword()) {
|
||||||
final String term = termAtt.toString().toLowerCase(Locale.ENGLISH);
|
final String term = termAtt.toString().toLowerCase(Locale.ROOT);
|
||||||
termAtt.setEmpty().append(term);
|
termAtt.setEmpty().append(term);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -27,7 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
||||||
public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
|
public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void test() throws IOException {
|
public void test() throws IOException {
|
||||||
DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
|
DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.ROOT));
|
||||||
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
|
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
|
||||||
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
|
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
|
||||||
TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
|
TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.sinks;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.*;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -164,7 +165,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
|
||||||
TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
|
TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
|
||||||
String[] lowerCaseTokens = new String[tokens1.length];
|
String[] lowerCaseTokens = new String[tokens1.length];
|
||||||
for (int i = 0; i < tokens1.length; i++)
|
for (int i = 0; i < tokens1.length; i++)
|
||||||
lowerCaseTokens[i] = tokens1[i].toLowerCase();
|
lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
|
||||||
assertTokenStreamContents(lowerCasing, lowerCaseTokens);
|
assertTokenStreamContents(lowerCasing, lowerCaseTokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -180,7 +181,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
|
||||||
StringBuilder buffer = new StringBuilder();
|
StringBuilder buffer = new StringBuilder();
|
||||||
System.out.println("-----Tokens: " + tokCount[k] + "-----");
|
System.out.println("-----Tokens: " + tokCount[k] + "-----");
|
||||||
for (int i = 0; i < tokCount[k]; i++) {
|
for (int i = 0; i < tokCount[k]; i++) {
|
||||||
buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
|
buffer.append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).append(' ');
|
||||||
}
|
}
|
||||||
//make sure we produce the same tokens
|
//make sure we produce the same tokens
|
||||||
TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
|
TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
|
||||||
|
|
|
@ -32,7 +32,8 @@ public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testConsumeWordInstance() {
|
public void testConsumeWordInstance() {
|
||||||
BreakIterator bi = BreakIterator.getWordInstance();
|
// we use the default locale, as it is randomized by LuceneTestCase
|
||||||
|
BreakIterator bi = BreakIterator.getWordInstance(Locale.getDefault());
|
||||||
CharArrayIterator ci = CharArrayIterator.newWordInstance();
|
CharArrayIterator ci = CharArrayIterator.newWordInstance();
|
||||||
for (int i = 0; i < 10000; i++) {
|
for (int i = 0; i < 10000; i++) {
|
||||||
char text[] = _TestUtil.randomUnicodeString(random()).toCharArray();
|
char text[] = _TestUtil.randomUnicodeString(random()).toCharArray();
|
||||||
|
@ -43,7 +44,8 @@ public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
|
|
||||||
/* run this to test if your JRE is buggy
|
/* run this to test if your JRE is buggy
|
||||||
public void testWordInstanceJREBUG() {
|
public void testWordInstanceJREBUG() {
|
||||||
BreakIterator bi = BreakIterator.getWordInstance();
|
// we use the default locale, as it is randomized by LuceneTestCase
|
||||||
|
BreakIterator bi = BreakIterator.getWordInstance(Locale.getDefault());
|
||||||
Segment ci = new Segment();
|
Segment ci = new Segment();
|
||||||
for (int i = 0; i < 10000; i++) {
|
for (int i = 0; i < 10000; i++) {
|
||||||
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
@ -60,7 +62,8 @@ public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testConsumeSentenceInstance() {
|
public void testConsumeSentenceInstance() {
|
||||||
BreakIterator bi = BreakIterator.getSentenceInstance();
|
// we use the default locale, as it is randomized by LuceneTestCase
|
||||||
|
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault());
|
||||||
CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
|
CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
|
||||||
for (int i = 0; i < 10000; i++) {
|
for (int i = 0; i < 10000; i++) {
|
||||||
char text[] = _TestUtil.randomUnicodeString(random()).toCharArray();
|
char text[] = _TestUtil.randomUnicodeString(random()).toCharArray();
|
||||||
|
@ -71,7 +74,8 @@ public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
|
|
||||||
/* run this to test if your JRE is buggy
|
/* run this to test if your JRE is buggy
|
||||||
public void testSentenceInstanceJREBUG() {
|
public void testSentenceInstanceJREBUG() {
|
||||||
BreakIterator bi = BreakIterator.getSentenceInstance();
|
// we use the default locale, as it is randomized by LuceneTestCase
|
||||||
|
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault());
|
||||||
Segment ci = new Segment();
|
Segment ci = new Segment();
|
||||||
for (int i = 0; i < 10000; i++) {
|
for (int i = 0; i < 10000; i++) {
|
||||||
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class TestCharArrayMap extends LuceneTestCase {
|
||||||
key[j] = (char)random().nextInt(127);
|
key[j] = (char)random().nextInt(127);
|
||||||
}
|
}
|
||||||
String keyStr = new String(key);
|
String keyStr = new String(key);
|
||||||
String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ENGLISH) : keyStr;
|
String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr;
|
||||||
|
|
||||||
int val = random().nextInt();
|
int val = random().nextInt();
|
||||||
|
|
||||||
|
|
|
@ -208,16 +208,16 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
set.add(upper);
|
set.add(upper);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < upperArr.length; i++) {
|
for (int i = 0; i < upperArr.length; i++) {
|
||||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
|
||||||
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
|
||||||
}
|
}
|
||||||
set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), false);
|
set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), false);
|
||||||
for (String upper : upperArr) {
|
for (String upper : upperArr) {
|
||||||
set.add(upper);
|
set.add(upper);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < upperArr.length; i++) {
|
for (int i = 0; i < upperArr.length; i++) {
|
||||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
|
||||||
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
|
assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -235,8 +235,8 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
set.add(upper);
|
set.add(upper);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < upperArr.length; i++) {
|
for (int i = 0; i < upperArr.length; i++) {
|
||||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
|
||||||
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
|
||||||
}
|
}
|
||||||
set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS),
|
set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS),
|
||||||
false);
|
false);
|
||||||
|
@ -244,8 +244,8 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
set.add(upper);
|
set.add(upper);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < upperArr.length; i++) {
|
for (int i = 0; i < upperArr.length; i++) {
|
||||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
|
||||||
assertFalse(String.format(falsePos, upperArr[i]), set
|
assertFalse(String.format(Locale.ROOT, falsePos, upperArr[i]), set
|
||||||
.contains(lowerArr[i]));
|
.contains(lowerArr[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -258,7 +258,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
||||||
List<String> stopwordsUpper = new ArrayList<String>();
|
List<String> stopwordsUpper = new ArrayList<String>();
|
||||||
for (String string : stopwords) {
|
for (String string : stopwords) {
|
||||||
stopwordsUpper.add(string.toUpperCase());
|
stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
|
setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||||
setIngoreCase.add(Integer.valueOf(1));
|
setIngoreCase.add(Integer.valueOf(1));
|
||||||
|
@ -305,7 +305,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
||||||
List<String> stopwordsUpper = new ArrayList<String>();
|
List<String> stopwordsUpper = new ArrayList<String>();
|
||||||
for (String string : stopwords) {
|
for (String string : stopwords) {
|
||||||
stopwordsUpper.add(string.toUpperCase());
|
stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
|
setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||||
setIngoreCase.add(Integer.valueOf(1));
|
setIngoreCase.add(Integer.valueOf(1));
|
||||||
|
@ -351,7 +351,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
||||||
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
|
||||||
List<String> stopwordsUpper = new ArrayList<String>();
|
List<String> stopwordsUpper = new ArrayList<String>();
|
||||||
for (String string : stopwords) {
|
for (String string : stopwords) {
|
||||||
stopwordsUpper.add(string.toUpperCase());
|
stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.util;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -53,7 +54,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
||||||
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
|
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
|
||||||
builder.insert(1023, "\ud801\udc1c");
|
builder.insert(1023, "\ud801\udc1c");
|
||||||
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
|
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
|
||||||
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
|
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -70,7 +71,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
builder.append("\ud801\udc1cabc");
|
builder.append("\ud801\udc1cabc");
|
||||||
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
|
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
|
||||||
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
|
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,7 +85,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
||||||
builder.append("A");
|
builder.append("A");
|
||||||
}
|
}
|
||||||
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
|
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
|
||||||
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
|
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -98,7 +99,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
builder.append("\ud801\udc1c");
|
builder.append("\ud801\udc1c");
|
||||||
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
|
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
|
||||||
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
|
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
|
||||||
}
|
}
|
||||||
|
|
||||||
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
|
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
|
||||||
|
|
|
@ -123,11 +123,11 @@ public class GenerateJflexTLDMacros {
|
||||||
while (null != (line = reader.readLine())) {
|
while (null != (line = reader.readLine())) {
|
||||||
Matcher matcher = TLD_PATTERN_1.matcher(line);
|
Matcher matcher = TLD_PATTERN_1.matcher(line);
|
||||||
if (matcher.matches()) {
|
if (matcher.matches()) {
|
||||||
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
|
TLDs.add(matcher.group(1).toLowerCase(Locale.ROOT));
|
||||||
} else {
|
} else {
|
||||||
matcher = TLD_PATTERN_2.matcher(line);
|
matcher = TLD_PATTERN_2.matcher(line);
|
||||||
if (matcher.matches()) {
|
if (matcher.matches()) {
|
||||||
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
|
TLDs.add(matcher.group(1).toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -146,7 +146,7 @@ public class GenerateJflexTLDMacros {
|
||||||
*/
|
*/
|
||||||
private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
|
private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
|
||||||
final DateFormat dateFormat = DateFormat.getDateTimeInstance
|
final DateFormat dateFormat = DateFormat.getDateTimeInstance
|
||||||
(DateFormat.FULL, DateFormat.FULL, Locale.US);
|
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||||
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||||
final Writer writer = new OutputStreamWriter
|
final Writer writer = new OutputStreamWriter
|
||||||
(new FileOutputStream(outputFile), "UTF-8");
|
(new FileOutputStream(outputFile), "UTF-8");
|
||||||
|
|
|
@ -64,7 +64,7 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
|
||||||
//
|
//
|
||||||
public void testCollationKeySort() throws Exception {
|
public void testCollationKeySort() throws Exception {
|
||||||
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
|
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
|
||||||
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
|
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.ROOT));
|
||||||
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
|
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
|
||||||
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
|
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
|
||||||
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
|
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
|
||||||
|
@ -73,7 +73,7 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
|
||||||
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
|
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
|
||||||
|
|
||||||
// The ICU Collator and java.text.Collator implementations differ in their
|
// The ICU Collator and java.text.Collator implementations differ in their
|
||||||
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
|
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.ROOT.
|
||||||
testCollationKeySort
|
testCollationKeySort
|
||||||
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
|
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
|
||||||
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
|
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
|
||||||
|
|
|
@ -29,7 +29,7 @@ public class GenerateHTMLStripCharFilterSupplementaryMacros {
|
||||||
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
||||||
private static final String NL = System.getProperty("line.separator");
|
private static final String NL = System.getProperty("line.separator");
|
||||||
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
||||||
(DateFormat.FULL, DateFormat.FULL, Locale.US);
|
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||||
static {
|
static {
|
||||||
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,7 +32,7 @@ public class GenerateJFlexSupplementaryMacros {
|
||||||
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
||||||
private static final String NL = System.getProperty("line.separator");
|
private static final String NL = System.getProperty("line.separator");
|
||||||
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
||||||
(DateFormat.FULL, DateFormat.FULL, Locale.US);
|
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||||
static {
|
static {
|
||||||
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -607,7 +607,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private void doTestBocchan(int numIterations) throws Exception {
|
private void doTestBocchan(int numIterations) throws Exception {
|
||||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
|
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
|
||||||
this.getClass().getResourceAsStream("bocchan.utf-8")));
|
this.getClass().getResourceAsStream("bocchan.utf-8"), "UTF-8"));
|
||||||
String line = reader.readLine();
|
String line = reader.readLine();
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ public class StempelStemmer {
|
||||||
DataInputStream in = null;
|
DataInputStream in = null;
|
||||||
try {
|
try {
|
||||||
in = new DataInputStream(new BufferedInputStream(stemmerTable));
|
in = new DataInputStream(new BufferedInputStream(stemmerTable));
|
||||||
String method = in.readUTF().toUpperCase(Locale.ENGLISH);
|
String method = in.readUTF().toUpperCase(Locale.ROOT);
|
||||||
if (method.indexOf('M') < 0) {
|
if (method.indexOf('M') < 0) {
|
||||||
return new org.egothor.stemmer.Trie(in);
|
return new org.egothor.stemmer.Trie(in);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -63,6 +63,7 @@ import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.LineNumberReader;
|
import java.io.LineNumberReader;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -89,7 +90,7 @@ public class Compile {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
args[0].toUpperCase();
|
args[0].toUpperCase(Locale.ROOT);
|
||||||
|
|
||||||
backward = args[0].charAt(0) == '-';
|
backward = args[0].charAt(0) == '-';
|
||||||
int qq = (backward) ? 1 : 0;
|
int qq = (backward) ? 1 : 0;
|
||||||
|
@ -127,7 +128,7 @@ public class Compile {
|
||||||
new FileInputStream(args[i]), charset)));
|
new FileInputStream(args[i]), charset)));
|
||||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||||
try {
|
try {
|
||||||
line = line.toLowerCase();
|
line = line.toLowerCase(Locale.ROOT);
|
||||||
StringTokenizer st = new StringTokenizer(line);
|
StringTokenizer st = new StringTokenizer(line);
|
||||||
String stem = st.nextToken();
|
String stem = st.nextToken();
|
||||||
if (storeorig) {
|
if (storeorig) {
|
||||||
|
|
|
@ -55,9 +55,11 @@
|
||||||
package org.egothor.stemmer;
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.io.LineNumberReader;
|
import java.io.LineNumberReader;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -95,10 +97,11 @@ public class DiffIt {
|
||||||
// System.out.println("[" + args[i] + "]");
|
// System.out.println("[" + args[i] + "]");
|
||||||
Diff diff = new Diff(ins, del, rep, nop);
|
Diff diff = new Diff(ins, del, rep, nop);
|
||||||
try {
|
try {
|
||||||
in = new LineNumberReader(new BufferedReader(new FileReader(args[i])));
|
String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");
|
||||||
|
in = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[i]), charset)));
|
||||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||||
try {
|
try {
|
||||||
line = line.toLowerCase();
|
line = line.toLowerCase(Locale.ROOT);
|
||||||
StringTokenizer st = new StringTokenizer(line);
|
StringTokenizer st = new StringTokenizer(line);
|
||||||
String stem = st.nextToken();
|
String stem = st.nextToken();
|
||||||
System.out.println(stem + " -a");
|
System.out.println(stem + " -a");
|
||||||
|
|
|
@ -60,12 +60,14 @@ import java.io.BufferedReader;
|
||||||
import java.io.DataInputStream;
|
import java.io.DataInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.io.LineNumberReader;
|
import java.io.LineNumberReader;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
public class TestCompile extends LuceneTestCase {
|
public class TestCompile extends LuceneTestCase {
|
||||||
|
@ -107,7 +109,7 @@ public class TestCompile extends LuceneTestCase {
|
||||||
Trie trie;
|
Trie trie;
|
||||||
DataInputStream is = new DataInputStream(new BufferedInputStream(
|
DataInputStream is = new DataInputStream(new BufferedInputStream(
|
||||||
new FileInputStream(path)));
|
new FileInputStream(path)));
|
||||||
String method = is.readUTF().toUpperCase();
|
String method = is.readUTF().toUpperCase(Locale.ROOT);
|
||||||
if (method.indexOf('M') < 0) {
|
if (method.indexOf('M') < 0) {
|
||||||
trie = new Trie(is);
|
trie = new Trie(is);
|
||||||
} else {
|
} else {
|
||||||
|
@ -120,11 +122,11 @@ public class TestCompile extends LuceneTestCase {
|
||||||
private static void assertTrie(Trie trie, String file, boolean usefull,
|
private static void assertTrie(Trie trie, String file, boolean usefull,
|
||||||
boolean storeorig) throws Exception {
|
boolean storeorig) throws Exception {
|
||||||
LineNumberReader in = new LineNumberReader(new BufferedReader(
|
LineNumberReader in = new LineNumberReader(new BufferedReader(
|
||||||
new FileReader(file)));
|
new InputStreamReader(new FileInputStream(file), IOUtils.CHARSET_UTF_8)));
|
||||||
|
|
||||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||||
try {
|
try {
|
||||||
line = line.toLowerCase();
|
line = line.toLowerCase(Locale.ROOT);
|
||||||
StringTokenizer st = new StringTokenizer(line);
|
StringTokenizer st = new StringTokenizer(line);
|
||||||
String stem = st.nextToken();
|
String stem = st.nextToken();
|
||||||
if (storeorig) {
|
if (storeorig) {
|
||||||
|
@ -132,7 +134,7 @@ public class TestCompile extends LuceneTestCase {
|
||||||
.getLastOnPath(stem);
|
.getLastOnPath(stem);
|
||||||
StringBuilder stm = new StringBuilder(stem);
|
StringBuilder stm = new StringBuilder(stem);
|
||||||
Diff.apply(stm, cmd);
|
Diff.apply(stm, cmd);
|
||||||
assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
|
assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
while (st.hasMoreTokens()) {
|
while (st.hasMoreTokens()) {
|
||||||
String token = st.nextToken();
|
String token = st.nextToken();
|
||||||
|
@ -143,7 +145,7 @@ public class TestCompile extends LuceneTestCase {
|
||||||
.getLastOnPath(token);
|
.getLastOnPath(token);
|
||||||
StringBuilder stm = new StringBuilder(token);
|
StringBuilder stm = new StringBuilder(token);
|
||||||
Diff.apply(stm, cmd);
|
Diff.apply(stm, cmd);
|
||||||
assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
|
assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
} catch (java.util.NoSuchElementException x) {
|
} catch (java.util.NoSuchElementException x) {
|
||||||
// no base token (stem) on a line
|
// no base token (stem) on a line
|
||||||
|
|
|
@ -262,9 +262,11 @@
|
||||||
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
|
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
|
||||||
|
|
||||||
<target name="clean-javacc">
|
<target name="clean-javacc">
|
||||||
<fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
|
<delete>
|
||||||
<containsregexp expression="Generated.*By.*JavaCC"/>
|
<fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
|
||||||
</fileset>
|
<containsregexp expression="Generated.*By.*JavaCC"/>
|
||||||
|
</fileset>
|
||||||
|
</delete>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="javacc" depends="init,javacc-check" if="javacc.present">
|
<target name="javacc" depends="init,javacc-check" if="javacc.present">
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -106,7 +107,7 @@ public class Benchmark {
|
||||||
|
|
||||||
Benchmark benchmark = null;
|
Benchmark benchmark = null;
|
||||||
try {
|
try {
|
||||||
benchmark = new Benchmark(new FileReader(algFile));
|
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
|
|
|
@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileFilter;
|
import java.io.FileFilter;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParsePosition;
|
import java.text.ParsePosition;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
@ -161,7 +163,7 @@ public class DirContentSource extends ContentSource {
|
||||||
dfi = new DateFormatInfo();
|
dfi = new DateFormatInfo();
|
||||||
dfi.pos = new ParsePosition(0);
|
dfi.pos = new ParsePosition(0);
|
||||||
// date format: 30-MAR-1987 14:22:36.87
|
// date format: 30-MAR-1987 14:22:36.87
|
||||||
dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
|
dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.ROOT);
|
||||||
dfi.df.setLenient(true);
|
dfi.df.setLenient(true);
|
||||||
dateFormat.set(dfi);
|
dateFormat.set(dfi);
|
||||||
}
|
}
|
||||||
|
@ -198,7 +200,7 @@ public class DirContentSource extends ContentSource {
|
||||||
name = f.getCanonicalPath()+"_"+iteration;
|
name = f.getCanonicalPath()+"_"+iteration;
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(f));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||||
String line = null;
|
String line = null;
|
||||||
//First line is the date, 3rd is the title, rest is body
|
//First line is the date, 3rd is the title, rest is body
|
||||||
String dateStr = reader.readLine();
|
String dateStr = reader.readLine();
|
||||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.TimeZone;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
@ -182,8 +183,8 @@ public class DocMaker implements Closeable {
|
||||||
private boolean storeBytes = false;
|
private boolean storeBytes = false;
|
||||||
|
|
||||||
private static class DateUtil {
|
private static class DateUtil {
|
||||||
public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
|
public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ROOT);
|
||||||
public Calendar cal = Calendar.getInstance();
|
public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
|
||||||
public ParsePosition pos = new ParsePosition(0);
|
public ParsePosition pos = new ParsePosition(0);
|
||||||
public DateUtil() {
|
public DateUtil() {
|
||||||
parser.setLenient(true);
|
parser.setLenient(true);
|
||||||
|
|
|
@ -25,6 +25,7 @@ import java.io.InputStreamReader;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
@ -146,7 +147,7 @@ public class EnwikiContentSource extends ContentSource {
|
||||||
case BODY:
|
case BODY:
|
||||||
body = contents.toString();
|
body = contents.toString();
|
||||||
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
||||||
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
|
||||||
if (startsWith.startsWith("#redirect")) {
|
if (startsWith.startsWith("#redirect")) {
|
||||||
body = null;
|
body = null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException;
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
|
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
|
||||||
{
|
{
|
||||||
File file = new File(fileName);
|
File file = new File(fileName);
|
||||||
Reader reader = null;
|
Reader reader = null;
|
||||||
|
// note: we use a decoding reader, so if your queries are screwed up you know
|
||||||
if (file.exists()) {
|
if (file.exists()) {
|
||||||
reader = new FileReader(file);
|
reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
|
||||||
} else {
|
} else {
|
||||||
//see if we can find it as a resource
|
//see if we can find it as a resource
|
||||||
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
|
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
|
||||||
if (asStream != null) {
|
if (asStream != null) {
|
||||||
reader = new InputStreamReader(asStream);
|
reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (reader != null) {
|
if (reader != null) {
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class LongToEnglishContentSource extends ContentSource{
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: we could take param to specify locale...
|
// TODO: we could take param to specify locale...
|
||||||
private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
|
private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
|
||||||
RuleBasedNumberFormat.SPELLOUT);
|
RuleBasedNumberFormat.SPELLOUT);
|
||||||
@Override
|
@Override
|
||||||
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
|
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
|
||||||
|
|
|
@ -37,7 +37,7 @@ public class LongToEnglishQueryMaker implements QueryMaker {
|
||||||
protected QueryParser parser;
|
protected QueryParser parser;
|
||||||
|
|
||||||
// TODO: we could take param to specify locale...
|
// TODO: we could take param to specify locale...
|
||||||
private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
|
private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
|
||||||
RuleBasedNumberFormat.SPELLOUT);
|
RuleBasedNumberFormat.SPELLOUT);
|
||||||
|
|
||||||
public Query makeQuery(int size) throws Exception {
|
public Query makeQuery(int size) throws Exception {
|
||||||
|
|
|
@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParsePosition;
|
import java.text.ParsePosition;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
@ -29,6 +30,7 @@ import java.util.Date;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A {@link ContentSource} reading from the Reuters collection.
|
* A {@link ContentSource} reading from the Reuters collection.
|
||||||
|
@ -74,7 +76,7 @@ public class ReutersContentSource extends ContentSource {
|
||||||
if (dfi == null) {
|
if (dfi == null) {
|
||||||
dfi = new DateFormatInfo();
|
dfi = new DateFormatInfo();
|
||||||
// date format: 30-MAR-1987 14:22:36.87
|
// date format: 30-MAR-1987 14:22:36.87
|
||||||
dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
|
dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.ROOT);
|
||||||
dfi.df.setLenient(true);
|
dfi.df.setLenient(true);
|
||||||
dfi.pos = new ParsePosition(0);
|
dfi.pos = new ParsePosition(0);
|
||||||
dateFormat.set(dfi);
|
dateFormat.set(dfi);
|
||||||
|
@ -112,7 +114,7 @@ public class ReutersContentSource extends ContentSource {
|
||||||
name = f.getCanonicalPath() + "_" + iteration;
|
name = f.getCanonicalPath() + "_" + iteration;
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(f));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||||
try {
|
try {
|
||||||
// First line is the date, 3rd is the title, rest is body
|
// First line is the date, 3rd is the title, rest is body
|
||||||
String dateStr = reader.readLine();
|
String dateStr = reader.readLine();
|
||||||
|
|
|
@ -108,7 +108,7 @@ public class TrecContentSource extends ContentSource {
|
||||||
dfi = new DateFormatInfo();
|
dfi = new DateFormatInfo();
|
||||||
dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
|
dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
|
||||||
for (int i = 0; i < dfi.dfs.length; i++) {
|
for (int i = 0; i < dfi.dfs.length; i++) {
|
||||||
dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
|
dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.ROOT);
|
||||||
dfi.dfs[i].setLenient(true);
|
dfi.dfs[i].setLenient(true);
|
||||||
}
|
}
|
||||||
dfi.pos = new ParsePosition(0);
|
dfi.pos = new ParsePosition(0);
|
||||||
|
|
|
@ -47,7 +47,7 @@ public abstract class TrecDocParser {
|
||||||
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
|
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
|
||||||
static {
|
static {
|
||||||
for (ParsePathType ppt : ParsePathType.values()) {
|
for (ParsePathType ppt : ParsePathType.values()) {
|
||||||
pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
|
pathName2Type.put(ppt.name().toUpperCase(Locale.ROOT),ppt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,7 +60,7 @@ public abstract class TrecDocParser {
|
||||||
public static ParsePathType pathType(File f) {
|
public static ParsePathType pathType(File f) {
|
||||||
int pathLength = 0;
|
int pathLength = 0;
|
||||||
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
|
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
|
||||||
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
|
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ROOT));
|
||||||
if (ppt!=null) {
|
if (ppt!=null) {
|
||||||
return ppt;
|
return ppt;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */
|
||||||
|
/* JavaCCOptions:STATIC=false */
|
||||||
|
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||||
|
|
||||||
|
/**
 * This interface describes a character stream that maintains line and
 * column number positions of the characters. It also has the capability
 * to back up the stream to some extent. An implementation of this
 * interface is used in the TokenManager implementation generated by
 * JavaCCParser.
 *
 * <p>All the methods except {@link #backup(int)} can be implemented in any
 * fashion; {@code backup} needs to be implemented correctly for the correct
 * operation of the lexer. The rest of the methods are used to get
 * information like line number, column number and the String that
 * constitutes a token and are not used by the lexer. Hence their
 * implementation won't affect the generated lexer's operation.
 */
public interface CharStream {

  /**
   * Returns the next character from the selected input. The method
   * of selecting the input is the responsibility of the class
   * implementing this interface.
   *
   * @return the next character of the input
   * @throws java.io.IOException any I/O failure raised by the implementation
   */
  char readChar() throws java.io.IOException;

  /**
   * Returns the column position of the character last read.
   *
   * @return the column position of the character last read
   * @deprecated Use {@link #getEndColumn()} instead.
   * @see #getEndColumn
   */
  @Deprecated
  int getColumn();

  /**
   * Returns the line number of the character last read.
   *
   * @return the line number of the character last read
   * @deprecated Use {@link #getEndLine()} instead.
   * @see #getEndLine
   */
  @Deprecated
  int getLine();

  /**
   * Returns the column number of the last character for the current token
   * (being matched after the last call to {@link #BeginToken()}).
   *
   * @return the end column of the current token
   */
  int getEndColumn();

  /**
   * Returns the line number of the last character for the current token
   * (being matched after the last call to {@link #BeginToken()}).
   *
   * @return the end line of the current token
   */
  int getEndLine();

  /**
   * Returns the column number of the first character for the current token
   * (being matched after the last call to {@link #BeginToken()}).
   *
   * @return the begin column of the current token
   */
  int getBeginColumn();

  /**
   * Returns the line number of the first character for the current token
   * (being matched after the last call to {@link #BeginToken()}).
   *
   * @return the begin line of the current token
   */
  int getBeginLine();

  /**
   * Backs up the input stream by {@code amount} steps. The lexer calls this
   * method if it had already read some characters, but could not use them to
   * match a (longer) token. So, they will be used again as the prefix of the
   * next token and it is the implementation's responsibility to do this right.
   *
   * @param amount number of characters to step back
   */
  void backup(int amount);

  /**
   * Returns the next character that marks the beginning of the next token.
   * All characters must remain in the buffer between two successive calls
   * to this method to implement backup correctly.
   *
   * @return the first character of the next token
   * @throws java.io.IOException any I/O failure raised by the implementation
   */
  char BeginToken() throws java.io.IOException;

  /**
   * Returns a string made up of characters from the marked token beginning
   * to the current buffer position. Implementations have the choice of
   * returning anything that they want to. For example, for efficiency, one
   * might decide to just return null, which is a valid implementation.
   *
   * @return the text of the current token, or whatever the implementation chooses
   */
  String GetImage();

  /**
   * Returns an array of characters that make up the suffix of length
   * {@code len} for the currently matched token. This is used to build up the
   * matched string for use in actions in the case of MORE. A simple and
   * inefficient implementation of this is as follows:
   *
   * <pre>
   *   {
   *      String t = GetImage();
   *      return t.substring(t.length() - len, t.length()).toCharArray();
   *   }
   * </pre>
   *
   * @param len length of the requested suffix
   * @return the last {@code len} characters of the currently matched token
   */
  char[] GetSuffix(int len);

  /**
   * The lexer calls this function to indicate that it is done with the stream
   * and hence implementations can free any resources held by this class.
   * Again, the body of this function can be just empty and it will not
   * affect the lexer's operation.
   */
  void Done();

}
|
||||||
|
/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */
|
|
@ -0,0 +1,123 @@
|
||||||
|
// FastCharStream.java
|
||||||
|
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
|
||||||
|
* this does not do line-number counting, but instead keeps track of the
|
||||||
|
* character position of the token in the input, as required by Lucene's {@link
|
||||||
|
* org.apache.lucene.analysis.Token} API.
|
||||||
|
* */
|
||||||
|
public final class FastCharStream implements CharStream {
|
||||||
|
char[] buffer = null;
|
||||||
|
|
||||||
|
int bufferLength = 0; // end of valid chars
|
||||||
|
int bufferPosition = 0; // next char to read
|
||||||
|
|
||||||
|
int tokenStart = 0; // offset in buffer
|
||||||
|
int bufferStart = 0; // position in file of buffer
|
||||||
|
|
||||||
|
Reader input; // source of chars
|
||||||
|
|
||||||
|
/** Constructs from a Reader. */
|
||||||
|
public FastCharStream(Reader r) {
|
||||||
|
input = r;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char readChar() throws IOException {
|
||||||
|
if (bufferPosition >= bufferLength)
|
||||||
|
refill();
|
||||||
|
return buffer[bufferPosition++];
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void refill() throws IOException {
|
||||||
|
int newPosition = bufferLength - tokenStart;
|
||||||
|
|
||||||
|
if (tokenStart == 0) { // token won't fit in buffer
|
||||||
|
if (buffer == null) { // first time: alloc buffer
|
||||||
|
buffer = new char[2048];
|
||||||
|
} else if (bufferLength == buffer.length) { // grow buffer
|
||||||
|
char[] newBuffer = new char[buffer.length*2];
|
||||||
|
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
|
||||||
|
buffer = newBuffer;
|
||||||
|
}
|
||||||
|
} else { // shift token to front
|
||||||
|
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
|
||||||
|
}
|
||||||
|
|
||||||
|
bufferLength = newPosition; // update state
|
||||||
|
bufferPosition = newPosition;
|
||||||
|
bufferStart += tokenStart;
|
||||||
|
tokenStart = 0;
|
||||||
|
|
||||||
|
int charsRead = // fill space in buffer
|
||||||
|
input.read(buffer, newPosition, buffer.length-newPosition);
|
||||||
|
if (charsRead == -1)
|
||||||
|
throw new IOException("read past eof");
|
||||||
|
else
|
||||||
|
bufferLength += charsRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char BeginToken() throws IOException {
|
||||||
|
tokenStart = bufferPosition;
|
||||||
|
return readChar();
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void backup(int amount) {
|
||||||
|
bufferPosition -= amount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final String GetImage() {
|
||||||
|
return new String(buffer, tokenStart, bufferPosition - tokenStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char[] GetSuffix(int len) {
|
||||||
|
char[] value = new char[len];
|
||||||
|
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void Done() {
|
||||||
|
try {
|
||||||
|
input.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final int getColumn() {
|
||||||
|
return bufferStart + bufferPosition;
|
||||||
|
}
|
||||||
|
public final int getLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
public final int getEndColumn() {
|
||||||
|
return bufferStart + bufferPosition;
|
||||||
|
}
|
||||||
|
public final int getEndLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
public final int getBeginColumn() {
|
||||||
|
return bufferStart + tokenStart;
|
||||||
|
}
|
||||||
|
public final int getBeginLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
|
@ -29,6 +29,10 @@ public class HTMLParser implements HTMLParserConstants {
|
||||||
private MyPipedInputStream pipeInStream = null;
|
private MyPipedInputStream pipeInStream = null;
|
||||||
private PipedOutputStream pipeOutStream = null;
|
private PipedOutputStream pipeOutStream = null;
|
||||||
|
|
||||||
|
/** Creates a parser that reads its input from {@code reader}, wrapped in a {@link FastCharStream}. */
public HTMLParser(Reader reader) {
|
||||||
|
this(new FastCharStream(reader));
|
||||||
|
}
|
||||||
|
|
||||||
private class MyPipedInputStream extends PipedInputStream{
|
private class MyPipedInputStream extends PipedInputStream{
|
||||||
|
|
||||||
public MyPipedInputStream(){
|
public MyPipedInputStream(){
|
||||||
|
@ -227,7 +231,7 @@ InterruptedException {
|
||||||
Token t1, t2;
|
Token t1, t2;
|
||||||
boolean inImg = false;
|
boolean inImg = false;
|
||||||
t1 = jj_consume_token(TagName);
|
t1 = jj_consume_token(TagName);
|
||||||
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
|
String tagName = t1.image.toLowerCase(Locale.ROOT);
|
||||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||||
addSpace();
|
addSpace();
|
||||||
}
|
}
|
||||||
|
@ -264,7 +268,7 @@ InterruptedException {
|
||||||
)
|
)
|
||||||
&& t2 != null)
|
&& t2 != null)
|
||||||
{
|
{
|
||||||
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
|
currentMetaTag=t2.image.toLowerCase(Locale.ROOT);
|
||||||
if(currentMetaTag != null && currentMetaContent != null) {
|
if(currentMetaTag != null && currentMetaContent != null) {
|
||||||
addMetaTag();
|
addMetaTag();
|
||||||
}
|
}
|
||||||
|
@ -272,7 +276,7 @@ InterruptedException {
|
||||||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||||
null)
|
null)
|
||||||
{
|
{
|
||||||
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
|
currentMetaContent=t2.image.toLowerCase(Locale.ROOT);
|
||||||
if(currentMetaTag != null && currentMetaContent != null) {
|
if(currentMetaTag != null && currentMetaContent != null) {
|
||||||
addMetaTag();
|
addMetaTag();
|
||||||
}
|
}
|
||||||
|
@ -464,7 +468,6 @@ null)
|
||||||
|
|
||||||
/** Generated Token Manager. */
|
/** Generated Token Manager. */
|
||||||
public HTMLParserTokenManager token_source;
|
public HTMLParserTokenManager token_source;
|
||||||
SimpleCharStream jj_input_stream;
|
|
||||||
/** Current token. */
|
/** Current token. */
|
||||||
public Token token;
|
public Token token;
|
||||||
/** Next token. */
|
/** Next token. */
|
||||||
|
@ -485,14 +488,9 @@ null)
|
||||||
private boolean jj_rescan = false;
|
private boolean jj_rescan = false;
|
||||||
private int jj_gc = 0;
|
private int jj_gc = 0;
|
||||||
|
|
||||||
/** Constructor with InputStream. */
|
/** Constructor with user supplied CharStream. */
|
||||||
public HTMLParser(java.io.InputStream stream) {
|
public HTMLParser(CharStream stream) {
|
||||||
this(stream, null);
|
token_source = new HTMLParserTokenManager(stream);
|
||||||
}
|
|
||||||
/** Constructor with InputStream and supplied encoding */
|
|
||||||
public HTMLParser(java.io.InputStream stream, String encoding) {
|
|
||||||
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
|
||||||
token_source = new HTMLParserTokenManager(jj_input_stream);
|
|
||||||
token = new Token();
|
token = new Token();
|
||||||
jj_ntk = -1;
|
jj_ntk = -1;
|
||||||
jj_gen = 0;
|
jj_gen = 0;
|
||||||
|
@ -501,35 +499,8 @@ null)
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Reinitialise. */
|
/** Reinitialise. */
|
||||||
public void ReInit(java.io.InputStream stream) {
|
public void ReInit(CharStream stream) {
|
||||||
ReInit(stream, null);
|
token_source.ReInit(stream);
|
||||||
}
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream stream, String encoding) {
|
|
||||||
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
|
||||||
token_source.ReInit(jj_input_stream);
|
|
||||||
token = new Token();
|
|
||||||
jj_ntk = -1;
|
|
||||||
jj_gen = 0;
|
|
||||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
|
||||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Constructor. */
|
|
||||||
public HTMLParser(java.io.Reader stream) {
|
|
||||||
jj_input_stream = new SimpleCharStream(stream, 1, 1);
|
|
||||||
token_source = new HTMLParserTokenManager(jj_input_stream);
|
|
||||||
token = new Token();
|
|
||||||
jj_ntk = -1;
|
|
||||||
jj_gen = 0;
|
|
||||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
|
||||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.Reader stream) {
|
|
||||||
jj_input_stream.ReInit(stream, 1, 1);
|
|
||||||
token_source.ReInit(jj_input_stream);
|
|
||||||
token = new Token();
|
token = new Token();
|
||||||
jj_ntk = -1;
|
jj_ntk = -1;
|
||||||
jj_gen = 0;
|
jj_gen = 0;
|
||||||
|
@ -631,7 +602,7 @@ null)
|
||||||
return (jj_ntk = jj_nt.kind);
|
return (jj_ntk = jj_nt.kind);
|
||||||
}
|
}
|
||||||
|
|
||||||
private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
|
private java.util.List jj_expentries = new java.util.ArrayList();
|
||||||
private int[] jj_expentry;
|
private int[] jj_expentry;
|
||||||
private int jj_kind = -1;
|
private int jj_kind = -1;
|
||||||
private int[] jj_lasttokens = new int[100];
|
private int[] jj_lasttokens = new int[100];
|
||||||
|
@ -691,7 +662,7 @@ null)
|
||||||
jj_add_error_token(0, 0);
|
jj_add_error_token(0, 0);
|
||||||
int[][] exptokseq = new int[jj_expentries.size()][];
|
int[][] exptokseq = new int[jj_expentries.size()][];
|
||||||
for (int i = 0; i < jj_expentries.size(); i++) {
|
for (int i = 0; i < jj_expentries.size(); i++) {
|
||||||
exptokseq[i] = jj_expentries.get(i);
|
exptokseq[i] = (int[])jj_expentries.get(i);
|
||||||
}
|
}
|
||||||
return new ParseException(token, exptokseq, tokenImage);
|
return new ParseException(token, exptokseq, tokenImage);
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ options {
|
||||||
//DEBUG_LOOKAHEAD = true;
|
//DEBUG_LOOKAHEAD = true;
|
||||||
//DEBUG_TOKEN_MANAGER = true;
|
//DEBUG_TOKEN_MANAGER = true;
|
||||||
UNICODE_INPUT = true;
|
UNICODE_INPUT = true;
|
||||||
|
USER_CHAR_STREAM=true;
|
||||||
}
|
}
|
||||||
|
|
||||||
PARSER_BEGIN(HTMLParser)
|
PARSER_BEGIN(HTMLParser)
|
||||||
|
@ -56,6 +57,10 @@ public class HTMLParser {
|
||||||
private MyPipedInputStream pipeInStream = null;
|
private MyPipedInputStream pipeInStream = null;
|
||||||
private PipedOutputStream pipeOutStream = null;
|
private PipedOutputStream pipeOutStream = null;
|
||||||
|
|
||||||
|
/** Creates a parser that reads its input from {@code reader}, wrapped in a {@link FastCharStream}. */
public HTMLParser(Reader reader) {
|
||||||
|
this(new FastCharStream(reader));
|
||||||
|
}
|
||||||
|
|
||||||
private class MyPipedInputStream extends PipedInputStream{
|
private class MyPipedInputStream extends PipedInputStream{
|
||||||
|
|
||||||
public MyPipedInputStream(){
|
public MyPipedInputStream(){
|
||||||
|
@ -227,7 +232,7 @@ void Tag() throws IOException :
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
t1=<TagName> {
|
t1=<TagName> {
|
||||||
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
|
String tagName = t1.image.toLowerCase(Locale.ROOT);
|
||||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||||
addSpace();
|
addSpace();
|
||||||
}
|
}
|
||||||
|
@ -249,7 +254,7 @@ void Tag() throws IOException :
|
||||||
)
|
)
|
||||||
&& t2 != null)
|
&& t2 != null)
|
||||||
{
|
{
|
||||||
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
|
currentMetaTag=t2.image.toLowerCase(Locale.ROOT);
|
||||||
if(currentMetaTag != null && currentMetaContent != null) {
|
if(currentMetaTag != null && currentMetaContent != null) {
|
||||||
addMetaTag();
|
addMetaTag();
|
||||||
}
|
}
|
||||||
|
@ -257,7 +262,7 @@ void Tag() throws IOException :
|
||||||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||||
null)
|
null)
|
||||||
{
|
{
|
||||||
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
|
currentMetaContent=t2.image.toLowerCase(Locale.ROOT);
|
||||||
if(currentMetaTag != null && currentMetaContent != null) {
|
if(currentMetaTag != null && currentMetaContent != null) {
|
||||||
addMetaTag();
|
addMetaTag();
|
||||||
}
|
}
|
||||||
|
|
|
@ -464,7 +464,7 @@ private int jjMoveNfa_0(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -569,7 +569,7 @@ private int jjMoveNfa_5(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -670,7 +670,7 @@ private int jjMoveNfa_7(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -766,7 +766,7 @@ private int jjMoveNfa_4(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -892,7 +892,7 @@ private int jjMoveNfa_3(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -1061,7 +1061,7 @@ private int jjMoveNfa_6(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -1205,7 +1205,7 @@ private int jjMoveNfa_1(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -1361,7 +1361,7 @@ private int jjMoveNfa_2(int startState, int curPos)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int hiByte = (curChar >> 8);
|
int hiByte = (int)(curChar >> 8);
|
||||||
int i1 = hiByte >> 6;
|
int i1 = hiByte >> 6;
|
||||||
long l1 = 1L << (hiByte & 077);
|
long l1 = 1L << (hiByte & 077);
|
||||||
int i2 = (curChar & 0xff) >> 6;
|
int i2 = (curChar & 0xff) >> 6;
|
||||||
|
@ -1441,25 +1441,23 @@ static final long[] jjtoToken = {
|
||||||
static final long[] jjtoSkip = {
|
static final long[] jjtoSkip = {
|
||||||
0x400000L,
|
0x400000L,
|
||||||
};
|
};
|
||||||
protected SimpleCharStream input_stream;
|
protected CharStream input_stream;
|
||||||
private final int[] jjrounds = new int[28];
|
private final int[] jjrounds = new int[28];
|
||||||
private final int[] jjstateSet = new int[56];
|
private final int[] jjstateSet = new int[56];
|
||||||
protected char curChar;
|
protected char curChar;
|
||||||
/** Constructor. */
|
/** Constructor. */
|
||||||
public HTMLParserTokenManager(SimpleCharStream stream){
|
public HTMLParserTokenManager(CharStream stream){
|
||||||
if (SimpleCharStream.staticFlag)
|
|
||||||
throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
|
|
||||||
input_stream = stream;
|
input_stream = stream;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Constructor. */
|
/** Constructor. */
|
||||||
public HTMLParserTokenManager(SimpleCharStream stream, int lexState){
|
public HTMLParserTokenManager(CharStream stream, int lexState){
|
||||||
this(stream);
|
this(stream);
|
||||||
SwitchTo(lexState);
|
SwitchTo(lexState);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Reinitialise parser. */
|
/** Reinitialise parser. */
|
||||||
public void ReInit(SimpleCharStream stream)
|
public void ReInit(CharStream stream)
|
||||||
{
|
{
|
||||||
jjmatchedPos = jjnewStateCnt = 0;
|
jjmatchedPos = jjnewStateCnt = 0;
|
||||||
curLexState = defaultLexState;
|
curLexState = defaultLexState;
|
||||||
|
@ -1475,7 +1473,7 @@ private void ReInitRounds()
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Reinitialise parser. */
|
/** Reinitialise parser. */
|
||||||
public void ReInit(SimpleCharStream stream, int lexState)
|
public void ReInit(CharStream stream, int lexState)
|
||||||
{
|
{
|
||||||
ReInit(stream);
|
ReInit(stream);
|
||||||
SwitchTo(lexState);
|
SwitchTo(lexState);
|
||||||
|
|
|
@ -195,4 +195,4 @@ public class ParseException extends Exception {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
/* JavaCC - OriginalChecksum=e5376178619291bc9d2c0c6647dc3cef (do not edit this line) */
|
/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */
|
||||||
|
|
|
@ -1,472 +0,0 @@
|
||||||
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.1 */
|
|
||||||
/* JavaCCOptions:STATIC=false */
|
|
||||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An implementation of interface CharStream, where the stream is assumed to
|
|
||||||
* contain only ASCII characters (without unicode processing).
|
|
||||||
*/
|
|
||||||
|
|
||||||
public class SimpleCharStream
|
|
||||||
{
|
|
||||||
/** Whether parser is static. */
|
|
||||||
public static final boolean staticFlag = false;
|
|
||||||
int bufsize;
|
|
||||||
int available;
|
|
||||||
int tokenBegin;
|
|
||||||
/** Position in buffer. */
|
|
||||||
public int bufpos = -1;
|
|
||||||
protected int bufline[];
|
|
||||||
protected int bufcolumn[];
|
|
||||||
|
|
||||||
protected int column = 0;
|
|
||||||
protected int line = 1;
|
|
||||||
|
|
||||||
protected boolean prevCharIsCR = false;
|
|
||||||
protected boolean prevCharIsLF = false;
|
|
||||||
|
|
||||||
protected java.io.Reader inputStream;
|
|
||||||
|
|
||||||
protected char[] buffer;
|
|
||||||
protected int maxNextCharInd = 0;
|
|
||||||
protected int inBuf = 0;
|
|
||||||
protected int tabSize = 8;
|
|
||||||
|
|
||||||
protected void setTabSize(int i) { tabSize = i; }
|
|
||||||
protected int getTabSize(int i) { return tabSize; }
|
|
||||||
|
|
||||||
|
|
||||||
protected void ExpandBuff(boolean wrapAround)
|
|
||||||
{
|
|
||||||
char[] newbuffer = new char[bufsize + 2048];
|
|
||||||
int newbufline[] = new int[bufsize + 2048];
|
|
||||||
int newbufcolumn[] = new int[bufsize + 2048];
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
if (wrapAround)
|
|
||||||
{
|
|
||||||
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
|
|
||||||
System.arraycopy(buffer, 0, newbuffer,
|
|
||||||
bufsize - tokenBegin, bufpos);
|
|
||||||
buffer = newbuffer;
|
|
||||||
|
|
||||||
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
|
|
||||||
System.arraycopy(bufline, 0, newbufline, bufsize - tokenBegin, bufpos);
|
|
||||||
bufline = newbufline;
|
|
||||||
|
|
||||||
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
|
|
||||||
System.arraycopy(bufcolumn, 0, newbufcolumn, bufsize - tokenBegin, bufpos);
|
|
||||||
bufcolumn = newbufcolumn;
|
|
||||||
|
|
||||||
maxNextCharInd = (bufpos += (bufsize - tokenBegin));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
|
|
||||||
buffer = newbuffer;
|
|
||||||
|
|
||||||
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
|
|
||||||
bufline = newbufline;
|
|
||||||
|
|
||||||
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
|
|
||||||
bufcolumn = newbufcolumn;
|
|
||||||
|
|
||||||
maxNextCharInd = (bufpos -= tokenBegin);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Throwable t)
|
|
||||||
{
|
|
||||||
throw new Error(t.getMessage());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bufsize += 2048;
|
|
||||||
available = bufsize;
|
|
||||||
tokenBegin = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void FillBuff() throws java.io.IOException
|
|
||||||
{
|
|
||||||
if (maxNextCharInd == available)
|
|
||||||
{
|
|
||||||
if (available == bufsize)
|
|
||||||
{
|
|
||||||
if (tokenBegin > 2048)
|
|
||||||
{
|
|
||||||
bufpos = maxNextCharInd = 0;
|
|
||||||
available = tokenBegin;
|
|
||||||
}
|
|
||||||
else if (tokenBegin < 0)
|
|
||||||
bufpos = maxNextCharInd = 0;
|
|
||||||
else
|
|
||||||
ExpandBuff(false);
|
|
||||||
}
|
|
||||||
else if (available > tokenBegin)
|
|
||||||
available = bufsize;
|
|
||||||
else if ((tokenBegin - available) < 2048)
|
|
||||||
ExpandBuff(true);
|
|
||||||
else
|
|
||||||
available = tokenBegin;
|
|
||||||
}
|
|
||||||
|
|
||||||
int i;
|
|
||||||
try {
|
|
||||||
if ((i = inputStream.read(buffer, maxNextCharInd,
|
|
||||||
available - maxNextCharInd)) == -1)
|
|
||||||
{
|
|
||||||
inputStream.close();
|
|
||||||
throw new java.io.IOException();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
maxNextCharInd += i;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
catch(java.io.IOException e) {
|
|
||||||
--bufpos;
|
|
||||||
backup(0);
|
|
||||||
if (tokenBegin == -1)
|
|
||||||
tokenBegin = bufpos;
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Start. */
|
|
||||||
public char BeginToken() throws java.io.IOException
|
|
||||||
{
|
|
||||||
tokenBegin = -1;
|
|
||||||
char c = readChar();
|
|
||||||
tokenBegin = bufpos;
|
|
||||||
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void UpdateLineColumn(char c)
|
|
||||||
{
|
|
||||||
column++;
|
|
||||||
|
|
||||||
if (prevCharIsLF)
|
|
||||||
{
|
|
||||||
prevCharIsLF = false;
|
|
||||||
line += (column = 1);
|
|
||||||
}
|
|
||||||
else if (prevCharIsCR)
|
|
||||||
{
|
|
||||||
prevCharIsCR = false;
|
|
||||||
if (c == '\n')
|
|
||||||
{
|
|
||||||
prevCharIsLF = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
line += (column = 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (c)
|
|
||||||
{
|
|
||||||
case '\r' :
|
|
||||||
prevCharIsCR = true;
|
|
||||||
break;
|
|
||||||
case '\n' :
|
|
||||||
prevCharIsLF = true;
|
|
||||||
break;
|
|
||||||
case '\t' :
|
|
||||||
column--;
|
|
||||||
column += (tabSize - (column % tabSize));
|
|
||||||
break;
|
|
||||||
default :
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
bufline[bufpos] = line;
|
|
||||||
bufcolumn[bufpos] = column;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Read a character. */
|
|
||||||
public char readChar() throws java.io.IOException
|
|
||||||
{
|
|
||||||
if (inBuf > 0)
|
|
||||||
{
|
|
||||||
--inBuf;
|
|
||||||
|
|
||||||
if (++bufpos == bufsize)
|
|
||||||
bufpos = 0;
|
|
||||||
|
|
||||||
return buffer[bufpos];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (++bufpos >= maxNextCharInd)
|
|
||||||
FillBuff();
|
|
||||||
|
|
||||||
char c = buffer[bufpos];
|
|
||||||
|
|
||||||
UpdateLineColumn(c);
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @deprecated
|
|
||||||
* @see #getEndColumn
|
|
||||||
*/
|
|
||||||
|
|
||||||
public int getColumn() {
|
|
||||||
return bufcolumn[bufpos];
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @deprecated
|
|
||||||
* @see #getEndLine
|
|
||||||
*/
|
|
||||||
|
|
||||||
public int getLine() {
|
|
||||||
return bufline[bufpos];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get token end column number. */
|
|
||||||
public int getEndColumn() {
|
|
||||||
return bufcolumn[bufpos];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get token end line number. */
|
|
||||||
public int getEndLine() {
|
|
||||||
return bufline[bufpos];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get token beginning column number. */
|
|
||||||
public int getBeginColumn() {
|
|
||||||
return bufcolumn[tokenBegin];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get token beginning line number. */
|
|
||||||
public int getBeginLine() {
|
|
||||||
return bufline[tokenBegin];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Backup a number of characters. */
|
|
||||||
public void backup(int amount) {
|
|
||||||
|
|
||||||
inBuf += amount;
|
|
||||||
if ((bufpos -= amount) < 0)
|
|
||||||
bufpos += bufsize;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Constructor. */
|
|
||||||
public SimpleCharStream(java.io.Reader dstream, int startline,
|
|
||||||
int startcolumn, int buffersize)
|
|
||||||
{
|
|
||||||
inputStream = dstream;
|
|
||||||
line = startline;
|
|
||||||
column = startcolumn - 1;
|
|
||||||
|
|
||||||
available = bufsize = buffersize;
|
|
||||||
buffer = new char[buffersize];
|
|
||||||
bufline = new int[buffersize];
|
|
||||||
bufcolumn = new int[buffersize];
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Constructor. */
|
|
||||||
public SimpleCharStream(java.io.Reader dstream, int startline,
|
|
||||||
int startcolumn)
|
|
||||||
{
|
|
||||||
this(dstream, startline, startcolumn, 4096);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Constructor that delegates to the four-argument reader constructor with
 * start line 1, start column 1 and a buffer size of 4096.
 *
 * @param dstream reader supplying the characters
 */
public SimpleCharStream(java.io.Reader dstream) {
  this(dstream, 1, 1, 4096);
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.Reader dstream, int startline,
|
|
||||||
int startcolumn, int buffersize)
|
|
||||||
{
|
|
||||||
inputStream = dstream;
|
|
||||||
line = startline;
|
|
||||||
column = startcolumn - 1;
|
|
||||||
|
|
||||||
if (buffer == null || buffersize != buffer.length)
|
|
||||||
{
|
|
||||||
available = bufsize = buffersize;
|
|
||||||
buffer = new char[buffersize];
|
|
||||||
bufline = new int[buffersize];
|
|
||||||
bufcolumn = new int[buffersize];
|
|
||||||
}
|
|
||||||
prevCharIsLF = prevCharIsCR = false;
|
|
||||||
tokenBegin = inBuf = maxNextCharInd = 0;
|
|
||||||
bufpos = -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.Reader dstream, int startline,
|
|
||||||
int startcolumn)
|
|
||||||
{
|
|
||||||
ReInit(dstream, startline, startcolumn, 4096);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.Reader dstream)
|
|
||||||
{
|
|
||||||
ReInit(dstream, 1, 1, 4096);
|
|
||||||
}
|
|
||||||
/**
 * Constructor that wraps a byte stream in an {@code InputStreamReader} and
 * delegates to the four-argument reader constructor.
 *
 * @param dstream byte stream to decode
 * @param encoding charset name used for decoding; when {@code null} the
 *        platform default charset is used
 * @param startline initial line number
 * @param startcolumn initial column number
 * @param buffersize length of the internal buffers
 * @throws java.io.UnsupportedEncodingException if {@code encoding} names an
 *         unsupported charset
 */
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
    int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
{
  // The fallback to the platform default charset is spelled out explicitly
  // rather than relying on the implicit InputStreamReader(InputStream) form.
  this(encoding == null
          ? new java.io.InputStreamReader(dstream, java.nio.charset.Charset.defaultCharset())
          : new java.io.InputStreamReader(dstream, encoding),
      startline, startcolumn, buffersize);
}
|
|
||||||
|
|
||||||
/**
 * Constructor that decodes a byte stream with the platform default charset
 * and delegates to the four-argument reader constructor.
 *
 * @param dstream byte stream to decode
 * @param startline initial line number
 * @param startcolumn initial column number
 * @param buffersize length of the internal buffers
 */
public SimpleCharStream(java.io.InputStream dstream, int startline,
    int startcolumn, int buffersize)
{
  // The platform default charset is named explicitly so the dependency on
  // local configuration is visible at the call site.
  this(new java.io.InputStreamReader(dstream, java.nio.charset.Charset.defaultCharset()),
      startline, startcolumn, buffersize);
}
|
|
||||||
|
|
||||||
/**
 * Constructor that delegates to the five-argument stream/encoding
 * constructor with a buffer size of 4096.
 *
 * @param dstream byte stream to decode
 * @param encoding charset name passed through to the delegate
 * @param startline initial line number
 * @param startcolumn initial column number
 * @throws java.io.UnsupportedEncodingException propagated from the delegate
 */
public SimpleCharStream(java.io.InputStream dstream, String encoding,
    int startline, int startcolumn)
    throws java.io.UnsupportedEncodingException {
  this(dstream, encoding, startline, startcolumn, 4096);
}
|
|
||||||
|
|
||||||
/**
 * Constructor that delegates to the four-argument stream constructor with
 * a buffer size of 4096.
 *
 * @param dstream byte stream to decode
 * @param startline initial line number
 * @param startcolumn initial column number
 */
public SimpleCharStream(java.io.InputStream dstream, int startline, int startcolumn) {
  this(dstream, startline, startcolumn, 4096);
}
|
|
||||||
|
|
||||||
/**
 * Constructor that delegates to the five-argument stream/encoding
 * constructor with start line 1, start column 1 and a buffer size of 4096.
 *
 * @param dstream byte stream to decode
 * @param encoding charset name passed through to the delegate
 * @throws java.io.UnsupportedEncodingException propagated from the delegate
 */
public SimpleCharStream(java.io.InputStream dstream, String encoding)
    throws java.io.UnsupportedEncodingException {
  this(dstream, encoding, 1, 1, 4096);
}
|
|
||||||
|
|
||||||
/**
 * Constructor that delegates to the four-argument stream constructor with
 * start line 1, start column 1 and a buffer size of 4096.
 *
 * @param dstream byte stream to decode
 */
public SimpleCharStream(java.io.InputStream dstream) {
  this(dstream, 1, 1, 4096);
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
|
||||||
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
|
|
||||||
{
|
|
||||||
ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream, int startline,
|
|
||||||
int startcolumn, int buffersize)
|
|
||||||
{
|
|
||||||
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
|
|
||||||
{
|
|
||||||
ReInit(dstream, encoding, 1, 1, 4096);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream)
|
|
||||||
{
|
|
||||||
ReInit(dstream, 1, 1, 4096);
|
|
||||||
}
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
|
||||||
int startcolumn) throws java.io.UnsupportedEncodingException
|
|
||||||
{
|
|
||||||
ReInit(dstream, encoding, startline, startcolumn, 4096);
|
|
||||||
}
|
|
||||||
/** Reinitialise. */
|
|
||||||
public void ReInit(java.io.InputStream dstream, int startline,
|
|
||||||
int startcolumn)
|
|
||||||
{
|
|
||||||
ReInit(dstream, startline, startcolumn, 4096);
|
|
||||||
}
|
|
||||||
/** Get token literal value. */
|
|
||||||
public String GetImage()
|
|
||||||
{
|
|
||||||
if (bufpos >= tokenBegin)
|
|
||||||
return new String(buffer, tokenBegin, bufpos - tokenBegin + 1);
|
|
||||||
else
|
|
||||||
return new String(buffer, tokenBegin, bufsize - tokenBegin) +
|
|
||||||
new String(buffer, 0, bufpos + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get the suffix. */
|
|
||||||
public char[] GetSuffix(int len)
|
|
||||||
{
|
|
||||||
char[] ret = new char[len];
|
|
||||||
|
|
||||||
if ((bufpos + 1) >= len)
|
|
||||||
System.arraycopy(buffer, bufpos - len + 1, ret, 0, len);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
System.arraycopy(buffer, bufsize - (len - bufpos - 1), ret, 0,
|
|
||||||
len - bufpos - 1);
|
|
||||||
System.arraycopy(buffer, 0, ret, len - bufpos - 1, bufpos + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reset buffer when finished. */
|
|
||||||
public void Done()
|
|
||||||
{
|
|
||||||
buffer = null;
|
|
||||||
bufline = null;
|
|
||||||
bufcolumn = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Method to adjust line and column numbers for the start of a token.
|
|
||||||
*/
|
|
||||||
public void adjustBeginLineColumn(int newLine, int newCol)
|
|
||||||
{
|
|
||||||
int start = tokenBegin;
|
|
||||||
int len;
|
|
||||||
|
|
||||||
if (bufpos >= tokenBegin)
|
|
||||||
{
|
|
||||||
len = bufpos - tokenBegin + inBuf + 1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
len = bufsize - tokenBegin + bufpos + 1 + inBuf;
|
|
||||||
}
|
|
||||||
|
|
||||||
int i = 0, j = 0, k = 0;
|
|
||||||
int nextColDiff = 0, columnDiff = 0;
|
|
||||||
|
|
||||||
while (i < len &&
|
|
||||||
bufline[j = start % bufsize] == bufline[k = ++start % bufsize])
|
|
||||||
{
|
|
||||||
bufline[j] = newLine;
|
|
||||||
nextColDiff = columnDiff + bufcolumn[k] - bufcolumn[j];
|
|
||||||
bufcolumn[j] = newCol + columnDiff;
|
|
||||||
columnDiff = nextColDiff;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i < len)
|
|
||||||
{
|
|
||||||
bufline[j] = newLine++;
|
|
||||||
bufcolumn[j] = newCol + columnDiff;
|
|
||||||
|
|
||||||
while (i++ < len)
|
|
||||||
{
|
|
||||||
if (bufline[j = start % bufsize] != bufline[++start % bufsize])
|
|
||||||
bufline[j] = newLine++;
|
|
||||||
else
|
|
||||||
bufline[j] = newLine;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
line = bufline[j];
|
|
||||||
column = bufcolumn[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
/* JavaCC - OriginalChecksum=7c2e625567f11c3058995b779d0149ad (do not edit this line) */
|
|
|
@ -121,4 +121,4 @@ public class Token {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
/* JavaCC - OriginalChecksum=e49c2a0c10d50ff2ebd0639552330ce7 (do not edit this line) */
|
/* JavaCC - OriginalChecksum=24643dc85fd6daeec42ceba20b46ee61 (do not edit this line) */
|
||||||
|
|
|
@ -138,4 +138,4 @@ public class TokenMgrError extends Error
|
||||||
this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
|
this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* JavaCC - OriginalChecksum=3aee554f696e5d7a18b1ad330c1de53f (do not edit this line) */
|
/* JavaCC - OriginalChecksum=538f0da130356fcc0bc7db621ab0389d (do not edit this line) */
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
|
@ -61,7 +62,7 @@ public class AddDocTask extends PerfTask {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getLogMessage(int recsCount) {
|
protected String getLogMessage(int recsCount) {
|
||||||
return String.format("added %9d docs",recsCount);
|
return String.format(Locale.ROOT, "added %9d docs",recsCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -40,6 +40,7 @@ import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an index. <br>
|
* Create an index. <br>
|
||||||
|
@ -182,7 +183,7 @@ public class CreateIndexTask extends PerfTask {
|
||||||
iwc.setInfoStream(System.err);
|
iwc.setInfoStream(System.err);
|
||||||
} else {
|
} else {
|
||||||
File f = new File(infoStreamVal).getAbsoluteFile();
|
File f = new File(infoStreamVal).getAbsoluteFile();
|
||||||
iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f))));
|
iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);
|
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.stats.Points;
|
import org.apache.lucene.benchmark.byTask.stats.Points;
|
||||||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||||
|
@ -266,7 +268,7 @@ public abstract class PerfTask implements Cloneable {
|
||||||
public void tearDown() throws Exception {
|
public void tearDown() throws Exception {
|
||||||
if (++logStepCount % logStep == 0) {
|
if (++logStepCount % logStep == 0) {
|
||||||
double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
|
double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
|
||||||
System.out.println(String.format("%7.2f",time) + " sec --> "
|
System.out.println(String.format(Locale.ROOT, "%7.2f",time) + " sec --> "
|
||||||
+ Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
|
+ Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,7 +77,7 @@ public class SearchWithSortTask extends ReadTask {
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
|
throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
|
||||||
}
|
}
|
||||||
sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ENGLISH)));
|
sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ROOT)));
|
||||||
}
|
}
|
||||||
sortFields[upto++] = sortField0;
|
sortFields[upto++] = sortField0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
@ -428,7 +429,7 @@ public class TaskSequence extends PerfTask {
|
||||||
sb.append(padd);
|
sb.append(padd);
|
||||||
sb.append(!letChildReport ? ">" : (parallel ? "]" : "}"));
|
sb.append(!letChildReport ? ">" : (parallel ? "]" : "}"));
|
||||||
if (fixedTime) {
|
if (fixedTime) {
|
||||||
sb.append(" " + NumberFormat.getNumberInstance().format(runTimeSec) + "s");
|
sb.append(" " + NumberFormat.getNumberInstance(Locale.ROOT).format(runTimeSec) + "s");
|
||||||
} else if (repetitions>1) {
|
} else if (repetitions>1) {
|
||||||
sb.append(" * " + repetitions);
|
sb.append(" * " + repetitions);
|
||||||
} else if (repetitions==REPEAT_EXHAUST) {
|
} else if (repetitions==REPEAT_EXHAUST) {
|
||||||
|
@ -487,7 +488,7 @@ public class TaskSequence extends PerfTask {
|
||||||
if (rate>0) {
|
if (rate>0) {
|
||||||
seqName += "_" + rate + (perMin?"/min":"/sec");
|
seqName += "_" + rate + (perMin?"/min":"/sec");
|
||||||
}
|
}
|
||||||
if (parallel && seqName.toLowerCase().indexOf("par")<0) {
|
if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) {
|
||||||
seqName += "_Par";
|
seqName += "_Par";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.StringReader;
|
||||||
import java.lang.reflect.Constructor;
|
import java.lang.reflect.Constructor;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||||
|
@ -159,7 +160,7 @@ public class Algorithm {
|
||||||
} else {
|
} else {
|
||||||
stok.nextToken();
|
stok.nextToken();
|
||||||
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
||||||
String unit = stok.sval.toLowerCase();
|
String unit = stok.sval.toLowerCase(Locale.ROOT);
|
||||||
if ("min".equals(unit)) {
|
if ("min".equals(unit)) {
|
||||||
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
|
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
|
||||||
} else if ("sec".equals(unit)) {
|
} else if ("sec".equals(unit)) {
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTask.utils;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Formatting utilities (for reports).
|
* Formatting utilities (for reports).
|
||||||
|
@ -25,9 +26,9 @@ import java.text.NumberFormat;
|
||||||
public class Format {
|
public class Format {
|
||||||
|
|
||||||
private static NumberFormat numFormat [] = {
|
private static NumberFormat numFormat [] = {
|
||||||
NumberFormat.getInstance(),
|
NumberFormat.getInstance(Locale.ROOT),
|
||||||
NumberFormat.getInstance(),
|
NumberFormat.getInstance(Locale.ROOT),
|
||||||
NumberFormat.getInstance(),
|
NumberFormat.getInstance(Locale.ROOT),
|
||||||
};
|
};
|
||||||
private static final String padd = " ";
|
private static final String padd = " ";
|
||||||
|
|
||||||
|
|
|
@ -99,7 +99,7 @@ public class StreamUtils {
|
||||||
String fileName = file.getName();
|
String fileName = file.getName();
|
||||||
int idx = fileName.lastIndexOf('.');
|
int idx = fileName.lastIndexOf('.');
|
||||||
if (idx != -1) {
|
if (idx != -1) {
|
||||||
type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ENGLISH));
|
type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
return type==null ? Type.PLAIN : type;
|
return type==null ? Type.PLAIN : type;
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.quality;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Results of quality benchmark run for a single query or for a set of queries.
|
* Results of quality benchmark run for a single query or for a set of queries.
|
||||||
|
@ -141,7 +142,7 @@ public class QualityStats {
|
||||||
logger.println(title);
|
logger.println(title);
|
||||||
}
|
}
|
||||||
prefix = prefix==null ? "" : prefix;
|
prefix = prefix==null ? "" : prefix;
|
||||||
NumberFormat nf = NumberFormat.getInstance();
|
NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||||
nf.setMaximumFractionDigits(3);
|
nf.setMaximumFractionDigits(3);
|
||||||
nf.setMinimumFractionDigits(3);
|
nf.setMinimumFractionDigits(3);
|
||||||
nf.setGroupingUsed(true);
|
nf.setGroupingUsed(true);
|
||||||
|
|
|
@ -24,11 +24,13 @@ import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -51,7 +53,7 @@ public class QueryDriver {
|
||||||
|
|
||||||
File topicsFile = new File(args[0]);
|
File topicsFile = new File(args[0]);
|
||||||
File qrelsFile = new File(args[1]);
|
File qrelsFile = new File(args[1]);
|
||||||
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
|
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene");
|
||||||
FSDirectory dir = FSDirectory.open(new File(args[3]));
|
FSDirectory dir = FSDirectory.open(new File(args[3]));
|
||||||
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
|
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
|
||||||
IndexReader reader = DirectoryReader.open(dir);
|
IndexReader reader = DirectoryReader.open(dir);
|
||||||
|
@ -60,14 +62,14 @@ public class QueryDriver {
|
||||||
int maxResults = 1000;
|
int maxResults = 1000;
|
||||||
String docNameField = "docname";
|
String docNameField = "docname";
|
||||||
|
|
||||||
PrintWriter logger = new PrintWriter(System.out, true);
|
PrintWriter logger = new PrintWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()), true);
|
||||||
|
|
||||||
// use trec utilities to read trec topics into quality queries
|
// use trec utilities to read trec topics into quality queries
|
||||||
TrecTopicsReader qReader = new TrecTopicsReader();
|
TrecTopicsReader qReader = new TrecTopicsReader();
|
||||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
|
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8)));
|
||||||
|
|
||||||
// prepare judge, with trec utilities that read from a QRels file
|
// prepare judge, with trec utilities that read from a QRels file
|
||||||
Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
|
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8)));
|
||||||
|
|
||||||
// validate topics & judgments match each other
|
// validate topics & judgments match each other
|
||||||
judge.validateData(qqs, logger);
|
judge.validateData(qqs, logger);
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.quality.utils;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.quality.QualityQuery;
|
import org.apache.lucene.benchmark.quality.QualityQuery;
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
|
@ -45,7 +46,7 @@ public class SubmissionReport {
|
||||||
public SubmissionReport (PrintWriter logger, String name) {
|
public SubmissionReport (PrintWriter logger, String name) {
|
||||||
this.logger = logger;
|
this.logger = logger;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
nf = NumberFormat.getInstance();
|
nf = NumberFormat.getInstance(Locale.ROOT);
|
||||||
nf.setMaximumFractionDigits(4);
|
nf.setMaximumFractionDigits(4);
|
||||||
nf.setMinimumFractionDigits(4);
|
nf.setMinimumFractionDigits(4);
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,12 +19,18 @@ package org.apache.lucene.benchmark.utils;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileFilter;
|
import java.io.FileFilter;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
|
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
|
||||||
|
@ -73,7 +79,7 @@ public class ExtractReuters {
|
||||||
*/
|
*/
|
||||||
protected void extractFile(File sgmFile) {
|
protected void extractFile(File sgmFile) {
|
||||||
try {
|
try {
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));
|
||||||
|
|
||||||
StringBuilder buffer = new StringBuilder(1024);
|
StringBuilder buffer = new StringBuilder(1024);
|
||||||
StringBuilder outBuffer = new StringBuilder(1024);
|
StringBuilder outBuffer = new StringBuilder(1024);
|
||||||
|
@ -107,7 +113,7 @@ public class ExtractReuters {
|
||||||
File outFile = new File(outputDir, sgmFile.getName() + "-"
|
File outFile = new File(outputDir, sgmFile.getName() + "-"
|
||||||
+ (docNumber++) + ".txt");
|
+ (docNumber++) + ".txt");
|
||||||
// System.out.println("Writing " + outFile);
|
// System.out.println("Writing " + outFile);
|
||||||
FileWriter writer = new FileWriter(outFile);
|
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8);
|
||||||
writer.write(out);
|
writer.write(out);
|
||||||
writer.close();
|
writer.close();
|
||||||
outBuffer.setLength(0);
|
outBuffer.setLength(0);
|
||||||
|
|
|
@ -18,8 +18,10 @@ package org.apache.lucene.benchmark.utils;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Writer;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
|
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
|
||||||
|
@ -28,6 +30,7 @@ import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract the downloaded Wikipedia dump into separate files for indexing.
|
* Extract the downloaded Wikipedia dump into separate files for indexing.
|
||||||
|
@ -83,7 +86,7 @@ public class ExtractWikipedia {
|
||||||
contents.append("\n");
|
contents.append("\n");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
FileWriter writer = new FileWriter(f);
|
Writer writer = new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8);
|
||||||
writer.write(contents.toString());
|
writer.write(contents.toString());
|
||||||
writer.close();
|
writer.close();
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
|
|
|
@ -166,7 +166,7 @@ public class DocMakerTest extends BenchmarkTestCase {
|
||||||
// DocMaker did not close its ContentSource if resetInputs was called twice,
|
// DocMaker did not close its ContentSource if resetInputs was called twice,
|
||||||
// leading to a file handle leak.
|
// leading to a file handle leak.
|
||||||
File f = new File(getWorkDir(), "docMakerLeak.txt");
|
File f = new File(getWorkDir(), "docMakerLeak.txt");
|
||||||
PrintStream ps = new PrintStream(f);
|
PrintStream ps = new PrintStream(f, "UTF-8");
|
||||||
ps.println("one title\t" + System.currentTimeMillis() + "\tsome content");
|
ps.println("one title\t" + System.currentTimeMillis() + "\tsome content");
|
||||||
ps.close();
|
ps.close();
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
|
@ -50,7 +51,7 @@ public class CreateIndexTaskTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
PrintStream curOut = System.out;
|
PrintStream curOut = System.out;
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
System.setOut(new PrintStream(baos));
|
System.setOut(new PrintStream(baos, false, Charset.defaultCharset().name()));
|
||||||
try {
|
try {
|
||||||
PerfRunData runData = createPerfRunData("SystemOut");
|
PerfRunData runData = createPerfRunData("SystemOut");
|
||||||
CreateIndexTask cit = new CreateIndexTask(runData);
|
CreateIndexTask cit = new CreateIndexTask(runData);
|
||||||
|
@ -63,7 +64,7 @@ public class CreateIndexTaskTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
PrintStream curErr = System.err;
|
PrintStream curErr = System.err;
|
||||||
baos.reset();
|
baos.reset();
|
||||||
System.setErr(new PrintStream(baos));
|
System.setErr(new PrintStream(baos, false, Charset.defaultCharset().name()));
|
||||||
try {
|
try {
|
||||||
PerfRunData runData = createPerfRunData("SystemErr");
|
PerfRunData runData = createPerfRunData("SystemErr");
|
||||||
CreateIndexTask cit = new CreateIndexTask(runData);
|
CreateIndexTask cit = new CreateIndexTask(runData);
|
||||||
|
|
|
@ -31,6 +31,7 @@ import java.io.OutputStreamWriter;
|
||||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -88,7 +89,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
private File rawTextFile(String ext) throws Exception {
|
private File rawTextFile(String ext) throws Exception {
|
||||||
File f = new File(testDir,"testfile." + ext);
|
File f = new File(testDir,"testfile." + ext);
|
||||||
BufferedWriter w = new BufferedWriter(new FileWriter(f));
|
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8));
|
||||||
w.write(TEXT);
|
w.write(TEXT);
|
||||||
w.newLine();
|
w.newLine();
|
||||||
w.close();
|
w.close();
|
||||||
|
@ -117,7 +118,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeText(OutputStream os) throws IOException {
|
private void writeText(OutputStream os) throws IOException {
|
||||||
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
|
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, IOUtils.CHARSET_UTF_8));
|
||||||
w.write(TEXT);
|
w.write(TEXT);
|
||||||
w.newLine();
|
w.newLine();
|
||||||
w.close();
|
w.close();
|
||||||
|
@ -125,7 +126,7 @@ public class StreamUtilsTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
private void assertReadText(File f) throws Exception {
|
private void assertReadText(File f) throws Exception {
|
||||||
InputStream ir = StreamUtils.inputStream(f);
|
InputStream ir = StreamUtils.inputStream(f);
|
||||||
InputStreamReader in = new InputStreamReader(ir);
|
InputStreamReader in = new InputStreamReader(ir, IOUtils.CHARSET_UTF_8);
|
||||||
BufferedReader r = new BufferedReader(in);
|
BufferedReader r = new BufferedReader(in);
|
||||||
String line = r.readLine();
|
String line = r.readLine();
|
||||||
assertEquals("Wrong text found in "+f.getName(), TEXT, line);
|
assertEquals("Wrong text found in "+f.getName(), TEXT, line);
|
||||||
|
|
|
@ -31,7 +31,9 @@ import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test that quality run does its job.
|
* Test that quality run does its job.
|
||||||
|
@ -55,7 +57,7 @@ public class TestQualityRun extends BenchmarkTestCase {
|
||||||
int maxResults = 1000;
|
int maxResults = 1000;
|
||||||
String docNameField = "doctitle"; // orig docID is in the linedoc format title
|
String docNameField = "doctitle"; // orig docID is in the linedoc format title
|
||||||
|
|
||||||
PrintWriter logger = VERBOSE ? new PrintWriter(System.out,true) : null;
|
PrintWriter logger = VERBOSE ? new PrintWriter(new OutputStreamWriter(System.out, Charset.defaultCharset()),true) : null;
|
||||||
|
|
||||||
// prepare topics
|
// prepare topics
|
||||||
InputStream topics = getClass().getResourceAsStream("trecTopics.txt");
|
InputStream topics = getClass().getResourceAsStream("trecTopics.txt");
|
||||||
|
|
|
@ -169,11 +169,19 @@
|
||||||
</clover-report>
|
</clover-report>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<!-- Validate once from top-level. -->
|
<!-- Validation (license/notice/api checks). -->
|
||||||
<target name="validate" depends="compile-tools,resolve" description="Validate legal stuff.">
|
<target name="validate" depends="check-licenses,check-forbidden-apis" description="Validate stuff." />
|
||||||
|
|
||||||
|
<target name="check-licenses" depends="compile-tools,resolve,load-custom-tasks" description="Validate license stuff.">
|
||||||
<license-check-macro dir="${basedir}" />
|
<license-check-macro dir="${basedir}" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<target name="check-forbidden-apis" depends="compile-tools,compile-test,load-custom-tasks" description="Check forbidden API calls in compiled class files.">
|
||||||
|
<forbidden-apis apiFile="${custom-tasks.dir}/forbiddenApis/jdk.txt">
|
||||||
|
<fileset dir="${basedir}/build" includes="**/*.class" />
|
||||||
|
</forbidden-apis>
|
||||||
|
</target>
|
||||||
|
|
||||||
<target name="resolve">
|
<target name="resolve">
|
||||||
<sequential>
|
<sequential>
|
||||||
<ant dir="test-framework" target="resolve" inheritall="false">
|
<ant dir="test-framework" target="resolve" inheritall="false">
|
||||||
|
|
|
@ -68,6 +68,7 @@
|
||||||
executable="${python.exe}" failonerror="true">
|
executable="${python.exe}" failonerror="true">
|
||||||
<arg line="createLevAutomata.py @{n} False"/>
|
<arg line="createLevAutomata.py @{n} False"/>
|
||||||
</exec>
|
</exec>
|
||||||
|
<fixcrlf srcdir="src/java/org/apache/lucene/util/automaton" includes="*ParametricDescription.java" encoding="UTF-8"/>
|
||||||
</sequential>
|
</sequential>
|
||||||
</macrodef>
|
</macrodef>
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,10 @@ package org.apache.lucene.codecs;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||||
|
@ -345,7 +347,12 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
final PrintStream out = new PrintStream(bos);
|
PrintStream out;
|
||||||
|
try {
|
||||||
|
out = new PrintStream(bos, false, "UTF-8");
|
||||||
|
} catch (UnsupportedEncodingException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
|
|
||||||
out.println(" index FST:");
|
out.println(" index FST:");
|
||||||
out.println(" " + indexNodeCount + " nodes");
|
out.println(" " + indexNodeCount + " nodes");
|
||||||
|
@ -353,7 +360,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
out.println(" " + indexNumBytes + " bytes");
|
out.println(" " + indexNumBytes + " bytes");
|
||||||
out.println(" terms:");
|
out.println(" terms:");
|
||||||
out.println(" " + totalTermCount + " terms");
|
out.println(" " + totalTermCount + " terms");
|
||||||
out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format("%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : ""));
|
out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : ""));
|
||||||
out.println(" blocks:");
|
out.println(" blocks:");
|
||||||
out.println(" " + totalBlockCount + " blocks");
|
out.println(" " + totalBlockCount + " blocks");
|
||||||
out.println(" " + termsOnlyBlockCount + " terms-only blocks");
|
out.println(" " + termsOnlyBlockCount + " terms-only blocks");
|
||||||
|
@ -362,9 +369,9 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
out.println(" " + floorBlockCount + " floor blocks");
|
out.println(" " + floorBlockCount + " floor blocks");
|
||||||
out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks");
|
out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks");
|
||||||
out.println(" " + floorSubBlockCount + " floor sub-blocks");
|
out.println(" " + floorSubBlockCount + " floor sub-blocks");
|
||||||
out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format("%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
|
out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
|
||||||
out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format("%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
|
out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
|
||||||
out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format("%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
|
out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
|
||||||
if (totalBlockCount != 0) {
|
if (totalBlockCount != 0) {
|
||||||
out.println(" by prefix length:");
|
out.println(" by prefix length:");
|
||||||
int total = 0;
|
int total = 0;
|
||||||
|
@ -372,13 +379,17 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
||||||
final int blockCount = blockCountByPrefixLen[prefix];
|
final int blockCount = blockCountByPrefixLen[prefix];
|
||||||
total += blockCount;
|
total += blockCount;
|
||||||
if (blockCount != 0) {
|
if (blockCount != 0) {
|
||||||
out.println(" " + String.format("%2d", prefix) + ": " + blockCount);
|
out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert totalBlockCount == total;
|
assert totalBlockCount == total;
|
||||||
}
|
}
|
||||||
|
|
||||||
return bos.toString();
|
try {
|
||||||
|
return bos.toString("UTF-8");
|
||||||
|
} catch (UnsupportedEncodingException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class DateTools {
|
||||||
private static final ThreadLocal<Calendar> TL_CAL = new ThreadLocal<Calendar>() {
|
private static final ThreadLocal<Calendar> TL_CAL = new ThreadLocal<Calendar>() {
|
||||||
@Override
|
@Override
|
||||||
protected Calendar initialValue() {
|
protected Calendar initialValue() {
|
||||||
return Calendar.getInstance(GMT, Locale.US);
|
return Calendar.getInstance(GMT, Locale.ROOT);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -194,7 +194,7 @@ public class DateTools {
|
||||||
this.formatLen = formatLen;
|
this.formatLen = formatLen;
|
||||||
// formatLen 10's place: 11111111
|
// formatLen 10's place: 11111111
|
||||||
// formatLen 1's place: 12345678901234567
|
// formatLen 1's place: 12345678901234567
|
||||||
this.format = new SimpleDateFormat("yyyyMMddHHmmssSSS".substring(0,formatLen),Locale.US);
|
this.format = new SimpleDateFormat("yyyyMMddHHmmssSSS".substring(0,formatLen),Locale.ROOT);
|
||||||
this.format.setTimeZone(GMT);
|
this.format.setTimeZone(GMT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -202,7 +202,7 @@ public class DateTools {
|
||||||
* in lowercase (for backwards compatibility) */
|
* in lowercase (for backwards compatibility) */
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return super.toString().toLowerCase(Locale.ENGLISH);
|
return super.toString().toLowerCase(Locale.ROOT);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.BlockTreeTermsReader;
|
import org.apache.lucene.codecs.BlockTreeTermsReader;
|
||||||
|
@ -341,7 +342,7 @@ public class CheckIndex {
|
||||||
* you only call this when the index is not opened by any
|
* you only call this when the index is not opened by any
|
||||||
* writer. */
|
* writer. */
|
||||||
public Status checkIndex(List<String> onlySegments) throws IOException {
|
public Status checkIndex(List<String> onlySegments) throws IOException {
|
||||||
NumberFormat nf = NumberFormat.getInstance();
|
NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||||
SegmentInfos sis = new SegmentInfos();
|
SegmentInfos sis = new SegmentInfos();
|
||||||
Status result = new Status();
|
Status result = new Status();
|
||||||
result.dir = dir;
|
result.dir = dir;
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
@ -181,7 +182,7 @@ class DocumentsWriterPerThread {
|
||||||
private int flushedDocCount;
|
private int flushedDocCount;
|
||||||
DocumentsWriterDeleteQueue deleteQueue;
|
DocumentsWriterDeleteQueue deleteQueue;
|
||||||
DeleteSlice deleteSlice;
|
DeleteSlice deleteSlice;
|
||||||
private final NumberFormat nf = NumberFormat.getInstance();
|
private final NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||||
final Allocator byteBlockAllocator;
|
final Allocator byteBlockAllocator;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
@ -3610,7 +3611,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
|
||||||
// lost...
|
// lost...
|
||||||
|
|
||||||
if (infoStream.isEnabled("IW")) {
|
if (infoStream.isEnabled("IW")) {
|
||||||
infoStream.message("IW", String.format("merged segment size=%.3f MB vs estimate=%.3f MB", merge.info.info.sizeInBytes()/1024./1024., merge.estimatedMergeBytes/1024/1024.));
|
infoStream.message("IW", String.format(Locale.ROOT, "merged segment size=%.3f MB vs estimate=%.3f MB", merge.info.info.sizeInBytes()/1024./1024., merge.estimatedMergeBytes/1024/1024.));
|
||||||
}
|
}
|
||||||
|
|
||||||
final IndexReaderWarmer mergedSegmentWarmer = config.getMergedSegmentWarmer();
|
final IndexReaderWarmer mergedSegmentWarmer = config.getMergedSegmentWarmer();
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
@ -535,7 +536,7 @@ public abstract class LogMergePolicy extends MergePolicy {
|
||||||
if (size >= maxMergeSize) {
|
if (size >= maxMergeSize) {
|
||||||
extra += " [skip: too large]";
|
extra += " [skip: too large]";
|
||||||
}
|
}
|
||||||
message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format("%.3f MB", segBytes/1024/1024.) + extra);
|
message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format(Locale.ROOT, "%.3f MB", segBytes/1024/1024.) + extra);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.index;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -289,7 +290,7 @@ public class TieredMergePolicy extends MergePolicy {
|
||||||
} else if (segBytes < floorSegmentBytes) {
|
} else if (segBytes < floorSegmentBytes) {
|
||||||
extra += " [floored]";
|
extra += " [floored]";
|
||||||
}
|
}
|
||||||
message(" seg=" + writer.get().segString(info) + " size=" + String.format("%.3f", segBytes/1024/1024.) + " MB" + extra);
|
message(" seg=" + writer.get().segString(info) + " size=" + String.format(Locale.ROOT, "%.3f", segBytes/1024/1024.) + " MB" + extra);
|
||||||
}
|
}
|
||||||
|
|
||||||
minSegmentBytes = Math.min(segBytes, minSegmentBytes);
|
minSegmentBytes = Math.min(segBytes, minSegmentBytes);
|
||||||
|
@ -388,7 +389,7 @@ public class TieredMergePolicy extends MergePolicy {
|
||||||
|
|
||||||
final MergeScore score = score(candidate, hitTooLarge, mergingBytes);
|
final MergeScore score = score(candidate, hitTooLarge, mergingBytes);
|
||||||
if (verbose()) {
|
if (verbose()) {
|
||||||
message(" maybe=" + writer.get().segString(candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format("%.3f MB", totAfterMergeBytes/1024./1024.));
|
message(" maybe=" + writer.get().segString(candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format(Locale.ROOT, "%.3f MB", totAfterMergeBytes/1024./1024.));
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we are already running a max sized merge
|
// If we are already running a max sized merge
|
||||||
|
@ -413,7 +414,7 @@ public class TieredMergePolicy extends MergePolicy {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbose()) {
|
if (verbose()) {
|
||||||
message(" add merge=" + writer.get().segString(merge.segments) + " size=" + String.format("%.3f MB", bestMergeBytes/1024./1024.) + " score=" + String.format("%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""));
|
message(" add merge=" + writer.get().segString(merge.segments) + " size=" + String.format(Locale.ROOT, "%.3f MB", bestMergeBytes/1024./1024.) + " score=" + String.format(Locale.ROOT, "%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return spec;
|
return spec;
|
||||||
|
@ -475,7 +476,7 @@ public class TieredMergePolicy extends MergePolicy {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getExplanation() {
|
public String getExplanation() {
|
||||||
return "skew=" + String.format("%.3f", skew) + " nonDelRatio=" + String.format("%.3f", nonDelRatio);
|
return "skew=" + String.format(Locale.ROOT, "%.3f", skew) + " nonDelRatio=" + String.format(Locale.ROOT, "%.3f", nonDelRatio);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.search.similarities;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -92,6 +94,6 @@ public class LMDirichletSimilarity extends LMSimilarity {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return String.format("Dirichlet(%f)", getMu());
|
return String.format(Locale.ROOT, "Dirichlet(%f)", getMu());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.search.similarities;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -72,6 +74,6 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return String.format("Jelinek-Mercer(%f)", getLambda());
|
return String.format(Locale.ROOT, "Jelinek-Mercer(%f)", getLambda());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.search.similarities;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.search.CollectionStatistics;
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
import org.apache.lucene.search.TermStatistics;
|
import org.apache.lucene.search.TermStatistics;
|
||||||
|
@ -91,9 +93,9 @@ public abstract class LMSimilarity extends SimilarityBase {
|
||||||
public String toString() {
|
public String toString() {
|
||||||
String coll = collectionModel.getName();
|
String coll = collectionModel.getName();
|
||||||
if (coll != null) {
|
if (coll != null) {
|
||||||
return String.format("LM %s - %s", getName(), coll);
|
return String.format(Locale.ROOT, "LM %s - %s", getName(), coll);
|
||||||
} else {
|
} else {
|
||||||
return String.format("LM %s", getName());
|
return String.format(Locale.ROOT, "LM %s", getName());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.util;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.ThreadFactory;
|
import java.util.concurrent.ThreadFactory;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
@ -43,7 +44,7 @@ public class NamedThreadFactory implements ThreadFactory {
|
||||||
final SecurityManager s = System.getSecurityManager();
|
final SecurityManager s = System.getSecurityManager();
|
||||||
group = (s != null) ? s.getThreadGroup() : Thread.currentThread()
|
group = (s != null) ? s.getThreadGroup() : Thread.currentThread()
|
||||||
.getThreadGroup();
|
.getThreadGroup();
|
||||||
this.threadNamePrefix = String.format(NAME_PATTERN,
|
this.threadNamePrefix = String.format(Locale.ROOT, NAME_PATTERN,
|
||||||
checkPrefix(threadNamePrefix), threadPoolNumber.getAndIncrement());
|
checkPrefix(threadNamePrefix), threadPoolNumber.getAndIncrement());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,7 +58,7 @@ public class NamedThreadFactory implements ThreadFactory {
|
||||||
* @see java.util.concurrent.ThreadFactory#newThread(java.lang.Runnable)
|
* @see java.util.concurrent.ThreadFactory#newThread(java.lang.Runnable)
|
||||||
*/
|
*/
|
||||||
public Thread newThread(Runnable r) {
|
public Thread newThread(Runnable r) {
|
||||||
final Thread t = new Thread(group, r, String.format("%s-%d",
|
final Thread t = new Thread(group, r, String.format(Locale.ROOT, "%s-%d",
|
||||||
this.threadNamePrefix, threadNumber.getAndIncrement()), 0);
|
this.threadNamePrefix, threadNumber.getAndIncrement()), 0);
|
||||||
t.setDaemon(false);
|
t.setDaemon(false);
|
||||||
t.setPriority(Thread.NORM_PRIORITY);
|
t.setPriority(Thread.NORM_PRIORITY);
|
||||||
|
|
|
@ -559,7 +559,7 @@ public final class RamUsageEstimator {
|
||||||
*/
|
*/
|
||||||
public static String humanReadableUnits(long bytes) {
|
public static String humanReadableUnits(long bytes) {
|
||||||
return humanReadableUnits(bytes,
|
return humanReadableUnits(bytes,
|
||||||
new DecimalFormat("0.#", DecimalFormatSymbols.getInstance(Locale.ENGLISH)));
|
new DecimalFormat("0.#", DecimalFormatSymbols.getInstance(Locale.ROOT)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -73,7 +73,7 @@ public enum Version {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Version parseLeniently(String version) {
|
public static Version parseLeniently(String version) {
|
||||||
String parsedMatchVersion = version.toUpperCase(Locale.ENGLISH);
|
String parsedMatchVersion = version.toUpperCase(Locale.ROOT);
|
||||||
return Version.valueOf(parsedMatchVersion.replaceFirst("^(\\d)\\.(\\d)$", "LUCENE_$1$2"));
|
return Version.valueOf(parsedMatchVersion.replaceFirst("^(\\d)\\.(\\d)$", "LUCENE_$1$2"));
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -121,7 +121,7 @@ def main():
|
||||||
|
|
||||||
w('package org.apache.lucene.util.automaton;')
|
w('package org.apache.lucene.util.automaton;')
|
||||||
w('')
|
w('')
|
||||||
w('/**')
|
w('/*')
|
||||||
w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
|
w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
|
||||||
w(' * contributor license agreements. See the NOTICE file distributed with')
|
w(' * contributor license agreements. See the NOTICE file distributed with')
|
||||||
w(' * this work for additional information regarding copyright ownership.')
|
w(' * this work for additional information regarding copyright ownership.')
|
||||||
|
|
|
@ -159,7 +159,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
|
||||||
|
|
||||||
public void testAppendableInterface() {
|
public void testAppendableInterface() {
|
||||||
CharTermAttributeImpl t = new CharTermAttributeImpl();
|
CharTermAttributeImpl t = new CharTermAttributeImpl();
|
||||||
Formatter formatter = new Formatter(t, Locale.US);
|
Formatter formatter = new Formatter(t, Locale.ROOT);
|
||||||
formatter.format("%d", 1234);
|
formatter.format("%d", 1234);
|
||||||
assertEquals("1234", t.toString());
|
assertEquals("1234", t.toString());
|
||||||
formatter.format("%d", 5678);
|
formatter.format("%d", 5678);
|
||||||
|
|
|
@ -71,7 +71,7 @@ public class Test10KPulsings extends LuceneTestCase {
|
||||||
Field field = newField("field", "", ft);
|
Field field = newField("field", "", ft);
|
||||||
document.add(field);
|
document.add(field);
|
||||||
|
|
||||||
NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ENGLISH));
|
NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));
|
||||||
|
|
||||||
for (int i = 0; i < 10050; i++) {
|
for (int i = 0; i < 10050; i++) {
|
||||||
field.setStringValue(df.format(i));
|
field.setStringValue(df.format(i));
|
||||||
|
@ -122,7 +122,7 @@ public class Test10KPulsings extends LuceneTestCase {
|
||||||
Field field = newField("field", "", ft);
|
Field field = newField("field", "", ft);
|
||||||
document.add(field);
|
document.add(field);
|
||||||
|
|
||||||
NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ENGLISH));
|
NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));
|
||||||
|
|
||||||
final int freq = freqCutoff + 1;
|
final int freq = freqCutoff + 1;
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ public class TestBinaryDocument extends LuceneTestCase {
|
||||||
{
|
{
|
||||||
FieldType ft = new FieldType();
|
FieldType ft = new FieldType();
|
||||||
ft.setStored(true);
|
ft.setStored(true);
|
||||||
StoredField binaryFldStored = new StoredField("binaryStored", binaryValStored.getBytes());
|
StoredField binaryFldStored = new StoredField("binaryStored", binaryValStored.getBytes("UTF-8"));
|
||||||
Field stringFldStored = new Field("stringStored", binaryValStored, ft);
|
Field stringFldStored = new Field("stringStored", binaryValStored, ft);
|
||||||
|
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
|
@ -62,7 +62,7 @@ public class TestBinaryDocument extends LuceneTestCase {
|
||||||
/** fetch the binary stored field and compare it's content with the original one */
|
/** fetch the binary stored field and compare it's content with the original one */
|
||||||
BytesRef bytes = docFromReader.getBinaryValue("binaryStored");
|
BytesRef bytes = docFromReader.getBinaryValue("binaryStored");
|
||||||
assertNotNull(bytes);
|
assertNotNull(bytes);
|
||||||
String binaryFldStoredTest = new String(bytes.bytes, bytes.offset, bytes.length);
|
String binaryFldStoredTest = new String(bytes.bytes, bytes.offset, bytes.length, "UTF-8");
|
||||||
assertTrue(binaryFldStoredTest.equals(binaryValStored));
|
assertTrue(binaryFldStoredTest.equals(binaryValStored));
|
||||||
|
|
||||||
/** fetch the string field and compare it's content with the original one */
|
/** fetch the string field and compare it's content with the original one */
|
||||||
|
@ -75,7 +75,7 @@ public class TestBinaryDocument extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCompressionTools() throws Exception {
|
public void testCompressionTools() throws Exception {
|
||||||
StoredField binaryFldCompressed = new StoredField("binaryCompressed", CompressionTools.compress(binaryValCompressed.getBytes()));
|
StoredField binaryFldCompressed = new StoredField("binaryCompressed", CompressionTools.compress(binaryValCompressed.getBytes("UTF-8")));
|
||||||
StoredField stringFldCompressed = new StoredField("stringCompressed", CompressionTools.compressString(binaryValCompressed));
|
StoredField stringFldCompressed = new StoredField("stringCompressed", CompressionTools.compressString(binaryValCompressed));
|
||||||
|
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
|
@ -94,7 +94,7 @@ public class TestBinaryDocument extends LuceneTestCase {
|
||||||
assertTrue(docFromReader != null);
|
assertTrue(docFromReader != null);
|
||||||
|
|
||||||
/** fetch the binary compressed field and compare it's content with the original one */
|
/** fetch the binary compressed field and compare it's content with the original one */
|
||||||
String binaryFldCompressedTest = new String(CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed")));
|
String binaryFldCompressedTest = new String(CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed")), "UTF-8");
|
||||||
assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
|
assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
|
||||||
assertTrue(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")).equals(binaryValCompressed));
|
assertTrue(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")).equals(binaryValCompressed));
|
||||||
|
|
||||||
|
|
|
@ -61,12 +61,12 @@ public class TestDateTools extends LuceneTestCase {
|
||||||
|
|
||||||
public void testStringtoTime() throws ParseException {
|
public void testStringtoTime() throws ParseException {
|
||||||
long time = DateTools.stringToTime("197001010000");
|
long time = DateTools.stringToTime("197001010000");
|
||||||
Calendar cal = new GregorianCalendar();
|
// we use default locale since LuceneTestCase randomizes it
|
||||||
|
Calendar cal = new GregorianCalendar(TimeZone.getTimeZone("GMT"), Locale.getDefault());
|
||||||
cal.clear();
|
cal.clear();
|
||||||
cal.set(1970, 0, 1, // year=1970, month=january, day=1
|
cal.set(1970, 0, 1, // year=1970, month=january, day=1
|
||||||
0, 0, 0); // hour, minute, second
|
0, 0, 0); // hour, minute, second
|
||||||
cal.set(Calendar.MILLISECOND, 0);
|
cal.set(Calendar.MILLISECOND, 0);
|
||||||
cal.setTimeZone(TimeZone.getTimeZone("GMT"));
|
|
||||||
assertEquals(cal.getTime().getTime(), time);
|
assertEquals(cal.getTime().getTime(), time);
|
||||||
cal.set(1980, 1, 2, // year=1980, month=february, day=2
|
cal.set(1980, 1, 2, // year=1980, month=february, day=2
|
||||||
11, 5, 0); // hour, minute, second
|
11, 5, 0); // hour, minute, second
|
||||||
|
@ -76,9 +76,9 @@ public class TestDateTools extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDateAndTimetoString() throws ParseException {
|
public void testDateAndTimetoString() throws ParseException {
|
||||||
Calendar cal = new GregorianCalendar();
|
// we use default locale since LuceneTestCase randomizes it
|
||||||
|
Calendar cal = new GregorianCalendar(TimeZone.getTimeZone("GMT"), Locale.getDefault());
|
||||||
cal.clear();
|
cal.clear();
|
||||||
cal.setTimeZone(TimeZone.getTimeZone("GMT"));
|
|
||||||
cal.set(2004, 1, 3, // year=2004, month=february(!), day=3
|
cal.set(2004, 1, 3, // year=2004, month=february(!), day=3
|
||||||
22, 8, 56); // hour, minute, second
|
22, 8, 56); // hour, minute, second
|
||||||
cal.set(Calendar.MILLISECOND, 333);
|
cal.set(Calendar.MILLISECOND, 333);
|
||||||
|
@ -141,9 +141,9 @@ public class TestDateTools extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRound() {
|
public void testRound() {
|
||||||
Calendar cal = new GregorianCalendar();
|
// we use default locale since LuceneTestCase randomizes it
|
||||||
|
Calendar cal = new GregorianCalendar(TimeZone.getTimeZone("GMT"), Locale.getDefault());
|
||||||
cal.clear();
|
cal.clear();
|
||||||
cal.setTimeZone(TimeZone.getTimeZone("GMT"));
|
|
||||||
cal.set(2004, 1, 3, // year=2004, month=february(!), day=3
|
cal.set(2004, 1, 3, // year=2004, month=february(!), day=3
|
||||||
22, 8, 56); // hour, minute, second
|
22, 8, 56); // hour, minute, second
|
||||||
cal.set(Calendar.MILLISECOND, 333);
|
cal.set(Calendar.MILLISECOND, 333);
|
||||||
|
@ -180,7 +180,7 @@ public class TestDateTools extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String isoFormat(Date date) {
|
private String isoFormat(Date date) {
|
||||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS", Locale.US);
|
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS", Locale.ROOT);
|
||||||
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
|
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||||
return sdf.format(date);
|
return sdf.format(date);
|
||||||
}
|
}
|
||||||
|
|
|
@ -220,10 +220,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
|
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
CheckIndex checker = new CheckIndex(dir);
|
CheckIndex checker = new CheckIndex(dir);
|
||||||
checker.setInfoStream(new PrintStream(bos));
|
checker.setInfoStream(new PrintStream(bos, false, "UTF-8"));
|
||||||
CheckIndex.Status indexStatus = checker.checkIndex();
|
CheckIndex.Status indexStatus = checker.checkIndex();
|
||||||
assertFalse(indexStatus.clean);
|
assertFalse(indexStatus.clean);
|
||||||
assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName()));
|
assertTrue(bos.toString("UTF-8").contains(IndexFormatTooOldException.class.getName()));
|
||||||
|
|
||||||
dir.close();
|
dir.close();
|
||||||
_TestUtil.rmDir(oldIndxeDir);
|
_TestUtil.rmDir(oldIndxeDir);
|
||||||
|
|
|
@ -52,12 +52,12 @@ public class TestCheckIndex extends LuceneTestCase {
|
||||||
|
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
CheckIndex checker = new CheckIndex(dir);
|
CheckIndex checker = new CheckIndex(dir);
|
||||||
checker.setInfoStream(new PrintStream(bos));
|
checker.setInfoStream(new PrintStream(bos, false, "UTF-8"));
|
||||||
if (VERBOSE) checker.setInfoStream(System.out);
|
if (VERBOSE) checker.setInfoStream(System.out);
|
||||||
CheckIndex.Status indexStatus = checker.checkIndex();
|
CheckIndex.Status indexStatus = checker.checkIndex();
|
||||||
if (indexStatus.clean == false) {
|
if (indexStatus.clean == false) {
|
||||||
System.out.println("CheckIndex failed");
|
System.out.println("CheckIndex failed");
|
||||||
System.out.println(bos.toString());
|
System.out.println(bos.toString("UTF-8"));
|
||||||
fail();
|
fail();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,11 +17,14 @@ package org.apache.lucene.index;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileWriter;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
|
import java.io.Writer;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
@ -78,14 +81,14 @@ public class TestDoc extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private File createOutput(String name, String text) throws IOException {
|
private File createOutput(String name, String text) throws IOException {
|
||||||
FileWriter fw = null;
|
Writer fw = null;
|
||||||
PrintWriter pw = null;
|
PrintWriter pw = null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
File f = new File(workDir, name);
|
File f = new File(workDir, name);
|
||||||
if (f.exists()) f.delete();
|
if (f.exists()) f.delete();
|
||||||
|
|
||||||
fw = new FileWriter(f);
|
fw = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");
|
||||||
pw = new PrintWriter(fw);
|
pw = new PrintWriter(fw);
|
||||||
pw.println(text);
|
pw.println(text);
|
||||||
return f;
|
return f;
|
||||||
|
@ -182,9 +185,11 @@ public class TestDoc extends LuceneTestCase {
|
||||||
{
|
{
|
||||||
File file = new File(workDir, fileName);
|
File file = new File(workDir, fileName);
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(new TextField("contents", new FileReader(file), Field.Store.NO));
|
InputStreamReader is = new InputStreamReader(new FileInputStream(file), "UTF-8");
|
||||||
|
doc.add(new TextField("contents", is, Field.Store.NO));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
writer.commit();
|
writer.commit();
|
||||||
|
is.close();
|
||||||
return writer.newestSegment();
|
return writer.newestSegment();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -43,9 +43,8 @@ public class TestPayloads extends LuceneTestCase {
|
||||||
|
|
||||||
// Simple tests to test the Payload class
|
// Simple tests to test the Payload class
|
||||||
public void testPayload() throws Exception {
|
public void testPayload() throws Exception {
|
||||||
byte[] testData = "This is a test!".getBytes();
|
BytesRef payload = new BytesRef("This is a test!");
|
||||||
BytesRef payload = new BytesRef(testData);
|
assertEquals("Wrong payload length.", "This is a test!".length(), payload.length);
|
||||||
assertEquals("Wrong payload length.", testData.length, payload.length);
|
|
||||||
|
|
||||||
BytesRef clone = payload.clone();
|
BytesRef clone = payload.clone();
|
||||||
assertEquals(payload.length, clone.length);
|
assertEquals(payload.length, clone.length);
|
||||||
|
@ -73,7 +72,7 @@ public class TestPayloads extends LuceneTestCase {
|
||||||
// enabled in only some documents
|
// enabled in only some documents
|
||||||
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
|
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
|
||||||
// only add payload data for field f2
|
// only add payload data for field f2
|
||||||
analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
|
analyzer.setPayloadData("f2", "somedata".getBytes("UTF-8"), 0, 1);
|
||||||
writer.addDocument(d);
|
writer.addDocument(d);
|
||||||
// flush
|
// flush
|
||||||
writer.close();
|
writer.close();
|
||||||
|
@ -96,8 +95,8 @@ public class TestPayloads extends LuceneTestCase {
|
||||||
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
|
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
|
||||||
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
|
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
|
||||||
// add payload data for field f2 and f3
|
// add payload data for field f2 and f3
|
||||||
analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
|
analyzer.setPayloadData("f2", "somedata".getBytes("UTF-8"), 0, 1);
|
||||||
analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
|
analyzer.setPayloadData("f3", "somedata".getBytes("UTF-8"), 0, 3);
|
||||||
writer.addDocument(d);
|
writer.addDocument(d);
|
||||||
|
|
||||||
// force merge
|
// force merge
|
||||||
|
|
|
@ -29,6 +29,8 @@ import org.junit.AfterClass;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
import java.text.DecimalFormat;
|
import java.text.DecimalFormat;
|
||||||
|
import java.text.DecimalFormatSymbols;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
/** Test that BooleanQuery.setMinimumNumberShouldMatch works.
|
/** Test that BooleanQuery.setMinimumNumberShouldMatch works.
|
||||||
|
@ -378,7 +380,7 @@ public class TestBooleanMinShouldMatch extends LuceneTestCase {
|
||||||
|
|
||||||
System.err.println("------- " + test + " -------");
|
System.err.println("------- " + test + " -------");
|
||||||
|
|
||||||
DecimalFormat f = new DecimalFormat("0.000000");
|
DecimalFormat f = new DecimalFormat("0.000000", DecimalFormatSymbols.getInstance(Locale.ROOT));
|
||||||
|
|
||||||
for (int i = 0; i < h.length; i++) {
|
for (int i = 0; i < h.length; i++) {
|
||||||
StoredDocument d = searcher.doc(h[i].doc);
|
StoredDocument d = searcher.doc(h[i].doc);
|
||||||
|
|
|
@ -19,8 +19,10 @@ package org.apache.lucene.search;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Calendar;
|
import java.util.Calendar;
|
||||||
import java.util.GregorianCalendar;
|
import java.util.GregorianCalendar;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.TimeZone;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
import org.apache.lucene.document.DateTools;
|
import org.apache.lucene.document.DateTools;
|
||||||
|
@ -230,10 +232,12 @@ public class TestCustomSearcherSort extends LuceneTestCase {
|
||||||
private class RandomGen {
|
private class RandomGen {
|
||||||
RandomGen(Random random) {
|
RandomGen(Random random) {
|
||||||
this.random = random;
|
this.random = random;
|
||||||
|
base.set(1980, 1, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Random random;
|
private Random random;
|
||||||
private Calendar base = new GregorianCalendar(1980, 1, 1);
|
// we use the default Locale/TZ since LuceneTestCase randomizes it
|
||||||
|
private Calendar base = new GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
|
||||||
|
|
||||||
// Just to generate some different Lucene Date strings
|
// Just to generate some different Lucene Date strings
|
||||||
private String getLuceneDate() {
|
private String getLuceneDate() {
|
||||||
|
|
|
@ -36,6 +36,8 @@ import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
|
||||||
import java.text.DecimalFormat;
|
import java.text.DecimalFormat;
|
||||||
|
import java.text.DecimalFormatSymbols;
|
||||||
|
import java.util.Locale;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -486,7 +488,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
|
||||||
|
|
||||||
System.err.println("------- " + test + " -------");
|
System.err.println("------- " + test + " -------");
|
||||||
|
|
||||||
DecimalFormat f = new DecimalFormat("0.000000000");
|
DecimalFormat f = new DecimalFormat("0.000000000", DecimalFormatSymbols.getInstance(Locale.ROOT));
|
||||||
|
|
||||||
for (int i = 0; i < h.length; i++) {
|
for (int i = 0; i < h.length; i++) {
|
||||||
StoredDocument d = searcher.doc(h[i].doc);
|
StoredDocument d = searcher.doc(h[i].doc);
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.concurrent.CyclicBarrier;
|
import java.util.concurrent.CyclicBarrier;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
@ -117,10 +118,10 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
try {
|
try {
|
||||||
FieldCache cache = FieldCache.DEFAULT;
|
FieldCache cache = FieldCache.DEFAULT;
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
cache.setInfoStream(new PrintStream(bos));
|
cache.setInfoStream(new PrintStream(bos, false, "UTF-8"));
|
||||||
cache.getDoubles(reader, "theDouble", false);
|
cache.getDoubles(reader, "theDouble", false);
|
||||||
cache.getFloats(reader, "theDouble", false);
|
cache.getFloats(reader, "theDouble", false);
|
||||||
assertTrue(bos.toString().indexOf("WARNING") != -1);
|
assertTrue(bos.toString("UTF-8").indexOf("WARNING") != -1);
|
||||||
} finally {
|
} finally {
|
||||||
FieldCache.DEFAULT.purgeAllCaches();
|
FieldCache.DEFAULT.purgeAllCaches();
|
||||||
}
|
}
|
||||||
|
@ -261,7 +262,7 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
if (chunk == 0) {
|
if (chunk == 0) {
|
||||||
for (int ord = 0; ord < values.size(); ord++) {
|
for (int ord = 0; ord < values.size(); ord++) {
|
||||||
BytesRef term = values.get(ord);
|
BytesRef term = values.get(ord);
|
||||||
assertNull(String.format("Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
|
assertNull(String.format(Locale.ROOT, "Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -275,7 +276,7 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
reuse = termOrds.lookup(i, reuse);
|
reuse = termOrds.lookup(i, reuse);
|
||||||
reuse.read(buffer);
|
reuse.read(buffer);
|
||||||
}
|
}
|
||||||
assertTrue(String.format("Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
|
assertTrue(String.format(Locale.ROOT, "Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (chunk <= buffer.length) {
|
if (chunk <= buffer.length) {
|
||||||
|
|
|
@ -44,7 +44,7 @@ public class TestMultiValuedNumericRangeQuery extends LuceneTestCase {
|
||||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
|
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
|
||||||
.setMaxBufferedDocs(_TestUtil.nextInt(random(), 50, 1000)));
|
.setMaxBufferedDocs(_TestUtil.nextInt(random(), 50, 1000)));
|
||||||
|
|
||||||
DecimalFormat format = new DecimalFormat("00000000000", new DecimalFormatSymbols(Locale.US));
|
DecimalFormat format = new DecimalFormat("00000000000", new DecimalFormatSymbols(Locale.ROOT));
|
||||||
|
|
||||||
int num = atLeast(500);
|
int num = atLeast(500);
|
||||||
for (int l = 0; l < num; l++) {
|
for (int l = 0; l < num; l++) {
|
||||||
|
|
|
@ -58,7 +58,7 @@ public class TestRegexpRandom extends LuceneTestCase {
|
||||||
Field field = newField("field", "", customType);
|
Field field = newField("field", "", customType);
|
||||||
doc.add(field);
|
doc.add(field);
|
||||||
|
|
||||||
NumberFormat df = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ENGLISH));
|
NumberFormat df = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ROOT));
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
field.setStringValue(df.format(i));
|
field.setStringValue(df.format(i));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
|
|
|
@ -54,7 +54,7 @@ public class TestWildcardRandom extends LuceneTestCase {
|
||||||
Field field = newStringField("field", "", Field.Store.NO);
|
Field field = newStringField("field", "", Field.Store.NO);
|
||||||
doc.add(field);
|
doc.add(field);
|
||||||
|
|
||||||
NumberFormat df = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ENGLISH));
|
NumberFormat df = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ROOT));
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
field.setStringValue(df.format(i));
|
field.setStringValue(df.format(i));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
|
|
|
@ -81,7 +81,7 @@ public class TestBasics extends LuceneTestCase {
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
payloadAttr.setPayload(new BytesRef(("pos: " + pos).getBytes()));
|
payloadAttr.setPayload(new BytesRef(("pos: " + pos).getBytes("UTF-8")));
|
||||||
pos++;
|
pos++;
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -411,7 +411,7 @@ public class TestBasics extends LuceneTestCase {
|
||||||
@Test
|
@Test
|
||||||
public void testSpanPayloadCheck() throws Exception {
|
public void testSpanPayloadCheck() throws Exception {
|
||||||
SpanTermQuery term1 = new SpanTermQuery(new Term("field", "five"));
|
SpanTermQuery term1 = new SpanTermQuery(new Term("field", "five"));
|
||||||
BytesRef pay = new BytesRef(("pos: " + 5).getBytes());
|
BytesRef pay = new BytesRef(("pos: " + 5).getBytes("UTF-8"));
|
||||||
SpanQuery query = new SpanPayloadCheckQuery(term1, Collections.singletonList(pay.bytes));
|
SpanQuery query = new SpanPayloadCheckQuery(term1, Collections.singletonList(pay.bytes));
|
||||||
checkHits(query, new int[]
|
checkHits(query, new int[]
|
||||||
{1125, 1135, 1145, 1155, 1165, 1175, 1185, 1195, 1225, 1235, 1245, 1255, 1265, 1275, 1285, 1295, 1325, 1335, 1345, 1355, 1365, 1375, 1385, 1395, 1425, 1435, 1445, 1455, 1465, 1475, 1485, 1495, 1525, 1535, 1545, 1555, 1565, 1575, 1585, 1595, 1625, 1635, 1645, 1655, 1665, 1675, 1685, 1695, 1725, 1735, 1745, 1755, 1765, 1775, 1785, 1795, 1825, 1835, 1845, 1855, 1865, 1875, 1885, 1895, 1925, 1935, 1945, 1955, 1965, 1975, 1985, 1995});
|
{1125, 1135, 1145, 1155, 1165, 1175, 1185, 1195, 1225, 1235, 1245, 1255, 1265, 1275, 1285, 1295, 1325, 1335, 1345, 1355, 1365, 1375, 1385, 1395, 1425, 1435, 1445, 1455, 1465, 1475, 1485, 1495, 1525, 1535, 1545, 1555, 1565, 1575, 1585, 1595, 1625, 1635, 1645, 1655, 1665, 1675, 1685, 1695, 1725, 1735, 1745, 1755, 1765, 1775, 1785, 1795, 1825, 1835, 1845, 1855, 1865, 1875, 1885, 1895, 1925, 1935, 1945, 1955, 1965, 1975, 1985, 1995});
|
||||||
|
@ -426,8 +426,8 @@ public class TestBasics extends LuceneTestCase {
|
||||||
clauses[0] = term1;
|
clauses[0] = term1;
|
||||||
clauses[1] = term2;
|
clauses[1] = term2;
|
||||||
snq = new SpanNearQuery(clauses, 0, true);
|
snq = new SpanNearQuery(clauses, 0, true);
|
||||||
pay = new BytesRef(("pos: " + 0).getBytes());
|
pay = new BytesRef(("pos: " + 0).getBytes("UTF-8"));
|
||||||
pay2 = new BytesRef(("pos: " + 1).getBytes());
|
pay2 = new BytesRef(("pos: " + 1).getBytes("UTF-8"));
|
||||||
list = new ArrayList<byte[]>();
|
list = new ArrayList<byte[]>();
|
||||||
list.add(pay.bytes);
|
list.add(pay.bytes);
|
||||||
list.add(pay2.bytes);
|
list.add(pay2.bytes);
|
||||||
|
@ -439,9 +439,9 @@ public class TestBasics extends LuceneTestCase {
|
||||||
clauses[1] = term2;
|
clauses[1] = term2;
|
||||||
clauses[2] = new SpanTermQuery(new Term("field", "five"));
|
clauses[2] = new SpanTermQuery(new Term("field", "five"));
|
||||||
snq = new SpanNearQuery(clauses, 0, true);
|
snq = new SpanNearQuery(clauses, 0, true);
|
||||||
pay = new BytesRef(("pos: " + 0).getBytes());
|
pay = new BytesRef(("pos: " + 0).getBytes("UTF-8"));
|
||||||
pay2 = new BytesRef(("pos: " + 1).getBytes());
|
pay2 = new BytesRef(("pos: " + 1).getBytes("UTF-8"));
|
||||||
BytesRef pay3 = new BytesRef(("pos: " + 2).getBytes());
|
BytesRef pay3 = new BytesRef(("pos: " + 2).getBytes("UTF-8"));
|
||||||
list = new ArrayList<byte[]>();
|
list = new ArrayList<byte[]>();
|
||||||
list.add(pay.bytes);
|
list.add(pay.bytes);
|
||||||
list.add(pay2.bytes);
|
list.add(pay2.bytes);
|
||||||
|
@ -470,10 +470,10 @@ public class TestBasics extends LuceneTestCase {
|
||||||
checkHits(query, new int[]{1103, 1203,1303,1403,1503,1603,1703,1803,1903});
|
checkHits(query, new int[]{1103, 1203,1303,1403,1503,1603,1703,1803,1903});
|
||||||
|
|
||||||
Collection<byte[]> payloads = new ArrayList<byte[]>();
|
Collection<byte[]> payloads = new ArrayList<byte[]>();
|
||||||
BytesRef pay = new BytesRef(("pos: " + 0).getBytes());
|
BytesRef pay = new BytesRef(("pos: " + 0).getBytes("UTF-8"));
|
||||||
BytesRef pay2 = new BytesRef(("pos: " + 1).getBytes());
|
BytesRef pay2 = new BytesRef(("pos: " + 1).getBytes("UTF-8"));
|
||||||
BytesRef pay3 = new BytesRef(("pos: " + 3).getBytes());
|
BytesRef pay3 = new BytesRef(("pos: " + 3).getBytes("UTF-8"));
|
||||||
BytesRef pay4 = new BytesRef(("pos: " + 4).getBytes());
|
BytesRef pay4 = new BytesRef(("pos: " + 4).getBytes("UTF-8"));
|
||||||
payloads.add(pay.bytes);
|
payloads.add(pay.bytes);
|
||||||
payloads.add(pay2.bytes);
|
payloads.add(pay2.bytes);
|
||||||
payloads.add(pay3.bytes);
|
payloads.add(pay3.bytes);
|
||||||
|
|
|
@ -276,7 +276,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
Collection<byte[]> payloads = spans.getPayload();
|
Collection<byte[]> payloads = spans.getPayload();
|
||||||
|
|
||||||
for (final byte [] payload : payloads) {
|
for (final byte [] payload : payloads) {
|
||||||
payloadSet.add(new String(payload));
|
payloadSet.add(new String(payload, "UTF-8"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -311,7 +311,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
while (spans.next()) {
|
while (spans.next()) {
|
||||||
Collection<byte[]> payloads = spans.getPayload();
|
Collection<byte[]> payloads = spans.getPayload();
|
||||||
for (final byte[] payload : payloads) {
|
for (final byte[] payload : payloads) {
|
||||||
payloadSet.add(new String(payload));
|
payloadSet.add(new String(payload, "UTF-8"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -347,7 +347,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
Collection<byte[]> payloads = spans.getPayload();
|
Collection<byte[]> payloads = spans.getPayload();
|
||||||
|
|
||||||
for (final byte [] payload : payloads) {
|
for (final byte [] payload : payloads) {
|
||||||
payloadSet.add(new String(payload));
|
payloadSet.add(new String(payload, "UTF-8"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -383,7 +383,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
System.out.println("Num payloads:" + payloads.size());
|
System.out.println("Num payloads:" + payloads.size());
|
||||||
for (final byte [] bytes : payloads) {
|
for (final byte [] bytes : payloads) {
|
||||||
if(VERBOSE)
|
if(VERBOSE)
|
||||||
System.out.println(new String(bytes));
|
System.out.println(new String(bytes, "UTF-8"));
|
||||||
}
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
|
@ -456,7 +456,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
for (final byte [] bytes : payload) {
|
for (final byte [] bytes : payload) {
|
||||||
if(VERBOSE)
|
if(VERBOSE)
|
||||||
System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
|
System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
|
||||||
+ new String(bytes));
|
+ new String(bytes, "UTF-8"));
|
||||||
}
|
}
|
||||||
|
|
||||||
assertEquals(numPayloads[cnt],payload.size());
|
assertEquals(numPayloads[cnt],payload.size());
|
||||||
|
@ -505,9 +505,9 @@ public class TestPayloadSpans extends LuceneTestCase {
|
||||||
|
|
||||||
if (!nopayload.contains(token)) {
|
if (!nopayload.contains(token)) {
|
||||||
if (entities.contains(token)) {
|
if (entities.contains(token)) {
|
||||||
payloadAtt.setPayload(new BytesRef((token + ":Entity:"+ pos ).getBytes()));
|
payloadAtt.setPayload(new BytesRef(token + ":Entity:"+ pos ));
|
||||||
} else {
|
} else {
|
||||||
payloadAtt.setPayload(new BytesRef((token + ":Noise:" + pos ).getBytes()));
|
payloadAtt.setPayload(new BytesRef(token + ":Noise:" + pos ));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pos += posIncrAtt.getPositionIncrement();
|
pos += posIncrAtt.getPositionIncrement();
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue