LUCENE-3892: merge in trunk changes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363400 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-07-19 15:58:54 +00:00
commit cf36fb9a58
1539 changed files with 31599 additions and 15439 deletions

View File

@ -65,7 +65,7 @@
</subant></sequential>
</target>
<target name="resolve" description="Resolves all dependencies">
<target name="resolve" depends="clean-jars" description="Resolves all dependencies">
<sequential><subant target="resolve" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
@ -116,7 +116,7 @@
</sequential>
</target>
<target name="eclipse" description="Setup Eclipse configuration" depends="resolve">
<target name="eclipse" depends="clean-jars, resolve" description="Setup Eclipse configuration">
<copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false"/>
<copy file="dev-tools/eclipse/dot.classpath" tofile=".classpath" overwrite="true"/>
<mkdir dir=".settings"/>
@ -129,7 +129,7 @@
</echo>
</target>
<target name="idea" description="Setup IntelliJ IDEA configuration" depends="resolve">
<target name="idea" depends="clean-jars, resolve" description="Setup IntelliJ IDEA configuration">
<copy todir=".">
<fileset dir="dev-tools/idea"/>
</copy>
@ -138,6 +138,7 @@
File | Project Structure | Project | Project SDK.
</echo>
</target>
<target name="clean-idea"
description="Removes all IntelliJ IDEA configuration files">
<delete dir=".idea" failonerror="true"/>
@ -148,7 +149,7 @@
</delete>
</target>
<target name="clean" description="Clean Lucene and Solr">
<target name="clean" depends="clean-jars" description="Clean Lucene and Solr">
<delete dir="dist" />
<sequential>
<subant target="clean" inheritall="false" failonerror="true">
@ -175,7 +176,7 @@
</subant>
</target>
<target name="jar-checksums" description="Recompute SHA1 checksums for all JAR files.">
<target name="jar-checksums" depends="resolve" description="Recompute SHA1 checksums for all JAR files.">
<delete>
<fileset dir="${basedir}">
<include name="**/*.jar.sha1"/>

View File

@ -97,12 +97,14 @@
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
<classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>
<classpathentry kind="lib" path="solr/lib/httpcore-4.1.4.jar"/>
<classpathentry kind="lib" path="solr/lib/httpmime-4.1.3.jar"/>
@ -115,7 +117,7 @@
<classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.4.jar"/>
<classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.4.jar"/>
<classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.5.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-continuation-8.1.2.v20120308.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-deploy-8.1.2.v20120308.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-http-8.1.2.v20120308.jar"/>
@ -170,6 +172,6 @@
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-beanutils-1.7.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-collections-3.2.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View File

@ -24,4 +24,4 @@
<option name="prefixLines" value="false" />
</LanguageOptions>
</settings>
</component>
</component>

View File

@ -7,4 +7,4 @@
<JAVADOC />
<SOURCES />
</library>
</component>
</component>

View File

@ -6,4 +6,4 @@
<JAVADOC />
<SOURCES />
</library>
</component>
</component>

View File

@ -2,9 +2,9 @@
<library name="JUnit">
<CLASSES>
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/junit-4.10.jar!/" />
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar!/" />
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
</component>

View File

@ -0,0 +1,10 @@
<component name="libraryTable">
<library name="Lucene tools library">
<CLASSES>
<root url="file://$PROJECT_DIR$/lucene/tools/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/lucene/tools/lib" recursive="false" />
</library>
</component>

View File

@ -7,4 +7,4 @@
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/extraction/lib" recursive="false" />
</library>
</component>
</component>

View File

@ -7,4 +7,4 @@
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/velocity/lib" recursive="false" />
</library>
</component>
</component>

View File

@ -16,6 +16,11 @@
<option name="USE_RELATIVE_INDENTS" value="false" />
</value>
</option>
<option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
<option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
<option name="PACKAGES_TO_USE_IMPORT_ON_DEMAND">
<value />
</option>
<ADDITIONAL_INDENT_OPTIONS fileType="groovy">
<option name="INDENT_SIZE" value="2" />
<option name="CONTINUATION_INDENT_SIZE" value="4" />

View File

@ -17,6 +17,7 @@
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Lucene tools library" level="project" />
<orderEntry type="library" name="Ant" level="project"/>
</component>
</module>

View File

@ -89,6 +89,11 @@
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency>
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.15</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>

View File

@ -155,6 +155,11 @@
<artifactId>commons-codec</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>commons-digester</groupId>
<artifactId>commons-digester</artifactId>
@ -293,7 +298,7 @@
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.3.4</version>
<version>3.3.5</version>
</dependency>
<dependency>
<groupId>org.carrot2</groupId>
@ -303,7 +308,7 @@
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>morfologik-polish</artifactId>
<version>1.5.2</version>
<version>1.5.3</version>
</dependency>
<dependency>
<groupId>org.codehaus.woodstox</groupId>
@ -383,7 +388,7 @@
<dependency>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>randomizedtesting-runner</artifactId>
<version>1.5.0</version>
<version>1.6.0</version>
</dependency>
</dependencies>
</dependencyManagement>

View File

@ -138,6 +138,10 @@
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
</dependency>
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>

View File

@ -17,8 +17,8 @@ import traceback
import os
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
import urlparse
from html.parser import HTMLParser, HTMLParseError
import urllib.parse as urlparse
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
@ -57,7 +57,7 @@ class FindHyperlinks(HTMLParser):
pass
else:
self.printFile()
print ' WARNING: anchor "%s" appears more than once' % name
print(' WARNING: anchor "%s" appears more than once' % name)
else:
self.anchors.add(name)
elif href is not None:
@ -73,8 +73,8 @@ class FindHyperlinks(HTMLParser):
def printFile(self):
if not self.printed:
print
print ' ' + self.baseURL
print()
print(' ' + self.baseURL)
self.printed = True
def parse(baseURL, html):
@ -85,8 +85,8 @@ def parse(baseURL, html):
parser.close()
except HTMLParseError:
parser.printFile()
print ' WARNING: failed to parse:'
traceback.print_exc()
print(' WARNING: failed to parse %s:' % baseURL)
traceback.print_exc(file=sys.stdout)
failures = True
return [], []
@ -104,8 +104,8 @@ def checkAll(dirName):
global failures
# Find/parse all HTML files first
print
print 'Crawl/parse...'
print()
print('Crawl/parse...')
allFiles = {}
if os.path.isfile(dirName):
@ -128,11 +128,11 @@ def checkAll(dirName):
# deprecated-list.html can fail to escape generics types
fullPath = os.path.join(root, f)
#print ' %s' % fullPath
allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f), encoding='UTF-8').read())
# ... then verify:
print
print 'Verify...'
print()
print('Verify...')
for fullPath, (links, anchors) in allFiles.items():
#print fullPath
printed = False
@ -176,16 +176,16 @@ def checkAll(dirName):
and os.path.basename(fullPath) != 'Changes.html':
if not printed:
printed = True
print
print fullPath
print ' BAD EXTERNAL LINK: %s' % link
print()
print(fullPath)
print(' BAD EXTERNAL LINK: %s' % link)
elif link.startswith('mailto:'):
if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1:
if not printed:
printed = True
print
print fullPath
print ' BROKEN MAILTO (?): %s' % link
print()
print(fullPath)
print(' BROKEN MAILTO (?): %s' % link)
elif link.startswith('javascript:'):
# ok...?
pass
@ -200,15 +200,15 @@ def checkAll(dirName):
if not os.path.exists(link):
if not printed:
printed = True
print
print fullPath
print ' BROKEN LINK: %s' % link
print()
print(fullPath)
print(' BROKEN LINK: %s' % link)
elif anchor is not None and anchor not in allFiles[link][1]:
if not printed:
printed = True
print
print fullPath
print ' BROKEN ANCHOR: %s' % origLink
print()
print(fullPath)
print(' BROKEN ANCHOR: %s' % origLink)
failures = failures or printed
@ -216,8 +216,8 @@ def checkAll(dirName):
if __name__ == '__main__':
if checkAll(sys.argv[1]):
print
print 'Broken javadocs links were found!'
print()
print('Broken javadocs links were found!')
sys.exit(1)
sys.exit(0)

View File

@ -210,16 +210,6 @@ def checkSigs(project, urlString, version, tmpDir, isSigned):
if keysURL is None:
raise RuntimeError('%s is missing KEYS' % project)
if not os.path.exists('%s/apache-rat-0.8.jar' % tmpDir):
print ' downloading Apache RAT...'
download('apache-rat-incubating-0.8-bin.tar.bz2',
'http://archive.apache.org/dist/incubator/rat/binaries/apache-rat-incubating-0.8-bin.tar.bz2',
tmpDir)
t = tarfile.open('%s/apache-rat-incubating-0.8-bin.tar.bz2' % tmpDir)
t.extract('apache-rat-0.8/apache-rat-0.8.jar', '%s/apache-rat-0.8.jar' % tmpDir)
else:
print ' apache RAT already downloaded...'
print ' get KEYS'
download('%s.KEYS' % project, keysURL, tmpDir)
@ -480,9 +470,6 @@ def verifyUnpacked(project, artifact, unpackPath, version, tmpDir):
print ' run "ant validate"'
run('%s; ant validate' % javaExe('1.7'), '%s/validate.log' % unpackPath)
print ' run "ant rat-sources"'
run('%s; ant -lib "%s/apache-rat-0.8.jar/apache-rat-0.8" rat-sources' % (javaExe('1.7'), tmpDir), '%s/rat-sources.log' % unpackPath)
if project == 'lucene':
print ' run tests w/ Java 6...'
run('%s; ant test' % javaExe('1.6'), '%s/test.log' % unpackPath)

View File

@ -7,6 +7,120 @@ http://s.apache.org/luceneversions
======================= Lucene 5.0.0 =======================
======================= Lucene 4.0.0-BETA =======================
New features
* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
iteration marks. (Robert Muir, Christian Moen)
* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
create automata from a fixed collection of UTF-8 encoded BytesRef
(Dawid Weiss, Robert Muir) (see the sketch after this list)
* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
respect field boundaries in the case of highlighting for multivalued fields.
(Martijn van Groningen)
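A minimal sketch of the LUCENE-3832 API above. It assumes the trunk automaton
package (org.apache.lucene.util.automaton); the class name, terms, and printed
statistic are illustrative, and the input collection must already be sorted:

    import java.util.Arrays;
    import java.util.List;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.BasicAutomata;

    public class StringUnionSketch {
      public static void main(String[] args) {
        // Build a deterministic, minimal automaton that accepts exactly
        // these UTF-8 terms; the collection must be in sorted order.
        List<BytesRef> terms = Arrays.asList(
            new BytesRef("bar"), new BytesRef("baz"), new BytesRef("foo"));
        Automaton a = BasicAutomata.makeStringUnion(terms);
        System.out.println("states: " + a.getNumberOfStates());
      }
    }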
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
has a different API (carries a list of tags instead of a compound tag). Upgrade
of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
* LUCENE-4178: set 'tokenized' to true on FieldType by default, so that if you
make a custom FieldType and set indexed = true, it's analyzed by the analyzer.
(Robert Muir)
* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
module and replaced by NekoHTML. HTMLParser interface was cleaned up while
changing method signatures. (Uwe Schindler, Robert Muir)
* LUCENE-2191: Rename Tokenizer.reset(Reader) to Tokenizer.setReader(Reader).
The purpose of this method was always to set a new Reader on the Tokenizer,
reusing the object. But the name was often confused with TokenStream.reset().
(Robert Muir) (see the sketch after this list)
* LUCENE-4228: Refactored CharFilter to extend java.io.FilterReader. CharFilters
filter another reader and you override correct() for offset correction.
(Robert Muir)
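The LUCENE-2191 and LUCENE-4228 entries above, as one hedged sketch. The
CharFilter constructor, the inherited 'in' field, and correct() come from the
diffs in this commit; the pass-through filter, tokenizer choice, and input
strings are illustrative:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.CharFilter;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    // LUCENE-4228: a CharFilter now filters another Reader, and correct()
    // maps offsets in the filtered text back to the original input.
    final class PassThroughCharFilter extends CharFilter {
      PassThroughCharFilter(Reader in) { super(in); }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return in.read(cbuf, off, len); // no transformation in this sketch
      }

      @Override
      protected int correct(int currentOff) {
        return currentOff; // text length is unchanged, so offsets map 1:1
      }
    }

    class TokenizerReuseSketch {
      static void demo() throws IOException {
        Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_50,
            new PassThroughCharFilter(new StringReader("first document")));
        tok.reset();
        while (tok.incrementToken()) { /* consume attributes */ }
        tok.end();
        // LUCENE-2191: give the reused Tokenizer a new Reader; this used to
        // be reset(Reader) and is now clearly distinct from reset().
        tok.setReader(new StringReader("second document"));
        tok.reset();
      }
    }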
Optimizations
* LUCENE-4171: Performance improvements to Packed64.
(Toke Eskildsen via Adrien Grand)
* LUCENE-4184: Performance improvements to the aligned packed bits impl.
(Toke Eskildsen, Adrien Grand)
* LUCENE-4235: Remove enforcing of Filter rewrite for NRQ queries.
(Uwe Schindler)
Bug Fixes
* LUCENE-4176: Fix AnalyzingQueryParser to analyze range endpoints as bytes,
so that it works correctly with Analyzers that produce binary non-UTF-8 terms
such as CollationAnalyzer. (Nattapong Sirilappanich via Robert Muir)
* LUCENE-4209: Fix FSTCompletionLookup to close its sorter, so that it won't
leave temp files behind in /tmp. Fix SortedTermFreqIteratorWrapper to not
leave temp files behind in /tmp on Windows. Fix Sort to not leave
temp files behind when /tmp is a separate volume. (Uwe Schindler, Robert Muir)
* LUCENE-4221: Fix overeager CheckIndex validation for term vector offsets.
(Robert Muir)
* LUCENE-4222: TieredMergePolicy.getFloorSegmentMB was returning the
size in bytes not MB (Chris Fuller via Mike McCandless)
* LUCENE-3505: Fix bug (Lucene 4.0alpha only) where boolean conjunctions
were sometimes scored incorrectly. Conjunctions of only termqueries where
at least one term omitted term frequencies (IndexOptions.DOCS_ONLY) would
be scored as if all terms omitted term frequencies. (Robert Muir)
* LUCENE-2686, LUCENE-3505: Fixed BooleanQuery scorers to return correct
freq(). Added support for scorer navigation API (Scorer.getChildren) to
all queries. Made Scorer.freq() abstract.
(Koji Sekiguchi, Mike McCandless, Robert Muir)
Build
* LUCENE-4094: Support overriding file.encoding on forked test JVMs
(force via -Drandomized.file.encoding=XXX). (Dawid Weiss)
* LUCENE-4189: Test output should include timestamps (start/end for each
test/suite). Added -Dtests.timestamps=[off by default]. (Dawid Weiss)
* LUCENE-4110: Report long periods of forked JVM inactivity (hung tests/suites).
Added -Dtests.heartbeat=[seconds] with the default of 60 seconds.
(Dawid Weiss)
* LUCENE-4160: Added a property to quit the tests after a given
number of failures has occurred. This is useful in combination
with -Dtests.iters=N (you can start N iterations and wait for M
failures, in particular M = 1). -Dtests.maxfailures=M. Alternatively,
specify -Dtests.failfast=true to skip all tests after the first failure.
(Dawid Weiss)
* LUCENE-4115: JAR resolution/cleanup should be done automatically for ant
clean/eclipse/resolve. (Dawid Weiss)
* LUCENE-4199, LUCENE-4202, LUCENE-4206: Add a new target "check-forbidden-apis"
that parses all generated .class files for use of APIs that use default
charset, default locale, or default timezone and fails the build if violations
are found. This ensures that Lucene / Solr is independent of local configuration
options. (Uwe Schindler, Robert Muir, Dawid Weiss)
* LUCENE-4217: Add the possibility to run tests with Atlassian Clover
loaded from IVY. A development License solely for Apache code was added in
the tools/ folder, but is not included in releases. (Uwe Schindler)
Documentation
* LUCENE-4195: Added package documentation and examples for
org.apache.lucene.codecs (Alan Woodward via Robert Muir)
======================= Lucene 4.0.0-ALPHA =======================
More information about this release, including any errata related to the
@ -20,7 +134,7 @@ Changes in backwards compatibility policy
* LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing:
- On upgrading to 3.1, if you do not fully reindex your documents,
- On upgrading to 4.0, if you do not fully reindex your documents,
Lucene will emulate the new flex API on top of the old index,
incurring some performance cost (up to ~10% slowdown, typically).
To prevent this slowdown, use oal.index.IndexUpgrader
@ -29,7 +143,7 @@ Changes in backwards compatibility policy
Mixed flex/pre-flex indexes are perfectly fine -- the two
emulation layers (flex API on pre-flex index, and pre-flex API on
flex index) will remap the access as required. So on upgrading to
3.1 you can start indexing new documents into an existing index.
4.0 you can start indexing new documents into an existing index.
To get optimal performance, use oal.index.IndexUpgrader
to upgrade your indexes to latest file format (LUCENE-3082).
@ -283,6 +397,11 @@ Changes in backwards compatibility policy
removed, as IndexReaderContext.leaves() is now the preferred way
to access sub-readers. (Uwe Schindler)
* LUCENE-4155: oal.util.ReaderUtil, TwoPhaseCommit, TwoPhaseCommitTool
classes were moved to oal.index package. oal.util.CodecUtil class was moved
to oal.codecs package. oal.util.DummyConcurrentLock was removed
(no longer used in Lucene 4.0). (Uwe Schindler)
Changes in Runtime Behavior
* LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
@ -989,6 +1108,11 @@ Optimizations
* LUCENE-4156: DirectoryTaxonomyWriter.getSize is no longer synchronized.
(Shai Erera, Sivan Yogev)
* LUCENE-4163: Improve concurrency of MMapIndexInput.clone() by using
the new WeakIdentityMap on top of a ConcurrentHashMap to manage
the cloned instances. WeakIdentityMap was extended to support
iterating over its keys. (Uwe Schindler)
Bug fixes
* LUCENE-2803: The FieldCache can miss values if an entry for a reader
@ -1062,6 +1186,13 @@ Bug fixes
* LUCENE-4114: Fix int overflow bugs in BYTES_FIXED_STRAIGHT and
BYTES_FIXED_DEREF doc values implementations (Walt Elder via Mike McCandless).
* LUCENE-4147: Fixed thread safety issues when rollback() and commit()
are called simultaneously. (Simon Willnauer, Mike McCandless)
* LUCENE-4165: Removed closing of the Reader used to read the affix file in
HunspellDictionary. Consumers are now responsible for closing all InputStreams
once the Dictionary has been instantiated. (Torsten Krah, Uwe Schindler, Chris Male)
Documentation
* LUCENE-3958: Javadocs corrections for IndexWriter.

View File

@ -145,7 +145,7 @@ enumeration APIs. Here are the major changes:
oal.util.ReaderUtil) and then step through those readers yourself,
if you can (this is how Lucene drives searches).
If you pass a SegmentReader to MultiFields.fiels it will simply
If you pass a SegmentReader to MultiFields.fields it will simply
return reader.fields(), so there is no performance hit in that
case.
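A hedged sketch of the per-segment pattern described above, using
IndexReaderContext.leaves() as introduced on trunk; the class name and the
"body" field are illustrative, and the top-level helper is spelled
MultiFields.getFields in the 4.0 API:

    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.MultiFields;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.BytesRef;

    public class TermWalkSketch {
      static void walk(Directory dir) throws Exception {
        DirectoryReader reader = DirectoryReader.open(dir);
        // Per-segment enumeration: this is how Lucene drives searches.
        for (AtomicReaderContext ctx : reader.getContext().leaves()) {
          Terms terms = ctx.reader().terms("body");
          if (terms == null) continue;
          TermsEnum te = terms.iterator(null);
          for (BytesRef t = te.next(); t != null; t = te.next()) {
            // ... per-segment work ...
          }
        }
        // Top-level view; on a single SegmentReader this simply returns
        // reader.fields(), so there is no extra cost in that case.
        Fields fields = MultiFields.getFields(reader);
        reader.close();
      }
    }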
@ -334,7 +334,7 @@ based on document IDs, albeit the per-segment orientation.
There are still valid use-cases where top-level readers ie. "atomic
views" on the index are desirable. Let say you want to iterate all terms
of a complete index for auto-completion or facetting, Lucene provides
of a complete index for auto-completion or faceting, Lucene provides
utility wrappers like SlowCompositeReaderWrapper (LUCENE-2597) emulating
an AtomicReader. Note: using "atomicity emulators" can cause serious
slowdowns due to the need to merge terms, postings, DocValues, and
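A short sketch of the emulation described above, assuming
SlowCompositeReaderWrapper lives in org.apache.lucene.index; the wrap() helper
matches the 4.0 API, everything else is illustrative:

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.SlowCompositeReaderWrapper;
    import org.apache.lucene.store.Directory;

    public class AtomicViewSketch {
      static AtomicReader atomicView(Directory dir) throws Exception {
        // Convenient index-wide view, but terms, postings and DocValues
        // are merged on the fly, so expect a slowdown on large indexes.
        return SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
      }
    }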
@ -574,7 +574,7 @@ you can now do this:
Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum
is both consumer and producer of attributes: MTQ.BoostAttribute is
added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it.
The other way round MTQ.TopTermsBooleanQueryRewrite supplys a
The other way round MTQ.TopTermsBooleanQueryRewrite supplies a
global AttributeSource to each segment's TermsEnum. The TermsEnum is consumer
and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute).
@ -594,7 +594,7 @@ you can now do this:
* LUCENE-1076: TieredMergePolicy is now the default merge policy.
It's able to merge non-contiguous segments; this may cause problems
for applications that rely on Lucene's internal document ID
assigment. If so, you should instead use LogByteSize/DocMergePolicy
assignment. If so, you should instead use LogByteSize/DocMergePolicy
during indexing. (A configuration sketch follows this section.)
* LUCENE-3722: Similarity methods and collection/term statistics now take

View File

@ -61,50 +61,50 @@
executable="${python.exe}" failonerror="true" logerror="true">
<arg value="htmlentity.py"/>
</exec>
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
</target>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/wikipedia"
nobak="on"/>
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
</target>
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target>
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std34"
nobak="on" />
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<!-- Remove the inappropriate JFlex-generated constructor -->
<macrodef name="run-jflex">
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex"
outdir="@{dir}"
nobak="on" />
<replaceregexp file="@{dir}/@{name}.java"
match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
</sequential>
</macrodef>
<target name="clean-jflex">
<delete>
<fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>

View File

@ -1,5 +1,7 @@
package org.apache.lucene.analysis.br;
import java.util.Locale;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -21,6 +23,7 @@ package org.apache.lucene.analysis.br;
* A stemmer for Brazilian Portuguese words.
*/
public class BrazilianStemmer {
private static final Locale locale = new Locale("pt", "BR");
/**
* Changed term
@ -243,7 +246,7 @@ public class BrazilianStemmer {
return null ;
}
value = value.toLowerCase() ;
value = value.toLowerCase(locale) ;
for (j=0 ; j < value.length() ; j++) {
if ((value.charAt(j) == 'á') ||
(value.charAt(j) == 'â') ||

View File

@ -17,9 +17,10 @@
package org.apache.lucene.analysis.charfilter;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.util.ArrayUtil;
import java.io.Reader;
import java.util.Arrays;
/**
@ -34,7 +35,7 @@ public abstract class BaseCharFilter extends CharFilter {
private int diffs[];
private int size = 0;
public BaseCharFilter(CharStream in) {
public BaseCharFilter(Reader in) {
super(in);
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/18/12 12:24 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/16/12 4:05 PM */
package org.apache.lucene.analysis.charfilter;
@ -20,13 +20,13 @@ package org.apache.lucene.analysis.charfilter;
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 5/18/12 12:24 PM from the specification file
* <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
* on 7/16/12 4:05 PM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@ -30647,7 +30647,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
/**
* @param source
*/
public HTMLStripCharFilter(CharStream source) {
public HTMLStripCharFilter(Reader source) {
super(source);
this.zzReader = source;
}
@ -30657,7 +30657,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
if (null != escapedTags) {

View File

@ -1,6 +1,6 @@
package org.apache.lucene.analysis.charfilter;
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -18,13 +18,13 @@ package org.apache.lucene.analysis.charfilter;
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
@ -173,7 +173,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
/**
* @param source
*/
public HTMLStripCharFilter(CharStream source) {
public HTMLStripCharFilter(Reader source) {
super(source);
this.zzReader = source;
}
@ -183,7 +183,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
if (null != escapedTags) {

View File

@ -21,8 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.CharFilter; // javadocs
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.RollingCharBuffer;
import org.apache.lucene.util.fst.CharSequenceOutputs;
@ -51,8 +50,8 @@ public class MappingCharFilter extends BaseCharFilter {
private int replacementPointer;
private int inputOff;
/** Default constructor that takes a {@link CharStream}. */
public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
/** Default constructor that takes a {@link Reader}. */
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
super(in);
buffer.reset(in);
@ -66,15 +65,10 @@ public class MappingCharFilter extends BaseCharFilter {
}
}
/** Easy-use constructor that takes a {@link Reader}. */
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
this(normMap, CharReader.get(in));
}
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(input);
buffer.reset(in);
replacement = null;
inputOff = 0;
}

View File

@ -205,7 +205,7 @@ public final class CJKBigramFilter extends TokenFilter {
/**
* refills buffers with new data from the current token.
*/
private void refill() throws IOException {
private void refill() {
// compact buffers to keep them smallish if they become large
// just a safety check, but technically we only need the last codepoint
if (bufferLen > 64) {

View File

@ -18,6 +18,7 @@
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.File;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
@ -463,10 +464,10 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
@Override
public void printStats() {
System.out.println("Value space size = "
public void printStats(PrintStream out) {
out.println("Value space size = "
+ Integer.toString(vspace.length()));
super.printStats();
super.printStats(out);
}
}

View File

@ -40,7 +40,7 @@ import javax.xml.parsers.SAXParserFactory;
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
public class PatternParser extends DefaultHandler implements PatternConsumer {
public class PatternParser extends DefaultHandler {
XMLReader parser;
@ -64,7 +64,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
static final int ELEM_HYPHEN = 4;
public PatternParser() throws HyphenationException {
public PatternParser() {
token = new StringBuilder();
parser = createParser();
parser.setContentHandler(this);
@ -74,7 +74,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
}
public PatternParser(PatternConsumer consumer) throws HyphenationException {
public PatternParser(PatternConsumer consumer) {
this();
this.consumer = consumer;
}
@ -402,25 +402,4 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
return str.toString();
} // getLocationString(SAXParseException):String
// PatternConsumer implementation for testing purposes
public void addClass(String c) {
System.out.println("class: " + c);
}
public void addException(String w, ArrayList<Object> e) {
System.out.println("exception: " + w + " : " + e.toString());
}
public void addPattern(String p, String v) {
System.out.println("pattern: " + p + " : " + v);
}
public static void main(String[] args) throws Exception {
if (args.length > 0) {
PatternParser pp = new PatternParser();
pp.setConsumer(pp);
pp.parse(args[0]);
}
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.PrintStream;
import java.util.Enumeration;
import java.util.Stack;
@ -633,11 +634,11 @@ public class TernaryTree implements Cloneable {
}
public void printStats() {
System.out.println("Number of keys = " + Integer.toString(length));
System.out.println("Node count = " + Integer.toString(freenode));
public void printStats(PrintStream out) {
out.println("Number of keys = " + Integer.toString(length));
out.println("Node count = " + Integer.toString(freenode));
// System.out.println("Array length = " + Integer.toString(eq.length));
System.out.println("Key Array length = " + Integer.toString(kv.length()));
out.println("Key Array length = " + Integer.toString(kv.length()));
/*
* for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
@ -647,8 +648,8 @@ public class TernaryTree implements Cloneable {
*/
}
public static void main(String[] args) throws Exception {
/*
public static void main(String[] args) {
TernaryTree tt = new TernaryTree();
tt.insert("Carlos", 'C');
tt.insert("Car", 'r');
@ -658,7 +659,8 @@ public class TernaryTree implements Cloneable {
System.out.println((char) tt.find("Car"));
System.out.println((char) tt.find("Carlos"));
System.out.println((char) tt.find("alto"));
tt.printStats();
tt.printStats(System.out);
}
*/
}

View File

@ -94,8 +94,8 @@ public final class KeywordTokenizer extends Tokenizer {
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void setReader(Reader input) throws IOException {
super.setReader(input);
this.done = false;
}
}

View File

@ -17,7 +17,6 @@ package org.apache.lucene.analysis.core;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
@ -122,7 +121,7 @@ public final class StopFilter extends FilteringTokenFilter {
* Returns the next input Token whose term() is not a stop word.
*/
@Override
protected boolean accept() throws IOException {
protected boolean accept() {
return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}

View File

@ -48,7 +48,7 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
* When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
*/
@Override
protected boolean accept() throws IOException {
protected boolean accept() {
return useWhiteList == stopTypes.contains(typeAttribute.type());
}
}

View File

@ -1,4 +1,7 @@
package org.apache.lucene.analysis.de;
import java.util.Locale;
// This file is encoded in UTF-8
/*
@ -37,6 +40,8 @@ public class GermanStemmer
* Amount of characters that are removed with <tt>substitute()</tt> while stemming.
*/
private int substCount = 0;
private static final Locale locale = new Locale("de", "DE");
/**
* Stems the given term to a unique <tt>discriminator</tt>.
@ -47,7 +52,7 @@ public class GermanStemmer
protected String stem( String term )
{
// Use lowercase for medium stemming.
term = term.toLowerCase();
term = term.toLowerCase(locale);
if ( !isStemmable( term ) )
return term;
// Reset the StringBuilder.

View File

@ -289,7 +289,7 @@ public class KStemmer {
entry = new DictEntry(exceptionWords[i], true);
d.put(exceptionWords[i], entry);
} else {
System.out.println("Warning: Entry [" + exceptionWords[i]
throw new RuntimeException("Warning: Entry [" + exceptionWords[i]
+ "] already in dictionary 1");
}
}
@ -299,7 +299,7 @@ public class KStemmer {
entry = new DictEntry(directConflations[i][1], false);
d.put(directConflations[i][0], entry);
} else {
System.out.println("Warning: Entry [" + directConflations[i][0]
throw new RuntimeException("Warning: Entry [" + directConflations[i][0]
+ "] already in dictionary 2");
}
}
@ -309,7 +309,7 @@ public class KStemmer {
entry = new DictEntry(countryNationality[i][1], false);
d.put(countryNationality[i][0], entry);
} else {
System.out.println("Warning: Entry [" + countryNationality[i][0]
throw new RuntimeException("Warning: Entry [" + countryNationality[i][0]
+ "] already in dictionary 3");
}
}
@ -323,7 +323,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -333,7 +333,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -343,7 +343,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -353,7 +353,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -363,7 +363,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -373,7 +373,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -383,7 +383,7 @@ public class KStemmer {
if (!d.containsKey(array[i])) {
d.put(array[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + array[i]
throw new RuntimeException("Warning: Entry [" + array[i]
+ "] already in dictionary 4");
}
}
@ -392,7 +392,7 @@ public class KStemmer {
if (!d.containsKey(KStemData8.data[i])) {
d.put(KStemData8.data[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + KStemData8.data[i]
throw new RuntimeException("Warning: Entry [" + KStemData8.data[i]
+ "] already in dictionary 4");
}
}
@ -401,7 +401,7 @@ public class KStemmer {
if (!d.containsKey(supplementDict[i])) {
d.put(supplementDict[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + supplementDict[i]
throw new RuntimeException("Warning: Entry [" + supplementDict[i]
+ "] already in dictionary 5");
}
}
@ -410,7 +410,7 @@ public class KStemmer {
if (!d.containsKey(properNouns[i])) {
d.put(properNouns[i], defaultEntry);
} else {
System.out.println("Warning: Entry [" + properNouns[i]
throw new RuntimeException("Warning: Entry [" + properNouns[i]
+ "] already in dictionary 6");
}
}

View File

@ -492,10 +492,9 @@ class PorterStemmer
return dirty;
}
/** Test program for demonstrating the Stemmer. It reads a file and
/* Test program for demonstrating the Stemmer. It reads a file and
* stems each word, writing the result to standard out.
* Usage: Stemmer file-name
*/
public static void main(String[] args) {
PorterStemmer s = new PorterStemmer();
@ -542,6 +541,6 @@ class PorterStemmer
System.out.println("error reading " + args[i]);
}
}
}
}*/
}

View File

@ -21,7 +21,6 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@ -134,6 +133,6 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new PersianCharFilter(CharReader.get(reader));
return new PersianCharFilter(reader);
}
}

View File

@ -18,9 +18,9 @@ package org.apache.lucene.analysis.fa;
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.CharFilter;
import org.apache.lucene.analysis.CharFilter;
/**
* CharFilter that replaces instances of Zero-width non-joiner with an
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.charfilter.CharFilter;
*/
public class PersianCharFilter extends CharFilter {
public PersianCharFilter(CharStream in) {
public PersianCharFilter(Reader in) {
super(in);
}
@ -45,4 +45,9 @@ public class PersianCharFilter extends CharFilter {
}
return charsRead;
}
@Override
protected int correct(int currentOff) {
return currentOff; // we don't change the length of the string
}
}

View File

@ -66,10 +66,11 @@ public class HunspellDictionary {
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file
* @param dictionary InputStream for reading the hunspell dictionary file
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
@ -80,10 +81,11 @@ public class HunspellDictionary {
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file
* @param dictionary InputStream for reading the hunspell dictionary file
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
@ -95,10 +97,11 @@ public class HunspellDictionary {
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file
* @param dictionaries InputStreams for reading the hunspell dictionary file
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
@ -110,10 +113,11 @@ public class HunspellDictionary {
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file
* @param dictionaries InputStreams for reading the hunspell dictionary file
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
@ -194,7 +198,6 @@ public class HunspellDictionary {
flagParsingStrategy = getFlagParsingStrategy(line);
}
}
reader.close();
}
/**
@ -252,7 +255,7 @@ public class HunspellDictionary {
}
String condition = ruleArgs[4];
affix.setCondition(condition, String.format(conditionPattern, condition));
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
affix.setCrossProduct(crossProduct);
List<HunspellAffix> list = affixes.get(affix.getAppend());
@ -376,7 +379,7 @@ public class HunspellDictionary {
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
if(ignoreCase) {
entry = entry.toLowerCase(Locale.ENGLISH);
entry = entry.toLowerCase(Locale.ROOT);
}
}
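A usage sketch for the stream-ownership change documented in the
HunspellDictionary javadocs above: after LUCENE-4165 the caller closes the
streams. The file names are illustrative; the constructor signature follows
the javadocs in this diff:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import org.apache.lucene.analysis.hunspell.HunspellDictionary;
    import org.apache.lucene.util.Version;

    public class LoadDictionarySketch {
      public static void main(String[] args) throws Exception {
        InputStream affix = new FileInputStream("en_US.aff");
        InputStream dic = new FileInputStream("en_US.dic");
        try {
          HunspellDictionary dictionary =
              new HunspellDictionary(affix, dic, Version.LUCENE_50);
          // ... hand the dictionary to a HunspellStemFilter ...
        } finally {
          affix.close(); // the dictionary no longer closes these for you
          dic.close();
        }
      }
    }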

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
@ -298,13 +299,12 @@ public class HunspellStemmer {
// ================================================= Entry Point ===================================================
/**
/*
* HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
*
* @param args Program arguments. Should contain location of affix file and location of dic file
* @throws IOException Can be thrown while reading from the files
* @throws ParseException Can be thrown while parsing the files
*/
public static void main(String[] args) throws IOException, ParseException {
boolean ignoreCase = false;
int offset = 0;
@ -330,7 +330,7 @@ public class HunspellStemmer {
HunspellStemmer stemmer = new HunspellStemmer(dictionary);
Scanner scanner = new Scanner(System.in);
Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
System.out.print("> ");
while (scanner.hasNextLine()) {
@ -346,12 +346,10 @@ public class HunspellStemmer {
}
}
/**
* Prints the results of the stemming of a word
*
* @param originalWord Word that has been stemmed
* @param stems Stems of the word
*/
private static void printStemResults(String originalWord, List<Stem> stems) {
StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
@ -381,13 +379,12 @@ public class HunspellStemmer {
System.out.println(builder);
}
/**
* Simple utility to check if the given String has any text
*
* @param str String to check if it has any text
* @return {@code true} if the String has text, {@code false} otherwise
*/
private static boolean hasText(String str) {
return str != null && str.length() > 0;
}
*/
}

View File

@ -19,15 +19,13 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
/**
* An always exhausted token stream.
*/
public final class EmptyTokenStream extends TokenStream {
@Override
public final boolean incrementToken() throws IOException {
public final boolean incrementToken() {
return false;
}

View File

@ -17,9 +17,6 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -43,7 +40,7 @@ public final class KeepWordFilter extends FilteringTokenFilter {
}
@Override
public boolean accept() throws IOException {
public boolean accept() {
return words.contains(termAtt.buffer(), 0, termAtt.length());
}
}

View File

@ -17,10 +17,7 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -48,7 +45,7 @@ public final class LengthFilter extends FilteringTokenFilter {
}
@Override
public boolean accept() throws IOException {
public boolean accept() {
final int len = termAtt.length();
return (len >= min && len <= max);
}

View File

@ -17,8 +17,6 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@ -46,7 +44,7 @@ public final class SingleTokenTokenStream extends TokenStream {
}
@Override
public final boolean incrementToken() throws IOException {
public final boolean incrementToken() {
if (exhausted) {
return false;
} else {
@ -58,7 +56,7 @@ public final class SingleTokenTokenStream extends TokenStream {
}
@Override
public void reset() throws IOException {
public void reset() {
exhausted = false;
}

View File

@ -23,7 +23,6 @@ import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
/**
@ -54,7 +53,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
private final String replacement;
private Reader transformedInput;
public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
public PatternReplaceCharFilter(Pattern pattern, String replacement, Reader in) {
super(in);
this.pattern = pattern;
this.replacement = replacement;
@ -64,16 +63,29 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
public int read(char[] cbuf, int off, int len) throws IOException {
// Buffer all input on the first call.
if (transformedInput == null) {
StringBuilder buffered = new StringBuilder();
char [] temp = new char [1024];
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
buffered.append(temp, 0, cnt);
}
transformedInput = new StringReader(processPattern(buffered).toString());
fill();
}
return transformedInput.read(cbuf, off, len);
}
private void fill() throws IOException {
StringBuilder buffered = new StringBuilder();
char [] temp = new char [1024];
for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
buffered.append(temp, 0, cnt);
}
transformedInput = new StringReader(processPattern(buffered).toString());
}
@Override
public int read() throws IOException {
if (transformedInput == null) {
fill();
}
return transformedInput.read();
}
@Override
protected int correct(int currentOff) {

View File

@ -84,7 +84,7 @@ public final class PatternTokenizer extends Tokenizer {
}
@Override
public boolean incrementToken() throws IOException {
public boolean incrementToken() {
if (index >= str.length()) return false;
clearAttributes();
if (group >= 0) {
@ -130,14 +130,14 @@ public final class PatternTokenizer extends Tokenizer {
}
@Override
public void end() throws IOException {
public void end() {
final int ofs = correctOffset(str.length());
offsetAtt.setOffset(ofs, ofs);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void setReader(Reader input) throws IOException {
super.setReader(input);
fillBuffer(str, input);
matcher.reset(str);
index = 0;

View File

@ -132,7 +132,7 @@ public abstract class RSLPStemmerBase {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Version.LUCENE_50,
Arrays.asList(exceptions), false);
@ -156,7 +156,7 @@ public abstract class RSLPStemmerBase {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new char[exceptions.length][];
for (int i = 0; i < exceptions.length; i++)

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.sinks;
import java.text.DateFormat;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
@ -37,10 +38,12 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
protected CharTermAttribute termAtt;
/**
* Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
* Uses {@link java.text.DateFormat#getDateInstance(int, Locale)
* DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as
* the {@link java.text.DateFormat} object.
*/
public DateRecognizerSinkFilter() {
this(DateFormat.getDateInstance());
this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT));
}
public DateRecognizerSinkFilter(DateFormat dateFormat) {

View File

@ -212,7 +212,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
}
@Override
public final boolean incrementToken() throws IOException {
public final boolean incrementToken() {
// lazy init the iterator
if (it == null) {
it = cachedStates.iterator();
@ -228,7 +228,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
}
@Override
public final void end() throws IOException {
public final void end() {
if (finalState != null) {
restoreState(finalState);
}

View File

@ -114,9 +114,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
protected void reset(final Reader reader) throws IOException {
protected void setReader(final Reader reader) throws IOException {
src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
super.reset(reader);
super.setReader(reader);
}
};
}

View File

@ -175,8 +175,8 @@ public final class ClassicTokenizer extends Tokenizer {
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
public void setReader(Reader reader) throws IOException {
super.setReader(reader);
scanner.yyreset(reader);
}
}

View File

@ -1,8 +1,8 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
package org.apache.lucene.analysis.standard;
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 9/30/11 12:10 PM from the specification file
* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
* on 08.07.12 16:59 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -383,15 +383,7 @@ public final void getText(CharTermAttribute t) {
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also java.io.Reader version of this constructor.
*
* @param in the java.io.Inputstream to read input from.
*/
ClassicTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

View File

@ -115,9 +115,9 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
protected void reset(final Reader reader) throws IOException {
protected void setReader(final Reader reader) throws IOException {
src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
super.reset(reader);
super.setReader(reader);
}
};
}

View File

@ -183,8 +183,8 @@ public final class StandardTokenizer extends Tokenizer {
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
public void setReader(Reader reader) throws IOException {
super.setReader(reader);
scanner.yyreset(reader);
}
}

View File

@ -1,8 +1,8 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
package org.apache.lucene.analysis.standard;
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -759,15 +759,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also java.io.Reader version of this constructor.
*
* @param in the java.io.Inputstream to read input from.
*/
public StandardTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.

View File

@ -104,9 +104,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
protected void reset(final Reader reader) throws IOException {
protected void setReader(final Reader reader) throws IOException {
src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
super.reset(reader);
super.setReader(reader);
}
};
}

View File

@ -162,8 +162,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
public void setReader(Reader reader) throws IOException {
super.setReader(reader);
scanner.yyreset(reader);
}
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/18/12 12:05 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
package org.apache.lucene.analysis.standard;
@ -3844,15 +3844,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also java.io.Reader version of this constructor.
*
* @param in the java.io.Inputstream to read input from.
*/
public UAX29URLEmailTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.

View File

@ -1,6 +1,6 @@
package org.apache.lucene.analysis.standard;
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.

View File

@ -92,7 +92,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder {
return analyze(analyzer, text, reuse);
}
private void addInternal(CharsRef synset[], int size) throws IOException {
private void addInternal(CharsRef synset[], int size) {
if (size <= 1) {
return; // nothing to do
}

View File

@ -650,7 +650,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
}
/**
* Empty {@link UnmodifiableCharArrayMap} optimized for speed.
* Empty {@link org.apache.lucene.analysis.util.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
* Contains checks will always return <code>false</code> or throw
* NPE if necessary.
*/

View File

@ -17,13 +17,15 @@ package org.apache.lucene.analysis.util;
* limitations under the License.
*/
import org.apache.lucene.analysis.CharStream;
import java.io.Reader;
import org.apache.lucene.analysis.CharFilter;
/**
* Abstract parent class for analysis factories that create {@link CharStream}
* Abstract parent class for analysis factories that create {@link CharFilter}
* instances.
*/
public abstract class CharFilterFactory extends AbstractAnalysisFactory {
public abstract CharStream create(CharStream input);
public abstract CharFilter create(Reader input);
}
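Factories therefore produce a CharFilter from a plain java.io.Reader rather than wrapping a CharStream. A minimal sketch of a concrete factory under the new signature (the subclass name is hypothetical; HTMLStripCharFilter stands in for any concrete filter):

    import java.io.Reader;
    import org.apache.lucene.analysis.CharFilter;
    import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
    import org.apache.lucene.analysis.util.CharFilterFactory;

    public class HtmlStripCharFilterFactoryExample extends CharFilterFactory {
      @Override
      public CharFilter create(Reader input) {
        return new HTMLStripCharFilter(input); // wraps the raw Reader directly
      }
    }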

View File

@ -162,8 +162,8 @@ public abstract class CharTokenizer extends Tokenizer {
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void setReader(Reader input) throws IOException {
super.setReader(input);
bufferIndex = 0;
offset = 0;
dataLen = 0;

View File

@ -325,13 +325,13 @@ public final class WikipediaTokenizer extends Tokenizer {
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
public void setReader(Reader reader) throws IOException {
super.setReader(reader);
scanner.yyreset(input);
}
@Override
public void end() throws IOException {
public void end() {
// set final offset
final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
this.offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -1,8 +1,8 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
package org.apache.lucene.analysis.wikipedia;
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 1/22/12 10:26 PM from the specification file
* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 08.07.12 17:00 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@ -519,15 +519,7 @@ final void reset() {
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also java.io.Reader version of this constructor.
*
* @param in the java.io.Inputstream to read input from.
*/
WikipediaTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.

View File

@ -435,7 +435,7 @@ public abstract class SnowballProgram {
bra > ket ||
ket > limit)
{
System.err.println("faulty slice operation");
throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
// FIXME: report error somehow.
/*
fprintf(stderr, "faulty slice operation:\n");
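The stemmer now fails fast on inconsistent slice bounds instead of printing to stderr and continuing. The shape of the new guard, paraphrased from the hunk above (the full condition in the source checks more bounds than are visible here):

    if (bra < 0 || bra > ket || ket > limit) {
      throw new IllegalArgumentException(
          "faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
    }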

View File

@ -24,7 +24,7 @@
For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
</p>
<p>
This module contains concrete components ({@link org.apache.lucene.analysis.charfilter.CharFilter}s,
This module contains concrete components ({@link org.apache.lucene.analysis.CharFilter}s,
{@link org.apache.lucene.analysis.Tokenizer}s, and ({@link org.apache.lucene.analysis.TokenFilter}s) for
analyzing different types of content. It also provides a number of {@link org.apache.lucene.analysis.Analyzer}s
for different languages that you can use to get started quickly.

View File

@ -96,6 +96,6 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -76,6 +76,6 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -162,7 +162,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {

View File

@ -58,6 +58,6 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -29,7 +29,6 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
@ -46,7 +45,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new HTMLStripCharFilter(CharReader.get(reader));
return new HTMLStripCharFilter(reader);
}
};
}
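With CharFilter taking over from CharStream, the CharReader.get(...) adapter disappears throughout this test: HTMLStripCharFilter now consumes any java.io.Reader directly. A minimal sketch of the new construction plus a drain loop like the ones in this test:

    Reader stripped = new HTMLStripCharFilter(new StringReader("<b>bold</b> text"));
    StringBuilder out = new StringBuilder();
    for (int ch = stripped.read(); ch != -1; ch = stripped.read()) {
      out.append((char) ch);
    }
    // out.toString() is the markup-free text, e.g. "bold text" here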
@ -60,7 +59,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "\nthis is some text\n here is a link and " +
"another link. " +
"This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
StringBuilder builder = new StringBuilder();
int ch = -1;
char [] goldArray = gold.toCharArray();
@ -79,7 +78,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
StringBuilder builder = new StringBuilder();
int ch = -1;
while ((ch = reader.read()) != -1){
@ -96,7 +95,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testMSWord14GeneratedHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
String gold = "This is a test";
StringBuilder builder = new StringBuilder();
int ch = 0;
@ -117,7 +116,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "\u0393";
Set<String> set = new HashSet<String>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -132,7 +131,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
Set<String> set = new HashSet<String>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -147,7 +146,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = " <junk/> ! @ and ";
Set<String> set = new HashSet<String>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -161,7 +160,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
Set<String> set = new HashSet<String>();
set.add("reserved");
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -346,7 +345,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -370,7 +369,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
testBuilder.append("-->foo");
String gold = "foo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -388,7 +387,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
ch = 0;
builder = new StringBuilder();
try {
@ -406,7 +405,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
ch = 0;
builder = new StringBuilder();
try {
@ -430,7 +429,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
private void processBuffer(String test, String assertMsg) throws IOException {
// System.out.println("-------------------processBuffer----------");
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -448,7 +447,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -464,7 +463,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void doTestOffsets(String in) throws Exception {
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
int ch = 0;
int off = 0; // offset in the reader
int strOff = -1; // offset in the original string
@ -491,7 +490,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
static void assertLegalOffsets(String in) throws Exception {
int length = in.length();
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
int ch = 0;
int off = 0;
while ((ch = reader.read()) != -1) {
@ -508,12 +507,12 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testRandom() throws Exception {
int numRounds = RANDOM_MULTIPLIER * 10000;
int numRounds = RANDOM_MULTIPLIER * 1000;
checkRandomData(random(), newTestAnalyzer(), numRounds);
}
public void testRandomHugeStrings() throws Exception {
int numRounds = RANDOM_MULTIPLIER * 200;
int numRounds = RANDOM_MULTIPLIER * 100;
checkRandomData(random(), newTestAnalyzer(), numRounds, 8192);
}
@ -526,7 +525,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -540,7 +539,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
gold = "one\ntwo";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
@ -557,7 +556,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testScriptQuotes() throws Exception {
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -572,7 +571,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
gold = "hello\n";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
@ -591,7 +590,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "one<script no-value-attr></script>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -612,7 +611,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ "-->\n"
+ "</style>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -631,7 +630,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "one<style type=\"text/css\"></style>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -656,7 +655,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@ -671,7 +670,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String gold = "one<BR class='whatever'>two</\nBR\n>";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
(new StringReader(test), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -688,7 +687,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testInlineTagsNoSpace() throws Exception {
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
String gold = "onetwo2e.three";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -705,7 +704,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testCDATA() throws Exception {
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
String gold = "one<one><two>three<four></four></two></one>two";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -720,7 +719,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
gold = "onetwo<![CDATA[three]]>fourfive";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
reader = new HTMLStripCharFilter(new StringReader(test));
ch = 0;
builder = new StringBuilder();
try {
@ -737,7 +736,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testUppercaseCharacterEntityVariants() throws Exception {
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
String gold = " \"-\u00A9>><<\u00AE&";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -754,7 +753,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testMSWordMalformedProcessingInstruction() throws Exception {
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -771,7 +770,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testSupplementaryCharsInTags() throws Exception {
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
Reader reader = new HTMLStripCharFilter(new StringReader(test));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@ -822,7 +821,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
}
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text.toString())));
(new StringReader(text.toString()));
while (reader.read() != -1);
}

View File

@ -29,8 +29,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -60,7 +59,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
}
public void testReaderReset() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
char[] buf = new char[10];
int len = cs.read(buf, 0, 10);
assertEquals( 1, len );
@ -76,55 +75,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
}
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
@ -149,7 +148,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
//
public void testTokenStream() throws Exception {
String testString = "h i j k ll cccc bbb aa";
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
@ -171,8 +170,8 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
// h,8,9 => i,8,9
public void testChained() throws Exception {
String testString = "aaaa ll h";
CharStream cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
CharFilter cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, new StringReader( testString ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
@ -193,7 +192,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(normMap, CharReader.get(reader));
return new MappingCharFilter(normMap, reader);
}
};
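Because CharFilter is itself a Reader, filters now compose by plain nesting, as the chained test above shows; no CharReader.get(...) adapter is needed. A composition sketch (normMap is the map built elsewhere in this test):

    CharFilter chained = new MappingCharFilter(normMap,
        new MappingCharFilter(normMap, new StringReader("aaaa ll h")));
    TokenStream ts = new MockTokenizer(chained, MockTokenizer.WHITESPACE, false);
    // ts yields "a", "llllllll", "i" for this input, per testChained above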
@ -219,7 +218,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
return new MappingCharFilter(map, reader);
}
};
@ -229,7 +228,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
//@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testRandomMaps() throws Exception {
int numIterations = atLeast(10);
int numIterations = atLeast(3);
for (int i = 0; i < numIterations; i++) {
final NormalizeCharMap map = randomMap();
Analyzer analyzer = new Analyzer() {
@ -241,7 +240,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
return new MappingCharFilter(map, reader);
}
};
int numRounds = 100;
@ -270,7 +269,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
public void testRandomMaps2() throws Exception {
final Random random = random();
final int numIterations = atLeast(10);
final int numIterations = atLeast(3);
for(int iter=0;iter<numIterations;iter++) {
if (VERBOSE) {

View File

@ -23,7 +23,6 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -216,7 +215,7 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(norm, CharReader.get(reader));
return new MappingCharFilter(norm, reader);
}
};
@ -272,13 +271,13 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
/** blast some random strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
}
public void testEmptyTerm() throws IOException {

View File

@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t));
}
};
public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
new String[] {
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
}
);
}
public void testHanOnly() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
}
}
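The flags argument restricts which scripts are bigrammed; characters outside the selected scripts pass through as unigrams, as testHanOnly demonstrates. A hedged sketch combining flags (the OR-ed combination is illustrative; KATAKANA is assumed to exist alongside the HAN constant this test exercises):

    Analyzer hanAndKatakana = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        // bigram Han and Katakana runs; everything else is emitted as-is
        return new TokenStreamComponents(t,
            new CJKBigramFilter(t, CJKBigramFilter.HAN | CJKBigramFilter.KATAKANA));
      }
    };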

View File

@ -63,7 +63,7 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
}
public void testRandomData() throws IOException {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {

View File

@ -48,7 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
assertTrue(cgf.incrementToken());
assertEquals("the_s", term.toString());
wt.reset(new StringReader(input));
wt.setReader(new StringReader(input));
cgf.reset();
assertTrue(cgf.incrementToken());
assertEquals("How", term.toString());
@ -66,7 +66,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
assertTrue(nsf.incrementToken());
assertEquals("the_s", term.toString());
wt.reset(new StringReader(input));
wt.setReader(new StringReader(input));
nsf.reset();
assertTrue(nsf.incrementToken());
assertEquals("How_the", term.toString());
@ -81,7 +81,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
* "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens
* out)
*
* @return Map<String,String>
*/
public void testCommonGramsQueryFilter() throws Exception {
Analyzer a = new Analyzer() {
@ -319,7 +318,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
}
};
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@ -331,6 +330,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
}
};
checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -24,7 +24,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -240,7 +239,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
assertTrue(tf.incrementToken());
assertEquals("Rind", termAtt.toString());
wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
tf.reset();
assertTrue(tf.incrementToken());
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
@ -327,7 +326,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(normMap, CharReader.get(reader));
return new MappingCharFilter(normMap, reader);
}
};
@ -348,7 +347,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
}
};
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
@ -361,7 +360,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws Exception {

View File

@ -163,7 +163,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
filter.reset();
String highSurEndingUpper = "BogustermBoguster\ud801";
String highSurEndingLower = "bogustermboguster\ud801";
tokenizer.reset(new StringReader(highSurEndingUpper));
tokenizer.setReader(new StringReader(highSurEndingUpper));
assertTokenStreamContents(filter, new String[] {highSurEndingLower});
assertTrue(filter.hasAttribute(CharTermAttribute.class));
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
@ -191,17 +191,17 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -1,12 +1,12 @@
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
@ -65,10 +65,10 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
}
CharStream wrappedStream = new CharStream() {
CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {
@Override
public void mark(int readAheadLimit) throws IOException {
public void mark(int readAheadLimit) {
throw new UnsupportedOperationException("mark(int)");
}
@ -78,53 +78,53 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
}
@Override
public int read() throws IOException {
public int read() {
throw new UnsupportedOperationException("read()");
}
@Override
public int read(char[] cbuf) throws IOException {
public int read(char[] cbuf) {
throw new UnsupportedOperationException("read(char[])");
}
@Override
public int read(CharBuffer target) throws IOException {
public int read(CharBuffer target) {
throw new UnsupportedOperationException("read(CharBuffer)");
}
@Override
public boolean ready() throws IOException {
public boolean ready() {
throw new UnsupportedOperationException("ready()");
}
@Override
public void reset() throws IOException {
public void reset() {
throw new UnsupportedOperationException("reset()");
}
@Override
public long skip(long n) throws IOException {
public long skip(long n) {
throw new UnsupportedOperationException("skip(long)");
}
@Override
public int correctOffset(int currentOff) {
throw new UnsupportedOperationException("correctOffset(int)");
public int correct(int currentOff) {
throw new UnsupportedOperationException("correct(int)");
}
@Override
public void close() throws IOException {
public void close() {
throw new UnsupportedOperationException("close()");
}
@Override
public int read(char[] arg0, int arg1, int arg2) throws IOException {
public int read(char[] arg0, int arg1, int arg2) {
throw new UnsupportedOperationException("read(char[], int, int)");
}
};
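Under the new API, a CharFilter must hand its wrapped Reader to super(...) and implement correct(int), which maps an offset in the filtered stream back to an offset in the underlying input (replacing the old correctOffset override point). A minimal pass-through sketch (hypothetical class, not part of the patch):

    class IdentityCharFilter extends CharFilter {
      IdentityCharFilter(Reader input) {
        super(input); // CharFilter keeps the wrapped Reader in its 'input' field
      }
      @Override
      public int correct(int currentOff) {
        return currentOff; // no characters added or removed, so offsets map 1:1
      }
      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return input.read(cbuf, off, len); // delegate reads unchanged
      }
    }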
public void testWrapping() throws Exception {
CharStream cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
try {
cs.mark(1);
fail();
@ -178,7 +178,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
cs.correctOffset(1);
fail();
} catch (Exception e) {
assertEquals("correctOffset(int)", e.getMessage());
assertEquals("correct(int)", e.getMessage());
}
try {

View File

@ -315,12 +315,12 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -74,7 +74,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < 1000; i++) {
String s = _TestUtil.randomSimpleString(random);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
right.tokenStream("foo", newStringReader(s)));
@ -94,7 +94,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int numIterations = atLeast(100);
int numIterations = atLeast(50);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomSimpleString(random, maxLength);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@ -112,7 +112,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < 1000; i++) {
String s = _TestUtil.randomHtmlishString(random, 20);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
right.tokenStream("foo", newStringReader(s)));
@ -121,7 +121,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
public void testLetterHtmlishHuge() throws Exception {
Random random = random();
int maxLength = 2048; // this is number of elements, not chars!
int maxLength = 1024; // this is number of elements, not chars!
MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
left.setMaxTokenLength(255); // match CharTokenizer's max token length
Analyzer right = new Analyzer() {
@ -131,7 +131,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int numIterations = atLeast(100);
int numIterations = atLeast(50);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomHtmlishString(random, maxLength);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@ -149,7 +149,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < 1000; i++) {
String s = _TestUtil.randomUnicodeString(random);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
right.tokenStream("foo", newStringReader(s)));
@ -158,7 +158,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
public void testLetterUnicodeHuge() throws Exception {
Random random = random();
int maxLength = 8192; // CharTokenizer.IO_BUFFER_SIZE*2
int maxLength = 4300; // CharTokenizer.IO_BUFFER_SIZE + fudge
MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
left.setMaxTokenLength(255); // match CharTokenizer's max token length
Analyzer right = new Analyzer() {
@ -168,7 +168,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int numIterations = atLeast(100);
int numIterations = atLeast(50);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomUnicodeString(random, maxLength);
assertEquals(s, left.tokenStream("foo", newStringReader(s)),

View File

@ -127,6 +127,6 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new KeywordAnalyzer(), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new KeywordAnalyzer(), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -44,8 +44,7 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.EmptyTokenizer;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
@ -101,7 +100,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
static List<Constructor<? extends Tokenizer>> tokenizers;
static List<Constructor<? extends TokenFilter>> tokenfilters;
static List<Constructor<? extends CharStream>> charfilters;
static List<Constructor<? extends CharFilter>> charfilters;
// TODO: fix those and remove
private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
@ -170,7 +169,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
charfilters = new ArrayList<Constructor<? extends CharStream>>();
charfilters = new ArrayList<Constructor<? extends CharFilter>>();
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
if (
@ -179,7 +178,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|| brokenComponents.contains(c)
|| c.isAnnotationPresent(Deprecated.class)
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
) {
continue;
}
@ -197,10 +196,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenfilters.add(castConstructor(TokenFilter.class, ctor));
} else if (CharStream.class.isAssignableFrom(c)) {
} else if (CharFilter.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
charfilters.add(castConstructor(CharStream.class, ctor));
charfilters.add(castConstructor(CharFilter.class, ctor));
} else {
fail("Cannot get here");
}
@ -224,7 +223,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
@AfterClass
public static void afterClass() throws Exception {
public static void afterClass() {
tokenizers = null;
tokenfilters = null;
charfilters = null;
@ -524,7 +523,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedCharFilterArgs.addAll(argProducers.keySet());
allowedCharFilterArgs.add(Reader.class);
allowedCharFilterArgs.add(CharStream.class);
}
@SuppressWarnings("unchecked")
@ -560,8 +558,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else if (paramType == CharStream.class) {
args[i] = CharReader.get(reader);
} else {
args[i] = newRandomArg(random, paramType);
}
@ -701,7 +697,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
int numFilters = random.nextInt(3);
for (int i = 0; i < numFilters; i++) {
while (true) {
final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
reader = createComponent(ctor, args, descr);
if (reader != null) {
@ -760,24 +756,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
}
// wants charfilter to be a filterreader...
// do *NOT*, do *NOT* refactor me to be a charfilter: LUCENE-3990
static class CheckThatYouDidntReadAnythingReaderWrapper extends CharStream {
static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
boolean readSomething;
CharStream in;
CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
this.in = CharReader.get(in);
super(in);
}
@Override
public int correctOffset(int currentOff) {
return in.correctOffset(currentOff);
}
@Override
public void close() throws IOException {
in.close();
public int correct(int currentOff) {
return currentOff; // we don't change any offsets
}
@Override
@ -798,32 +786,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return in.read(target);
}
@Override
public void mark(int readAheadLimit) throws IOException {
in.mark(readAheadLimit);
}
@Override
public boolean markSupported() {
return in.markSupported();
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
return in.read(cbuf);
}
@Override
public boolean ready() throws IOException {
return in.ready();
}
@Override
public void reset() throws IOException {
in.reset();
}
@Override
public long skip(long n) throws IOException {
readSomething = true;

View File

@ -233,13 +233,13 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
}
// Adds random graph after:
@ -254,6 +254,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, tokenStream);
}
},
200*RANDOM_MULTIPLIER, 8192);
100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -252,6 +252,6 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -455,12 +455,12 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -52,6 +52,6 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -51,6 +51,6 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -61,6 +61,6 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,9 +49,22 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "äpfel", "apfel");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** Test against a vocabulary from the reference impl */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
@@ -60,7 +76,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -64,7 +64,7 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -23,9 +23,13 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -58,9 +62,22 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "", new String[] { "" });
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -66,6 +66,6 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}


@@ -55,6 +55,6 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}


@@ -54,7 +54,7 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -42,7 +42,7 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
/**


@@ -63,7 +63,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {


@@ -51,6 +51,6 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}


@@ -48,7 +48,7 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {

Some files were not shown because too many files have changed in this diff.