mirror of https://github.com/apache/lucene.git
LUCENE-3892: merge in trunk changes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363400 13f79535-47bb-0310-9956-ffa450edef68
commit cf36fb9a58

@@ -65,7 +65,7 @@
     </subant></sequential>
   </target>

-  <target name="resolve" description="Resolves all dependencies">
+  <target name="resolve" depends="clean-jars" description="Resolves all dependencies">
     <sequential><subant target="resolve" inheritall="false" failonerror="true">
       <fileset dir="lucene" includes="build.xml" />
       <fileset dir="solr" includes="build.xml" />
@@ -116,7 +116,7 @@
     </sequential>
   </target>

-  <target name="eclipse" description="Setup Eclipse configuration" depends="resolve">
+  <target name="eclipse" depends="clean-jars, resolve" description="Setup Eclipse configuration">
     <copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false"/>
     <copy file="dev-tools/eclipse/dot.classpath" tofile=".classpath" overwrite="true"/>
     <mkdir dir=".settings"/>
@@ -129,7 +129,7 @@
     </echo>
   </target>

-  <target name="idea" description="Setup IntelliJ IDEA configuration" depends="resolve">
+  <target name="idea" depends="clean-jars, resolve" description="Setup IntelliJ IDEA configuration">
     <copy todir=".">
       <fileset dir="dev-tools/idea"/>
     </copy>
@@ -138,6 +138,7 @@
     File | Project Structure | Project | Project SDK.
     </echo>
   </target>

   <target name="clean-idea"
           description="Removes all IntelliJ IDEA configuration files">
     <delete dir=".idea" failonerror="true"/>
@@ -148,7 +149,7 @@
     </delete>
   </target>

-  <target name="clean" description="Clean Lucene and Solr">
+  <target name="clean" depends="clean-jars" description="Clean Lucene and Solr">
     <delete dir="dist" />
     <sequential>
       <subant target="clean" inheritall="false" failonerror="true">
@@ -175,7 +176,7 @@
     </subant>
   </target>

-  <target name="jar-checksums" description="Recompute SHA1 checksums for all JAR files.">
+  <target name="jar-checksums" depends="resolve" description="Recompute SHA1 checksums for all JAR files.">
     <delete>
       <fileset dir="${basedir}">
         <include name="**/*.jar.sha1"/>

@@ -97,12 +97,14 @@
   <classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
   <classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
   <classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
   <classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
   <classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
+  <classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
   <classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
+  <classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpcore-4.1.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpmime-4.1.3.jar"/>
@@ -115,7 +117,7 @@
   <classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
-  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
+  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.5.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-continuation-8.1.2.v20120308.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-deploy-8.1.2.v20120308.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-http-8.1.2.v20120308.jar"/>
@@ -170,6 +172,6 @@
   <classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-beanutils-1.7.0.jar"/>
   <classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-collections-3.2.1.jar"/>
   <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-  <classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar"/>
+  <classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar"/>
   <classpathentry kind="output" path="bin"/>
 </classpath>

@@ -2,7 +2,7 @@
   <library name="JUnit">
     <CLASSES>
       <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/junit-4.10.jar!/" />
-      <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar!/" />
+      <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar!/" />
     </CLASSES>
     <JAVADOC />
     <SOURCES />

@@ -0,0 +1,10 @@
+<component name="libraryTable">
+  <library name="Lucene tools library">
+    <CLASSES>
+      <root url="file://$PROJECT_DIR$/lucene/tools/lib" />
+    </CLASSES>
+    <JAVADOC />
+    <SOURCES />
+    <jarDirectory url="file://$PROJECT_DIR$/lucene/tools/lib" recursive="false" />
+  </library>
+</component>

@@ -16,6 +16,11 @@
         <option name="USE_RELATIVE_INDENTS" value="false" />
       </value>
     </option>
+    <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
+    <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
+    <option name="PACKAGES_TO_USE_IMPORT_ON_DEMAND">
+      <value />
+    </option>
     <ADDITIONAL_INDENT_OPTIONS fileType="groovy">
       <option name="INDENT_SIZE" value="2" />
       <option name="CONTINUATION_INDENT_SIZE" value="4" />

@@ -17,6 +17,7 @@
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+    <orderEntry type="library" name="Lucene tools library" level="project" />
     <orderEntry type="library" name="Ant" level="project"/>
   </component>
 </module>

@@ -89,6 +89,11 @@
       <groupId>com.ibm.icu</groupId>
       <artifactId>icu4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>net.sourceforge.nekohtml</groupId>
+      <artifactId>nekohtml</artifactId>
+      <version>1.9.15</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>

@@ -155,6 +155,11 @@
       <artifactId>commons-codec</artifactId>
       <version>1.6</version>
     </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.2</version>
+    </dependency>
     <dependency>
       <groupId>commons-digester</groupId>
       <artifactId>commons-digester</artifactId>
@@ -293,7 +298,7 @@
     <dependency>
       <groupId>org.apache.zookeeper</groupId>
       <artifactId>zookeeper</artifactId>
-      <version>3.3.4</version>
+      <version>3.3.5</version>
     </dependency>
     <dependency>
       <groupId>org.carrot2</groupId>
@@ -303,7 +308,7 @@
     <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>morfologik-polish</artifactId>
-      <version>1.5.2</version>
+      <version>1.5.3</version>
     </dependency>
     <dependency>
       <groupId>org.codehaus.woodstox</groupId>
@@ -383,7 +388,7 @@
     <dependency>
       <groupId>com.carrotsearch.randomizedtesting</groupId>
       <artifactId>randomizedtesting-runner</artifactId>
-      <version>1.5.0</version>
+      <version>1.6.0</version>
     </dependency>
   </dependencies>
 </dependencyManagement>

@@ -138,6 +138,10 @@
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+    </dependency>
     <dependency>
       <groupId>commons-fileupload</groupId>
       <artifactId>commons-fileupload</artifactId>

@@ -17,8 +17,8 @@ import traceback
 import os
 import sys
 import re
-from HTMLParser import HTMLParser, HTMLParseError
-import urlparse
+from html.parser import HTMLParser, HTMLParseError
+import urllib.parse as urlparse

 reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
 reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
@@ -57,7 +57,7 @@ class FindHyperlinks(HTMLParser):
           pass
         else:
           self.printFile()
-          print ' WARNING: anchor "%s" appears more than once' % name
+          print(' WARNING: anchor "%s" appears more than once' % name)
       else:
         self.anchors.add(name)
     elif href is not None:
@@ -73,8 +73,8 @@ class FindHyperlinks(HTMLParser):

   def printFile(self):
     if not self.printed:
-      print
-      print ' ' + self.baseURL
+      print()
+      print(' ' + self.baseURL)
       self.printed = True

 def parse(baseURL, html):
@@ -85,8 +85,8 @@ def parse(baseURL, html):
     parser.close()
   except HTMLParseError:
     parser.printFile()
-    print ' WARNING: failed to parse:'
-    traceback.print_exc()
+    print(' WARNING: failed to parse %s:' % baseURL)
+    traceback.print_exc(file=sys.stdout)
     failures = True
     return [], []

@@ -104,8 +104,8 @@ def checkAll(dirName):
   global failures

   # Find/parse all HTML files first
-  print
-  print 'Crawl/parse...'
+  print()
+  print('Crawl/parse...')
   allFiles = {}

   if os.path.isfile(dirName):
@@ -128,11 +128,11 @@ def checkAll(dirName):
         # deprecated-list.html can fail to escape generics types
         fullPath = os.path.join(root, f)
         #print ' %s' % fullPath
-        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
+        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f), encoding='UTF-8').read())

   # ... then verify:
-  print
-  print 'Verify...'
+  print()
+  print('Verify...')
   for fullPath, (links, anchors) in allFiles.items():
     #print fullPath
     printed = False
@@ -176,16 +176,16 @@ def checkAll(dirName):
            and os.path.basename(fullPath) != 'Changes.html':
         if not printed:
           printed = True
-          print
-          print fullPath
-        print ' BAD EXTERNAL LINK: %s' % link
+          print()
+          print(fullPath)
+        print(' BAD EXTERNAL LINK: %s' % link)
       elif link.startswith('mailto:'):
         if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1:
           if not printed:
             printed = True
-            print
-            print fullPath
-          print ' BROKEN MAILTO (?): %s' % link
+            print()
+            print(fullPath)
+          print(' BROKEN MAILTO (?): %s' % link)
       elif link.startswith('javascript:'):
         # ok...?
         pass
@@ -200,15 +200,15 @@ def checkAll(dirName):
       if not os.path.exists(link):
         if not printed:
           printed = True
-          print
-          print fullPath
-        print ' BROKEN LINK: %s' % link
+          print()
+          print(fullPath)
+        print(' BROKEN LINK: %s' % link)
       elif anchor is not None and anchor not in allFiles[link][1]:
         if not printed:
           printed = True
-          print
-          print fullPath
-        print ' BROKEN ANCHOR: %s' % origLink
+          print()
+          print(fullPath)
+        print(' BROKEN ANCHOR: %s' % origLink)

   failures = failures or printed

@@ -216,8 +216,8 @@ def checkAll(dirName):

 if __name__ == '__main__':
   if checkAll(sys.argv[1]):
-    print
-    print 'Broken javadocs links were found!'
+    print()
+    print('Broken javadocs links were found!')
     sys.exit(1)
   sys.exit(0)

@@ -210,16 +210,6 @@ def checkSigs(project, urlString, version, tmpDir, isSigned):
   if keysURL is None:
     raise RuntimeError('%s is missing KEYS' % project)

-  if not os.path.exists('%s/apache-rat-0.8.jar' % tmpDir):
-    print ' downloading Apache RAT...'
-    download('apache-rat-incubating-0.8-bin.tar.bz2',
-             'http://archive.apache.org/dist/incubator/rat/binaries/apache-rat-incubating-0.8-bin.tar.bz2',
-             tmpDir)
-    t = tarfile.open('%s/apache-rat-incubating-0.8-bin.tar.bz2' % tmpDir)
-    t.extract('apache-rat-0.8/apache-rat-0.8.jar', '%s/apache-rat-0.8.jar' % tmpDir)
-  else:
-    print ' apache RAT already downloaded...'
-
   print ' get KEYS'
   download('%s.KEYS' % project, keysURL, tmpDir)

@@ -480,9 +470,6 @@ def verifyUnpacked(project, artifact, unpackPath, version, tmpDir):
   print ' run "ant validate"'
   run('%s; ant validate' % javaExe('1.7'), '%s/validate.log' % unpackPath)

-  print ' run "ant rat-sources"'
-  run('%s; ant -lib "%s/apache-rat-0.8.jar/apache-rat-0.8" rat-sources' % (javaExe('1.7'), tmpDir), '%s/rat-sources.log' % unpackPath)
-
   if project == 'lucene':
     print ' run tests w/ Java 6...'
     run('%s; ant test' % javaExe('1.6'), '%s/test.log' % unpackPath)

@@ -7,6 +7,120 @@ http://s.apache.org/luceneversions
 ======================= Lucene 5.0.0 =======================

+======================= Lucene 4.0.0-BETA =======================
+
+New features
+
+* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
+  iteration marks. (Robert Muir, Christian Moen)
+
+* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
+  create automata from a fixed collection of UTF-8 encoded BytesRef
+  (Dawid Weiss, Robert Muir)
+
+* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
+  respect field boundaries in the case of highlighting for multivalued fields.
+  (Martijn van Groningen)
+
+API Changes
+
+* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
+  The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
+  has a different API (carries a list of tags instead of a compound tag). Upgrade
+  of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
+
+* LUCENE-4178: set 'tokenized' to true on FieldType by default, so that if you
+  make a custom FieldType and set indexed = true, its analyzed by the analyzer.
+  (Robert Muir)
+
+* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
+  module and replaced by NekoHTML. HTMLParser interface was cleaned up while
+  changing method signatures. (Uwe Schindler, Robert Muir)
+
+* LUCENE-2191: Rename Tokenizer.reset(Reader) to Tokenizer.setReader(Reader).
+  The purpose of this method was always to set a new Reader on the Tokenizer,
+  reusing the object. But the name was often confused with TokenStream.reset().
+  (Robert Muir)
+
+* LUCENE-4228: Refactored CharFilter to extend java.io.FilterReader. CharFilters
+  filter another reader and you override correct() for offset correction.
+  (Robert Muir)
+
+Optimizations
+
+* LUCENE-4171: Performance improvements to Packed64.
+  (Toke Eskildsen via Adrien Grand)
+
+* LUCENE-4184: Performance improvements to the aligned packed bits impl.
+  (Toke Eskildsen, Adrien Grand)
+
+* LUCENE-4235: Remove enforcing of Filter rewrite for NRQ queries.
+  (Uwe Schindler)
+
+Bug Fixes
+
+* LUCENE-4176: Fix AnalyzingQueryParser to analyze range endpoints as bytes,
+  so that it works correctly with Analyzers that produce binary non-UTF-8 terms
+  such as CollationAnalyzer. (Nattapong Sirilappanich via Robert Muir)
+
+* LUCENE-4209: Fix FSTCompletionLookup to close its sorter, so that it won't
+  leave temp files behind in /tmp. Fix SortedTermFreqIteratorWrapper to not
+  leave temp files behind in /tmp on Windows. Fix Sort to not leave
+  temp files behind when /tmp is a separate volume. (Uwe Schindler, Robert Muir)
+
+* LUCENE-4221: Fix overeager CheckIndex validation for term vector offsets.
+  (Robert Muir)
+
+* LUCENE-4222: TieredMergePolicy.getFloorSegmentMB was returning the
+  size in bytes not MB (Chris Fuller via Mike McCandless)
+
+* LUCENE-3505: Fix bug (Lucene 4.0alpha only) where boolean conjunctions
+  were sometimes scored incorrectly. Conjunctions of only termqueries where
+  at least one term omitted term frequencies (IndexOptions.DOCS_ONLY) would
+  be scored as if all terms omitted term frequencies. (Robert Muir)
+
+* LUCENE-2686, LUCENE-3505: Fixed BooleanQuery scorers to return correct
+  freq(). Added support for scorer navigation API (Scorer.getChildren) to
+  all queries. Made Scorer.freq() abstract.
+  (Koji Sekiguchi, Mike McCandless, Robert Muir)
+
+Build
+
+* LUCENE-4094: Support overriding file.encoding on forked test JVMs
+  (force via -Drandomized.file.encoding=XXX). (Dawid Weiss)
+
+* LUCENE-4189: Test output should include timestamps (start/end for each
+  test/ suite). Added -Dtests.timestamps=[off by default]. (Dawid Weiss)
+
+* LUCENE-4110: Report long periods of forked jvm inactivity (hung tests/ suites).
+  Added -Dtests.heartbeat=[seconds] with the default of 60 seconds.
+  (Dawid Weiss)
+
+* LUCENE-4160: Added a property to quit the tests after a given
+  number of failures has occurred. This is useful in combination
+  with -Dtests.iters=N (you can start N iterations and wait for M
+  failures, in particular M = 1). -Dtests.maxfailures=M. Alternatively,
+  specify -Dtests.failfast=true to skip all tests after the first failure.
+  (Dawid Weiss)
+
+* LUCENE-4115: JAR resolution/ cleanup should be done automatically for ant
+  clean/ eclipse/ resolve (Dawid Weiss)
+
+* LUCENE-4199, LUCENE-4202, LUCENE-4206: Add a new target "check-forbidden-apis"
+  that parses all generated .class files for use of APIs that use default
+  charset, default locale, or default timezone and fail build if violations
+  found. This ensures, that Lucene / Solr is independent on local configuration
+  options. (Uwe Schindler, Robert Muir, Dawid Weiss)
+
+* LUCENE-4217: Add the possibility to run tests with Atlassian Clover
+  loaded from IVY. A development License solely for Apache code was added in
+  the tools/ folder, but is not included in releases. (Uwe Schindler)
+
+Documentation
+
+* LUCENE-4195: Added package documentation and examples for
+  org.apache.lucene.codecs (Alan Woodward via Robert Muir)
+
 ======================= Lucene 4.0.0-ALPHA =======================

 More information about this release, including any errata related to the
@@ -20,7 +134,7 @@ Changes in backwards compatibility policy

 * LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing:

-  - On upgrading to 3.1, if you do not fully reindex your documents,
+  - On upgrading to 4.0, if you do not fully reindex your documents,
     Lucene will emulate the new flex API on top of the old index,
     incurring some performance cost (up to ~10% slowdown, typically).
     To prevent this slowdown, use oal.index.IndexUpgrader
@@ -29,7 +143,7 @@ Changes in backwards compatibility policy
     Mixed flex/pre-flex indexes are perfectly fine -- the two
     emulation layers (flex API on pre-flex index, and pre-flex API on
     flex index) will remap the access as required. So on upgrading to
-    3.1 you can start indexing new documents into an existing index.
+    4.0 you can start indexing new documents into an existing index.
     To get optimal performance, use oal.index.IndexUpgrader
     to upgrade your indexes to latest file format (LUCENE-3082).

@@ -283,6 +397,11 @@ Changes in backwards compatibility policy
   removed, as IndexReaderContext.leaves() is now the preferred way
   to access sub-readers. (Uwe Schindler)

+* LUCENE-4155: oal.util.ReaderUtil, TwoPhaseCommit, TwoPhaseCommitTool
+  classes were moved to oal.index package. oal.util.CodecUtil class was moved
+  to oal.codecs package. oal.util.DummyConcurrentLock was removed
+  (no longer used in Lucene 4.0). (Uwe Schindler)
+
 Changes in Runtime Behavior

 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
@@ -989,6 +1108,11 @@ Optimizations
 * LUCENE-4156: DirectoryTaxonomyWriter.getSize is no longer synchronized.
   (Shai Erera, Sivan Yogev)

+* LUCENE-4163: Improve concurrency of MMapIndexInput.clone() by using
+  the new WeakIdentityMap on top of a ConcurrentHashMap to manage
+  the cloned instances. WeakIdentityMap was extended to support
+  iterating over its keys. (Uwe Schindler)
+
 Bug fixes

 * LUCENE-2803: The FieldCache can miss values if an entry for a reader
@@ -1062,6 +1186,13 @@ Bug fixes
 * LUCENE-4114: Fix int overflow bugs in BYTES_FIXED_STRAIGHT and
   BYTES_FIXED_DEREF doc values implementations (Walt Elder via Mike McCandless).

+* LUCENE-4147: Fixed thread safety issues when rollback() and commit()
+  are called simultaneously. (Simon Willnauer, Mike McCandless)
+
+* LUCENE-4165: Removed closing of the Reader used to read the affix file in
+  HunspellDictionary. Consumers are now responsible for closing all InputStreams
+  once the Dictionary has been instantiated. (Torsten Krah, Uwe Schindler, Chris Male)
+
 Documentation

 * LUCENE-3958: Javadocs corrections for IndexWriter.

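The LUCENE-4228 entry above drives most of the CharStream-to-Reader churn in the Java hunks that follow. As a minimal sketch of the refactored contract (the subclass here is hypothetical; only CharFilter, its java.io.FilterReader parent, the inherited protected `in` field, and the abstract correct() hook come from the entry above):

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharFilter;

// Hypothetical no-op filter: wraps any Reader, changes nothing.
public final class IdentityCharFilter extends CharFilter {
  public IdentityCharFilter(Reader in) {
    super(in); // a CharFilter is now just a FilterReader over another Reader
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return in.read(cbuf, off, len); // delegate to the wrapped Reader
  }

  @Override
  protected int correct(int currentOff) {
    return currentOff; // no length changes, so offsets map 1:1
  }
}

Because the filter is itself a Reader, wrapping is plain composition: new IdentityCharFilter(new java.io.StringReader("abc")) can be handed to anything that reads characters.
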
@@ -145,7 +145,7 @@ enumeration APIs. Here are the major changes:
     oal.util.ReaderUtil) and then step through those readers yourself,
     if you can (this is how Lucene drives searches).

-    If you pass a SegmentReader to MultiFields.fiels it will simply
+    If you pass a SegmentReader to MultiFields.fields it will simply
     return reader.fields(), so there is no performance hit in that
     case.

@@ -334,7 +334,7 @@ based on document IDs, albeit the per-segment orientation.

 There are still valid use-cases where top-level readers ie. "atomic
 views" on the index are desirable. Let say you want to iterate all terms
-of a complete index for auto-completion or facetting, Lucene provides
+of a complete index for auto-completion or faceting, Lucene provides
 utility wrappers like SlowCompositeReaderWrapper (LUCENE-2597) emulating
 an AtomicReader. Note: using "atomicity emulators" can cause serious
 slowdowns due to the need to merge terms, postings, DocValues, and
@@ -574,7 +574,7 @@ you can now do this:
 Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum
 is both consumer and producer of attributes: MTQ.BoostAttribute is
 added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it.
-The other way round MTQ.TopTermsBooleanQueryRewrite supplys a
+The other way round MTQ.TopTermsBooleanQueryRewrite supplies a
 global AttributeSource to each segments TermsEnum. The TermsEnum is consumer
 and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute).

@@ -594,7 +594,7 @@ you can now do this:
 * LUCENE-1076: TieredMergePolicy is now the default merge policy.
   It's able to merge non-contiguous segments; this may cause problems
   for applications that rely on Lucene's internal document ID
-  assigment. If so, you should instead use LogByteSize/DocMergePolicy
+  assignment. If so, you should instead use LogByteSize/DocMergePolicy
   during indexing.

 * LUCENE-3722: Similarity methods and collection/term statistics now take

@@ -61,50 +61,50 @@
        executable="${python.exe}" failonerror="true" logerror="true">
       <arg value="htmlentity.py"/>
     </exec>
     <fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
   </target>

   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/wikipedia"
-           nobak="on"/>
+    <run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
   </target>

   <target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>

-    <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std31"
-           nobak="on" />
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
   </target>

   <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std31"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std34"
-           nobak="on" />
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
   </target>

+  <!-- Remove the inappropriate JFlex-generated constructor -->
+  <macrodef name="run-jflex">
+    <attribute name="dir"/>
+    <attribute name="name"/>
+    <sequential>
+      <jflex file="@{dir}/@{name}.jflex"
+             outdir="@{dir}"
+             nobak="on" />
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                     replace="" flags="sg"/>
+    </sequential>
+  </macrodef>
+
   <target name="clean-jflex">
     <delete>
       <fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">
         <containsregexp expression="generated.*by.*JFlex"/>
       </fileset>
       <fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
         <containsregexp expression="generated.*by.*JFlex"/>
       </fileset>

@@ -1,5 +1,7 @@
 package org.apache.lucene.analysis.br;

+import java.util.Locale;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
@@ -21,6 +23,7 @@ package org.apache.lucene.analysis.br;
  * A stemmer for Brazilian Portuguese words.
  */
 public class BrazilianStemmer {
+  private static final Locale locale = new Locale("pt", "BR");

   /**
    * Changed term
@@ -243,7 +246,7 @@ public class BrazilianStemmer {
       return null ;
     }

-    value = value.toLowerCase() ;
+    value = value.toLowerCase(locale) ;
     for (j=0 ; j < value.length() ; j++) {
       if ((value.charAt(j) == 'á') ||
           (value.charAt(j) == 'â') ||

@@ -17,9 +17,10 @@

 package org.apache.lucene.analysis.charfilter;

-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.util.ArrayUtil;

+import java.io.Reader;
 import java.util.Arrays;

 /**
@@ -34,7 +35,7 @@ public abstract class BaseCharFilter extends CharFilter {
   private int diffs[];
   private int size = 0;

-  public BaseCharFilter(CharStream in) {
+  public BaseCharFilter(Reader in) {
     super(in);
   }

@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/18/12 12:24 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/16/12 4:05 PM */

 package org.apache.lucene.analysis.charfilter;

@@ -20,13 +20,13 @@ package org.apache.lucene.analysis.charfilter;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.util.Version;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 5/18/12 12:24 PM from the specification file
- * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
+ * on 7/16/12 4:05 PM from the specification file
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
  */
 public final class HTMLStripCharFilter extends BaseCharFilter {

@@ -30647,7 +30647,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
   /**
    * @param source
    */
-  public HTMLStripCharFilter(CharStream source) {
+  public HTMLStripCharFilter(Reader source) {
     super(source);
     this.zzReader = source;
   }
@@ -30657,7 +30657,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    * @param escapedTags Tags in this set (both start and end tags)
    * will not be filtered out.
    */
-  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
     super(source);
     this.zzReader = source;
     if (null != escapedTags) {

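Since the generated scanner now accepts any java.io.Reader, using it no longer requires a CharStream wrapper. A hedged usage sketch (the output comment is an assumption, not taken from a test in this commit):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class StripDemo {
  public static void main(String[] args) throws Exception {
    // Pass the Reader directly; no CharReader.get(...) step anymore.
    Reader stripped = new HTMLStripCharFilter(new StringReader("<b>bold</b> text"));
    StringBuilder sb = new StringBuilder();
    int ch;
    while ((ch = stripped.read()) != -1) {
      sb.append((char) ch);
    }
    stripped.close();
    System.out.println(sb); // assumed output: the input text with markup removed
  }
}
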
@@ -1,6 +1,6 @@
 package org.apache.lucene.analysis.charfilter;

-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -18,13 +18,13 @@ package org.apache.lucene.analysis.charfilter;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.util.Version;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -173,7 +173,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
 /**
  * @param source
  */
-public HTMLStripCharFilter(CharStream source) {
+public HTMLStripCharFilter(Reader source) {
   super(source);
   this.zzReader = source;
 }
@@ -183,7 +183,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
  * @param escapedTags Tags in this set (both start and end tags)
  * will not be filtered out.
  */
-public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
   super(source);
   this.zzReader = source;
   if (null != escapedTags) {

@@ -21,8 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Map;

-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter; // javadocs
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.RollingCharBuffer;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
@@ -51,8 +50,8 @@ public class MappingCharFilter extends BaseCharFilter {
   private int replacementPointer;
   private int inputOff;

-  /** Default constructor that takes a {@link CharStream}. */
-  public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
+  /** Default constructor that takes a {@link Reader}. */
+  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
     super(in);
     buffer.reset(in);

@@ -66,15 +65,10 @@ public class MappingCharFilter extends BaseCharFilter {
     }
   }

-  /** Easy-use constructor that takes a {@link Reader}. */
-  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
-    this(normMap, CharReader.get(in));
-  }
-
   @Override
   public void reset() throws IOException {
     super.reset();
-    buffer.reset(input);
+    buffer.reset(in);
     replacement = null;
     inputOff = 0;
   }

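With the easy-use constructor folded away, the Reader-taking constructor is the only entry point. A usage sketch; note that the NormalizeCharMap construction below is an assumption about this snapshot's map-building API and may not match it exactly:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class MappingDemo {
  public static void main(String[] args) throws Exception {
    // Assumed map-building calls; the exact NormalizeCharMap API
    // differs between trunk revisions of this era.
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("ss", "ß");
    // The CharStream overload is gone; a plain Reader is enough.
    Reader filtered = new MappingCharFilter(map, new StringReader("strasse"));
    int ch;
    while ((ch = filtered.read()) != -1) {
      System.out.print((char) ch);
    }
    filtered.close();
  }
}
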
@@ -205,7 +205,7 @@ public final class CJKBigramFilter extends TokenFilter {
   /**
    * refills buffers with new data from the current token.
    */
-  private void refill() throws IOException {
+  private void refill() {
     // compact buffers to keep them smallish if they become large
     // just a safety check, but technically we only need the last codepoint
     if (bufferLen > 64) {

@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.compound.hyphenation;

 import java.io.File;
+import java.io.PrintStream;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -463,10 +464,10 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
   }

   @Override
-  public void printStats() {
-    System.out.println("Value space size = "
+  public void printStats(PrintStream out) {
+    out.println("Value space size = "
         + Integer.toString(vspace.length()));
-    super.printStats();
+    super.printStats(out);

   }
 }

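The printStats change is a small inversion of control: the caller picks the output stream instead of the method hard-coding System.out, which also keeps the new forbidden-APIs checks quiet. The pattern in isolation (class and field hypothetical):

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;

// Hypothetical stand-in for the HyphenationTree/TernaryTree change:
// stats go wherever the caller points them.
class TreeStats {
  private final int keyCount = 42;

  void printStats(PrintStream out) {
    out.println("Number of keys = " + keyCount);
  }
}

class TreeStatsDemo {
  public static void main(String[] args) {
    TreeStats stats = new TreeStats();
    stats.printStats(System.out);            // console, chosen by the caller
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    stats.printStats(new PrintStream(buf));  // or captured, e.g. in a test
    System.out.print("captured: " + buf);
  }
}
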
@@ -40,7 +40,7 @@ import javax.xml.parsers.SAXParserFactory;
  *
  * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
  */
-public class PatternParser extends DefaultHandler implements PatternConsumer {
+public class PatternParser extends DefaultHandler {

   XMLReader parser;

@@ -64,7 +64,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {

   static final int ELEM_HYPHEN = 4;

-  public PatternParser() throws HyphenationException {
+  public PatternParser() {
     token = new StringBuilder();
     parser = createParser();
     parser.setContentHandler(this);
@@ -74,7 +74,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {

   }

-  public PatternParser(PatternConsumer consumer) throws HyphenationException {
+  public PatternParser(PatternConsumer consumer) {
     this();
     this.consumer = consumer;
   }
@@ -402,25 +402,4 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
     return str.toString();

   } // getLocationString(SAXParseException):String
-
-  // PatternConsumer implementation for testing purposes
-  public void addClass(String c) {
-    System.out.println("class: " + c);
-  }
-
-  public void addException(String w, ArrayList<Object> e) {
-    System.out.println("exception: " + w + " : " + e.toString());
-  }
-
-  public void addPattern(String p, String v) {
-    System.out.println("pattern: " + p + " : " + v);
-  }
-
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      PatternParser pp = new PatternParser();
-      pp.setConsumer(pp);
-      pp.parse(args[0]);
-    }
-  }
 }

@@ -17,6 +17,7 @@

 package org.apache.lucene.analysis.compound.hyphenation;

+import java.io.PrintStream;
 import java.util.Enumeration;
 import java.util.Stack;

@@ -633,11 +634,11 @@ public class TernaryTree implements Cloneable {

   }

-  public void printStats() {
-    System.out.println("Number of keys = " + Integer.toString(length));
-    System.out.println("Node count = " + Integer.toString(freenode));
+  public void printStats(PrintStream out) {
+    out.println("Number of keys = " + Integer.toString(length));
+    out.println("Node count = " + Integer.toString(freenode));
     // System.out.println("Array length = " + Integer.toString(eq.length));
-    System.out.println("Key Array length = " + Integer.toString(kv.length()));
+    out.println("Key Array length = " + Integer.toString(kv.length()));

     /*
      * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
@@ -647,8 +648,8 @@ public class TernaryTree implements Cloneable {
      */

   }
-
-  public static void main(String[] args) throws Exception {
+  /*
+  public static void main(String[] args) {
     TernaryTree tt = new TernaryTree();
     tt.insert("Carlos", 'C');
     tt.insert("Car", 'r');
@@ -658,7 +659,8 @@ public class TernaryTree implements Cloneable {
     System.out.println((char) tt.find("Car"));
     System.out.println((char) tt.find("Carlos"));
     System.out.println((char) tt.find("alto"));
-    tt.printStats();
+    tt.printStats(System.out);
   }
+  */

 }

@@ -94,8 +94,8 @@ public final class KeywordTokenizer extends Tokenizer {
   }

   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     this.done = false;
   }
 }

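This hunk is the LUCENE-2191 rename from the CHANGES section in action. A hedged reuse sketch under the TokenStream lifecycle of this era (reset, incrementToken, end, close); everything here beyond setReader itself follows that standard contract:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetReaderDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tok = new KeywordTokenizer(new StringReader("first"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);

    tok.reset();
    while (tok.incrementToken()) System.out.println(term); // first
    tok.end();
    tok.close();

    // setReader, not reset(Reader): hand the reused Tokenizer new input.
    tok.setReader(new StringReader("second"));
    tok.reset();
    while (tok.incrementToken()) System.out.println(term); // second
    tok.end();
    tok.close();
  }
}
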
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */

-import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;

@@ -122,7 +121,7 @@ public final class StopFilter extends FilteringTokenFilter {
    * Returns the next input Token whose term() is not a stop word.
    */
   @Override
-  protected boolean accept() throws IOException {
+  protected boolean accept() {
     return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
   }

@@ -48,7 +48,7 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
    * When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
    */
   @Override
-  protected boolean accept() throws IOException {
+  protected boolean accept() {
     return useWhiteList == stopTypes.contains(typeAttribute.type());
   }
 }

@@ -1,4 +1,7 @@
 package org.apache.lucene.analysis.de;

+import java.util.Locale;
+
 // This file is encoded in UTF-8

 /*
@@ -38,6 +41,8 @@ public class GermanStemmer
    */
   private int substCount = 0;

+  private static final Locale locale = new Locale("de", "DE");
+
   /**
    * Stemms the given term to an unique <tt>discriminator</tt>.
    *
@@ -47,7 +52,7 @@ public class GermanStemmer
   protected String stem( String term )
   {
     // Use lowercase for medium stemming.
-    term = term.toLowerCase();
+    term = term.toLowerCase(locale);
     if ( !isStemmable( term ) )
       return term;
     // Reset the StringBuilder.

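The Brazilian and German stemmer hunks exist because String.toLowerCase() consults the JVM default locale, so stemming could differ per machine; the classic failure is a Turkish default locale, where uppercase 'I' lowercases to a dotless 'ı'. A self-contained illustration:

import java.util.Locale;

public class LowerCaseLocale {
  public static void main(String[] args) {
    String term = "INDEX";
    // Locale-sensitive: a JVM defaulting to tr_TR would produce "ındex".
    System.out.println(term.toLowerCase(new Locale("tr", "TR"))); // ındex
    // Pinning the stemmer's language keeps results identical everywhere.
    System.out.println(term.toLowerCase(new Locale("de", "DE"))); // index
  }
}
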
@@ -289,7 +289,7 @@ public class KStemmer {
         entry = new DictEntry(exceptionWords[i], true);
         d.put(exceptionWords[i], entry);
       } else {
-        System.out.println("Warning: Entry [" + exceptionWords[i]
+        throw new RuntimeException("Warning: Entry [" + exceptionWords[i]
             + "] already in dictionary 1");
       }
     }
@@ -299,7 +299,7 @@ public class KStemmer {
         entry = new DictEntry(directConflations[i][1], false);
         d.put(directConflations[i][0], entry);
       } else {
-        System.out.println("Warning: Entry [" + directConflations[i][0]
+        throw new RuntimeException("Warning: Entry [" + directConflations[i][0]
             + "] already in dictionary 2");
       }
     }
@@ -309,7 +309,7 @@ public class KStemmer {
         entry = new DictEntry(countryNationality[i][1], false);
         d.put(countryNationality[i][0], entry);
       } else {
-        System.out.println("Warning: Entry [" + countryNationality[i][0]
+        throw new RuntimeException("Warning: Entry [" + countryNationality[i][0]
             + "] already in dictionary 3");
       }
     }
@@ -323,7 +323,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -333,7 +333,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -343,7 +343,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -353,7 +353,7 @@ public class KStemmer {
      if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -363,7 +363,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -373,7 +373,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -383,7 +383,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -392,7 +392,7 @@ public class KStemmer {
       if (!d.containsKey(KStemData8.data[i])) {
         d.put(KStemData8.data[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + KStemData8.data[i]
+        throw new RuntimeException("Warning: Entry [" + KStemData8.data[i]
             + "] already in dictionary 4");
       }
     }
@@ -401,7 +401,7 @@ public class KStemmer {
       if (!d.containsKey(supplementDict[i])) {
         d.put(supplementDict[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + supplementDict[i]
+        throw new RuntimeException("Warning: Entry [" + supplementDict[i]
             + "] already in dictionary 5");
       }
     }
@@ -410,7 +410,7 @@ public class KStemmer {
       if (!d.containsKey(properNouns[i])) {
         d.put(properNouns[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + properNouns[i]
+        throw new RuntimeException("Warning: Entry [" + properNouns[i]
             + "] already in dictionary 6");
       }
     }

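All thirteen KStemmer hunks make the same change: a duplicate entry in the built-in dictionaries used to print a warning and continue; now it fails fast, since a duplicate indicates a broken data table rather than a user error. The pattern reduced to its core (all names hypothetical):

import java.util.HashMap;
import java.util.Map;

class DictLoader {
  // Hypothetical reduction of the KStemmer change: surface duplicates
  // as an exception instead of a System.out warning nobody reads.
  static void load(Map<String, Boolean> dict, String[] words) {
    for (String w : words) {
      if (dict.containsKey(w)) {
        throw new RuntimeException("Entry [" + w + "] already in dictionary");
      }
      dict.put(w, Boolean.TRUE);
    }
  }

  public static void main(String[] args) {
    // Throws on the second "cat".
    load(new HashMap<String, Boolean>(), new String[] {"cat", "dog", "cat"});
  }
}
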
@@ -492,10 +492,9 @@ class PorterStemmer
     return dirty;
   }

-  /** Test program for demonstrating the Stemmer. It reads a file and
+  /* Test program for demonstrating the Stemmer. It reads a file and
    * stems each word, writing the result to standard out.
    * Usage: Stemmer file-name
-   */
   public static void main(String[] args) {
     PorterStemmer s = new PorterStemmer();

@@ -542,6 +541,6 @@ class PorterStemmer
         System.out.println("error reading " + args[i]);
       }
     }
-  }
+  }*/
 }

@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -134,6 +133,6 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected Reader initReader(String fieldName, Reader reader) {
-    return new PersianCharFilter(CharReader.get(reader));
+    return new PersianCharFilter(reader);
   }
 }

@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.fa;
  */

 import java.io.IOException;
+import java.io.Reader;

-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.charfilter.CharFilter;
+import org.apache.lucene.analysis.CharFilter;

 /**
  * CharFilter that replaces instances of Zero-width non-joiner with an
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.charfilter.CharFilter;
  */
 public class PersianCharFilter extends CharFilter {

-  public PersianCharFilter(CharStream in) {
+  public PersianCharFilter(Reader in) {
     super(in);
   }

@@ -45,4 +45,9 @@ public class PersianCharFilter extends CharFilter {
     }
     return charsRead;
   }
+
+  @Override
+  protected int correct(int currentOff) {
+    return currentOff; // we don't change the length of the string
+  }
 }

@@ -66,10 +66,11 @@ public class HunspellDictionary {

   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
@@ -80,10 +81,11 @@ public class HunspellDictionary {

   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @throws IOException Can be thrown while reading from the InputStreams
@@ -95,10 +97,11 @@ public class HunspellDictionary {

   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @throws IOException Can be thrown while reading from the InputStreams
@@ -110,10 +113,11 @@ public class HunspellDictionary {

   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
@@ -194,7 +198,6 @@ public class HunspellDictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       }
     }
-    reader.close();
   }

   /**
@@ -252,7 +255,7 @@ public class HunspellDictionary {
     }

     String condition = ruleArgs[4];
-    affix.setCondition(condition, String.format(conditionPattern, condition));
+    affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
     affix.setCrossProduct(crossProduct);

     List<HunspellAffix> list = affixes.get(affix.getAppend());
@@ -376,7 +379,7 @@ public class HunspellDictionary {
         Arrays.sort(wordForm.getFlags());
         entry = line.substring(0, flagSep);
         if(ignoreCase) {
-          entry = entry.toLowerCase(Locale.ENGLISH);
+          entry = entry.toLowerCase(Locale.ROOT);
         }
       }

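The two Locale hunks here are driven by the check-forbidden-apis target described in the CHANGES section above: String.format and toLowerCase without an explicit locale silently depend on the machine's default. A standalone illustration of the String.format case:

import java.util.Locale;

public class FormatLocale {
  public static void main(String[] args) {
    double value = 1234.5;
    // Default locale: "1234,5" on a German JVM, "1234.5" on an English one.
    System.out.println(String.format("%.1f", value));
    // Locale.ROOT: always "1234.5", regardless of where the code runs.
    System.out.println(String.format(Locale.ROOT, "%.1f", value));
  }
}
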
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -298,13 +299,12 @@ public class HunspellStemmer {

   // ================================================= Entry Point ===================================================

-  /**
+  /*
    * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
    *
    * @param args Program arguments. Should contain location of affix file and location of dic file
    * @throws IOException Can be thrown while reading from the files
    * @throws ParseException Can be thrown while parsing the files
-   */
   public static void main(String[] args) throws IOException, ParseException {
     boolean ignoreCase = false;
     int offset = 0;
@@ -330,7 +330,7 @@ public class HunspellStemmer {

     HunspellStemmer stemmer = new HunspellStemmer(dictionary);

-    Scanner scanner = new Scanner(System.in);
+    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());

     System.out.print("> ");
     while (scanner.hasNextLine()) {
@@ -346,12 +346,10 @@ public class HunspellStemmer {
       }
     }

-  /**
    * Prints the results of the stemming of a word
    *
    * @param originalWord Word that has been stemmed
    * @param stems Stems of the word
-   */
   private static void printStemResults(String originalWord, List<Stem> stems) {
     StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");

@@ -381,13 +379,12 @@ public class HunspellStemmer {
     System.out.println(builder);
   }

-  /**
    * Simple utility to check if the given String has any text
    *
    * @param str String to check if it has any text
    * @return {@code true} if the String has text, {@code false} otherwise
-   */
   private static boolean hasText(String str) {
     return str != null && str.length() > 0;
   }
+  */
 }

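The Scanner hunk is the same hardening on the input side: new Scanner(System.in) decodes with an implicit platform charset, while the two-argument constructor makes the decode explicit (here the commit passes the default charset's name, keeping behavior identical but making the dependency visible to the forbidden-APIs checker). Sketch:

import java.nio.charset.Charset;
import java.util.Scanner;

public class StdinEcho {
  public static void main(String[] args) {
    // Explicit charset name instead of the implicit platform default.
    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
    System.out.print("> ");
    while (scanner.hasNextLine()) {
      System.out.println(scanner.nextLine());
      System.out.print("> ");
    }
    scanner.close();
  }
}
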
@@ -19,15 +19,13 @@ package org.apache.lucene.analysis.miscellaneous;

 import org.apache.lucene.analysis.TokenStream;

-import java.io.IOException;
-
 /**
  * An always exhausted token stream.
  */
 public final class EmptyTokenStream extends TokenStream {

   @Override
-  public final boolean incrementToken() throws IOException {
+  public final boolean incrementToken() {
     return false;
   }

@@ -17,9 +17,6 @@
 
 package org.apache.lucene.analysis.miscellaneous;
 
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -43,7 +40,7 @@ public final class KeepWordFilter extends FilteringTokenFilter {
   }
 
   @Override
-  public boolean accept() throws IOException {
+  public boolean accept() {
     return words.contains(termAtt.buffer(), 0, termAtt.length());
   }
 }
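Note: KeepWordFilter (and LengthFilter just below) now implement FilteringTokenFilter.accept() without the IOException clause, since the check is a pure in-memory test. A hedged sketch of the same idiom, assuming the FilteringTokenFilter constructor signature of this 4.x branch:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

// Hypothetical filter: keeps only tokens of at least minLen characters.
// accept() touches no I/O, so it no longer needs to declare IOException.
public final class MinLengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final int minLen;

  public MinLengthFilter(boolean enablePositionIncrements, TokenStream in, int minLen) {
    super(enablePositionIncrements, in);
    this.minLen = minLen;
  }

  @Override
  public boolean accept() {
    return termAtt.length() >= minLen;
  }
}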
@@ -17,10 +17,7 @@ package org.apache.lucene.analysis.miscellaneous;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
@@ -48,7 +45,7 @@ public final class LengthFilter extends FilteringTokenFilter {
   }
 
   @Override
-  public boolean accept() throws IOException {
+  public boolean accept() {
     final int len = termAtt.length();
     return (len >= min && len <= max);
   }
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.miscellaneous;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@@ -46,7 +44,7 @@ public final class SingleTokenTokenStream extends TokenStream {
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public final boolean incrementToken() {
     if (exhausted) {
       return false;
     } else {
@@ -58,7 +56,7 @@ public final class SingleTokenTokenStream extends TokenStream {
   }
 
   @Override
-  public void reset() throws IOException {
+  public void reset() {
     exhausted = false;
   }
 
@@ -23,7 +23,6 @@ import java.io.StringReader;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 /**
@@ -54,7 +53,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
   private final String replacement;
   private Reader transformedInput;
 
-  public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
+  public PatternReplaceCharFilter(Pattern pattern, String replacement, Reader in) {
     super(in);
     this.pattern = pattern;
     this.replacement = replacement;
@@ -64,15 +63,28 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
   public int read(char[] cbuf, int off, int len) throws IOException {
     // Buffer all input on the first call.
     if (transformedInput == null) {
+      fill();
+    }
+
+    return transformedInput.read(cbuf, off, len);
+  }
+
+  private void fill() throws IOException {
     StringBuilder buffered = new StringBuilder();
     char [] temp = new char [1024];
-    for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
+    for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
       buffered.append(temp, 0, cnt);
     }
     transformedInput = new StringReader(processPattern(buffered).toString());
   }
-
-    return transformedInput.read(cbuf, off, len);
-  }
+
+  @Override
+  public int read() throws IOException {
+    if (transformedInput == null) {
+      fill();
+    }
+
+    return transformedInput.read();
+  }
 
   @Override
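Note: PatternReplaceCharFilter buffers its whole input lazily, on the first read, and serves every subsequent read from the transformed buffer. A self-contained sketch of that lazy-fill pattern on a plain java.io.Reader (names here are illustrative, not the Lucene class):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

// Wraps a Reader and upper-cases everything: the input is consumed and
// transformed once, on the first read, then served from a StringReader.
public class UpperCaseReader extends Reader {
  private final Reader in;
  private Reader transformed;

  public UpperCaseReader(Reader in) { this.in = in; }

  private void fill() throws IOException {
    StringBuilder buffered = new StringBuilder();
    char[] temp = new char[1024];
    for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
      buffered.append(temp, 0, cnt);
    }
    transformed = new StringReader(buffered.toString().toUpperCase());
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (transformed == null) {
      fill(); // buffer all input on the first call
    }
    return transformed.read(cbuf, off, len);
  }

  @Override
  public void close() throws IOException { in.close(); }
}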
@@ -84,7 +84,7 @@ public final class PatternTokenizer extends Tokenizer {
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
+  public boolean incrementToken() {
     if (index >= str.length()) return false;
     clearAttributes();
     if (group >= 0) {
@@ -130,14 +130,14 @@ public final class PatternTokenizer extends Tokenizer {
   }
 
   @Override
-  public void end() throws IOException {
+  public void end() {
     final int ofs = correctOffset(str.length());
     offsetAtt.setOffset(ofs, ofs);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     fillBuffer(str, input);
     matcher.reset(str);
     index = 0;
@@ -132,7 +132,7 @@ public abstract class RSLPStemmerBase {
       super(suffix, min, replacement);
       for (int i = 0; i < exceptions.length; i++) {
         if (!exceptions[i].endsWith(suffix))
-          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+          throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
       }
       this.exceptions = new CharArraySet(Version.LUCENE_50,
           Arrays.asList(exceptions), false);
@@ -156,7 +156,7 @@ public abstract class RSLPStemmerBase {
       super(suffix, min, replacement);
       for (int i = 0; i < exceptions.length; i++) {
         if (!exceptions[i].endsWith(suffix))
-          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+          throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
       }
       this.exceptions = new char[exceptions.length][];
       for (int i = 0; i < exceptions.length; i++)
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.sinks;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.util.Date;
+import java.util.Locale;
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeSource;
@@ -37,10 +38,12 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
   protected CharTermAttribute termAtt;
 
   /**
-   * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
+   * Uses {@link java.text.DateFormat#getDateInstance(int, Locale)
+   * DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as
+   * the {@link java.text.DateFormat} object.
    */
   public DateRecognizerSinkFilter() {
-    this(DateFormat.getDateInstance());
+    this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT));
   }
 
   public DateRecognizerSinkFilter(DateFormat dateFormat) {
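Note: the same default-locale cleanup as in HunspellDictionary, applied to date parsing. A standalone sketch of the pattern (not from the commit):

import java.text.DateFormat;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;

public class RootLocaleDateFormatDemo {
  public static void main(String[] args) throws ParseException {
    // Pinning the locale to Locale.ROOT keeps formatting and parsing
    // identical on every JVM, instead of silently depending on the
    // machine's default locale.
    DateFormat df = DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT);
    Date d = df.parse(df.format(new Date()));
    System.out.println(d);
  }
}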
@@ -212,7 +212,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public final boolean incrementToken() {
     // lazy init the iterator
     if (it == null) {
       it = cachedStates.iterator();
@@ -228,7 +228,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
   }
 
   @Override
-  public final void end() throws IOException {
+  public final void end() {
     if (finalState != null) {
       restoreState(finalState);
     }
@@ -114,9 +114,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
         src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }
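Note: the reset(Reader) hook on Analyzer.TokenStreamComponents is renamed setReader(Reader) throughout this commit. A hedged sketch of the override pattern, assuming the Analyzer API of this 4.x branch (the analyzer itself is hypothetical):

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// setReader(Reader) is the place to re-apply per-analyzer settings each
// time the cached, reusable components are pointed at a new input.
public final class MyAnalyzer extends Analyzer {
  int maxTokenLength = 255;

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(Version.LUCENE_50, reader);
    src.setMaxTokenLength(maxTokenLength);
    return new TokenStreamComponents(src) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        src.setMaxTokenLength(MyAnalyzer.this.maxTokenLength);
        super.setReader(reader);
      }
    };
  }
}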
@@ -175,8 +175,8 @@ public final class ClassicTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
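Note: Tokenizer subclasses likewise move from reset(Reader) to setReader(Reader) to rewire their scanner to a new input. A minimal hedged sketch; as of this branch the method is overridable, and the abstract resetScanner stands in for whatever per-input state a real tokenizer keeps (e.g. a JFlex scanner.yyreset):

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;

// Hypothetical base class: per-input state is re-initialized in
// setReader(Reader), the replacement for the old reset(Reader).
public abstract class ScannerBackedTokenizer extends Tokenizer {
  protected ScannerBackedTokenizer(Reader input) {
    super(input);
  }

  @Override
  public void setReader(Reader reader) throws IOException {
    super.setReader(reader);
    resetScanner(reader);
  }

  protected abstract void resetScanner(Reader reader);
}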
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
 
 package org.apache.lucene.analysis.standard;
 
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 9/30/11 12:10 PM from the specification file
- * <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 08.07.12 16:59 from the specification file
+ * <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
  */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
 
@@ -383,15 +383,7 @@ public final void getText(CharTermAttribute t) {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param in  the java.io.Inputstream to read input from.
-   */
-  ClassicTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
 
   /**
    * Unpacks the compressed character translation table.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
+// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
 
@@ -115,9 +115,9 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
        src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }
@@ -183,8 +183,8 @@ public final class StandardTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
 
 package org.apache.lucene.analysis.standard;
 
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -759,15 +759,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param in  the java.io.Inputstream to read input from.
-   */
-  public StandardTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
 
   /**
   * Unpacks the compressed character translation table.
@@ -104,9 +104,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }
@@ -162,8 +162,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/18/12 12:05 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
 
 package org.apache.lucene.analysis.standard;
 
@@ -3844,15 +3844,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param in  the java.io.Inputstream to read input from.
-   */
-  public UAX29URLEmailTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
 
   /**
   * Unpacks the compressed character translation table.
@@ -1,6 +1,6 @@
 package org.apache.lucene.analysis.standard;
 
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -92,7 +92,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder {
     return analyze(analyzer, text, reuse);
   }
 
-  private void addInternal(CharsRef synset[], int size) throws IOException {
+  private void addInternal(CharsRef synset[], int size) {
     if (size <= 1) {
       return; // nothing to do
     }
@@ -650,7 +650,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
   }
 
   /**
-   * Empty {@link UnmodifiableCharArrayMap} optimized for speed.
+   * Empty {@link org.apache.lucene.analysis.util.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
    * Contains checks will always return <code>false</code> or throw
    * NPE if necessary.
   */
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.util;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.CharStream;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharFilter;
 
 /**
- * Abstract parent class for analysis factories that create {@link CharStream}
+ * Abstract parent class for analysis factories that create {@link CharFilter}
  * instances.
 */
 public abstract class CharFilterFactory extends AbstractAnalysisFactory {
 
-  public abstract CharStream create(CharStream input);
+  public abstract CharFilter create(Reader input);
 }
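Note: factories now consume a plain java.io.Reader and return a CharFilter. A hedged sketch of a factory under the new signature (the concrete filter choice follows the HTMLStripCharFilter changes later in this commit; treat the class name as illustrative):

import java.io.Reader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.util.CharFilterFactory;

// Under the new API the factory wraps the Reader directly;
// no CharReader.get(...) adapter step is needed anymore.
public class HTMLStripCharFilterFactory extends CharFilterFactory {
  @Override
  public CharFilter create(Reader input) {
    return new HTMLStripCharFilter(input);
  }
}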
@@ -162,8 +162,8 @@ public abstract class CharTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;
@@ -325,13 +325,13 @@ public final class WikipediaTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(input);
   }
 
   @Override
-  public void end() throws IOException {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
     this.offsetAtt.setOffset(finalOffset, finalOffset);
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
 
 package org.apache.lucene.analysis.wikipedia;
 
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 1/22/12 10:26 PM from the specification file
- * <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 08.07.12 17:00 from the specification file
+ * <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
 */
 class WikipediaTokenizerImpl {
 
@@ -519,15 +519,7 @@ final void reset() {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param in  the java.io.Inputstream to read input from.
-   */
-  WikipediaTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
 
   /**
   * Unpacks the compressed character translation table.
@@ -435,7 +435,7 @@ public abstract class SnowballProgram {
         bra > ket ||
         ket > limit)
     {
-      System.err.println("faulty slice operation");
+      throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
       // FIXME: report error somehow.
       /*
       fprintf(stderr, "faulty slice operation:\n");
@@ -24,7 +24,7 @@
 For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
 </p>
 <p>
-  This module contains concrete components ({@link org.apache.lucene.analysis.charfilter.CharFilter}s,
+  This module contains concrete components ({@link org.apache.lucene.analysis.CharFilter}s,
   {@link org.apache.lucene.analysis.Tokenizer}s, and ({@link org.apache.lucene.analysis.TokenFilter}s) for
   analyzing different types of content. It also provides a number of {@link org.apache.lucene.analysis.Analyzer}s
   for different languages that you can use to get started quickly.
@@ -96,6 +96,6 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
@@ -76,6 +76,6 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
@@ -162,7 +162,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
@@ -58,6 +58,6 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
@@ -29,7 +29,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
@@ -46,7 +45,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(String fieldName, Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
+        return new HTMLStripCharFilter(reader);
       }
     };
   }
@@ -60,7 +59,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "\nthis is some text\n here is a link and " +
             "another link. " +
             "This is an entity: & plus a <. Here is an &. ";
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     char [] goldArray = gold.toCharArray();
@@ -79,7 +78,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   //Some sanity checks, but not a full-fledged check
   public void testHTML() throws Exception {
     InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     while ((ch = reader.read()) != -1){
@@ -96,7 +95,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
   public void testMSWord14GeneratedHTML() throws Exception {
     InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
     String gold = "This is a test";
     StringBuilder builder = new StringBuilder();
     int ch = 0;
@@ -117,7 +116,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "\u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -132,7 +131,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -147,7 +146,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = " <junk/> ! @ and ’";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -161,7 +160,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -346,7 +345,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     for (int i = 0 ; i < testGold.length ; i += 2) {
       String test = testGold[i];
       String gold = testGold[i + 1];
-      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      Reader reader = new HTMLStripCharFilter(new StringReader(test));
       StringBuilder builder = new StringBuilder();
       int ch = 0;
       while ((ch = reader.read()) != -1){
@@ -370,7 +369,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     testBuilder.append("-->foo");
     String gold = "foo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -388,7 +387,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("?>");
     gold = "";
-    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
     ch = 0;
     builder = new StringBuilder();
     try {
@@ -406,7 +405,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("/>");
     gold = "";
-    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
     ch = 0;
     builder = new StringBuilder();
     try {
@@ -430,7 +429,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
   private void processBuffer(String test, String assertMsg) throws IOException {
     // System.out.println("-------------------processBuffer----------");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
    int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -448,7 +447,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     String test = "<!--- three dashes, still a valid comment ---> ";
     String gold = " ";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -464,7 +463,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
 
   public void doTestOffsets(String in) throws Exception {
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
     int ch = 0;
     int off = 0; // offset in the reader
     int strOff = -1; // offset in the original string
@@ -491,7 +490,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
   static void assertLegalOffsets(String in) throws Exception {
     int length = in.length();
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
     int ch = 0;
     int off = 0;
     while ((ch = reader.read()) != -1) {
@@ -508,12 +507,12 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testRandom() throws Exception {
-    int numRounds = RANDOM_MULTIPLIER * 10000;
+    int numRounds = RANDOM_MULTIPLIER * 1000;
     checkRandomData(random(), newTestAnalyzer(), numRounds);
   }
 
   public void testRandomHugeStrings() throws Exception {
-    int numRounds = RANDOM_MULTIPLIER * 200;
+    int numRounds = RANDOM_MULTIPLIER * 100;
     checkRandomData(random(), newTestAnalyzer(), numRounds, 8192);
   }
 
@@ -526,7 +525,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
         + " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
         + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
     String gold = "onetwo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -540,7 +539,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
     gold = "one\ntwo";
-    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    reader = new HTMLStripCharFilter(new StringReader(test));
     ch = 0;
     builder = new StringBuilder();
     try {
@@ -557,7 +556,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testScriptQuotes() throws Exception {
     String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
     String gold = "one\ntwo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -572,7 +571,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
     gold = "hello\n";
-    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    reader = new HTMLStripCharFilter(new StringReader(test));
     ch = 0;
     builder = new StringBuilder();
     try {
@@ -591,7 +590,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "one<script no-value-attr></script>two";
     Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
     Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(test)), escapedTags);
+        (new StringReader(test), escapedTags);
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -612,7 +611,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
         + "-->\n"
         + "</style>two";
     String gold = "one\ntwo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -631,7 +630,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "one<style type=\"text/css\"></style>two";
     Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
     Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(test)), escapedTags);
+        (new StringReader(test), escapedTags);
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -656,7 +655,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     for (int i = 0 ; i < testGold.length ; i += 2) {
       String test = testGold[i];
       String gold = testGold[i + 1];
-      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      Reader reader = new HTMLStripCharFilter(new StringReader(test));
       StringBuilder builder = new StringBuilder();
      int ch = 0;
       while ((ch = reader.read()) != -1){
@@ -671,7 +670,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "one<BR class='whatever'>two</\nBR\n>";
     Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
     Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(test)), escapedTags);
+        (new StringReader(test), escapedTags);
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -688,7 +687,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testInlineTagsNoSpace() throws Exception {
     String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
     String gold = "onetwo2e.three";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -705,7 +704,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testCDATA() throws Exception {
     String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
     String gold = "one<one><two>three<four></four></two></one>two";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -720,7 +719,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
     gold = "onetwo<![CDATA[three]]>fourfive";
-    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    reader = new HTMLStripCharFilter(new StringReader(test));
     ch = 0;
     builder = new StringBuilder();
     try {
@@ -737,7 +736,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testUppercaseCharacterEntityVariants() throws Exception {
     String test = " "-©>><<®&";
     String gold = " \"-\u00A9>><<\u00AE&";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -754,7 +753,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testMSWordMalformedProcessingInstruction() throws Exception {
     String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
     String gold = "onetwo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -771,7 +770,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   public void testSupplementaryCharsInTags() throws Exception {
     String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
     String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new HTMLStripCharFilter(new StringReader(test));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -822,7 +821,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
       }
     }
     Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(text.toString())));
+        (new StringReader(text.toString()));
     while (reader.read() != -1);
   }
 
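Note: every call site in this test drops the CharReader.get(...) wrapper, since HTMLStripCharFilter (like all CharFilters after this change) is constructed over a plain java.io.Reader. A small hedged usage sketch:

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class HtmlStripDemo {
  public static void main(String[] args) throws Exception {
    // The filter is just a Reader decorator now: Reader in, Reader out.
    Reader reader = new HTMLStripCharFilter(new StringReader("one<b>two</b>"));
    StringBuilder out = new StringBuilder();
    for (int ch = reader.read(); ch != -1; ch = reader.read()) {
      out.append((char) ch);
    }
    System.out.println(out); // expected: onetwo (inline tags removed)
  }
}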
@@ -29,8 +29,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -60,7 +59,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   public void testReaderReset() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     char[] buf = new char[10];
     int len = cs.read(buf, 0, 10);
     assertEquals( 1, len );
@@ -76,55 +75,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   public void testNothingChange() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to3() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test2to4() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
   }
 
   public void test2to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
   }
 
   public void test3to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
   }
 
   public void test4to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
   }
 
   public void test5to0() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
   }
@@ -149,7 +148,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   //
   public void testTokenStream() throws Exception {
     String testString = "h i j k ll cccc bbb aa";
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"i","i","jj","kkk","llll","cc","b","a"},
@@ -171,8 +170,8 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   // h,8,9 => i,8,9
   public void testChained() throws Exception {
     String testString = "aaaa ll h";
-    CharStream cs = new MappingCharFilter( normMap,
-      new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
+    CharFilter cs = new MappingCharFilter( normMap,
+      new MappingCharFilter( normMap, new StringReader( testString ) ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"a","llllllll","i"},
@@ -193,7 +192,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(String fieldName, Reader reader) {
-        return new MappingCharFilter(normMap, CharReader.get(reader));
+        return new MappingCharFilter(normMap, reader);
       }
     };
 
@@ -219,7 +218,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
      @Override
      protected Reader initReader(String fieldName, Reader reader) {
-        return new MappingCharFilter(map, CharReader.get(reader));
+        return new MappingCharFilter(map, reader);
      }
    };
 
@@ -229,7 +228,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
   //@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
   public void testRandomMaps() throws Exception {
-    int numIterations = atLeast(10);
+    int numIterations = atLeast(3);
     for (int i = 0; i < numIterations; i++) {
       final NormalizeCharMap map = randomMap();
       Analyzer analyzer = new Analyzer() {
@@ -241,7 +240,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
         @Override
        protected Reader initReader(String fieldName, Reader reader) {
-          return new MappingCharFilter(map, CharReader.get(reader));
+          return new MappingCharFilter(map, reader);
        }
      };
      int numRounds = 100;
@@ -270,7 +269,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
   public void testRandomMaps2() throws Exception {
     final Random random = random();
-    final int numIterations = atLeast(10);
+    final int numIterations = atLeast(3);
     for(int iter=0;iter<numIterations;iter++) {
 
       if (VERBOSE) {
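Note: the same CharStream-to-Reader migration applies to MappingCharFilter, which now chains directly over any Reader, including another CharFilter. A hedged sketch, assuming a NormalizeCharMap named normMap built elsewhere:

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class MappingDemo {
  static Reader mapTwice(NormalizeCharMap normMap, String text) {
    // CharFilters compose like ordinary Readers: no CharReader.get(...)
    // adapter is needed for either level of the chain.
    return new MappingCharFilter(normMap,
        new MappingCharFilter(normMap, new StringReader(text)));
  }
}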
@@ -23,7 +23,6 @@ import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -216,7 +215,7 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(String fieldName, Reader reader) {
-        return new MappingCharFilter(norm, CharReader.get(reader));
+        return new MappingCharFilter(norm, reader);
       }
     };
 
@@ -272,13 +271,13 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
 
   public void testEmptyTerm() throws IOException {
@@ -0,0 +1,67 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
+  Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(t, new CJKBigramFilter(t));
+    }
+  };
+
+  public void testHuge() throws Exception {
+    assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+      + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+      + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+      new String[] {
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
+      }
+    );
+  }
+
+  public void testHanOnly() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
+      }
+    };
+    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+  }
+}
@@ -63,7 +63,7 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
   }
 
   public void testRandomData() throws IOException {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
@@ -48,7 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     assertTrue(cgf.incrementToken());
     assertEquals("the_s", term.toString());
 
-    wt.reset(new StringReader(input));
+    wt.setReader(new StringReader(input));
     cgf.reset();
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.toString());
@@ -66,7 +66,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     assertTrue(nsf.incrementToken());
     assertEquals("the_s", term.toString());
 
-    wt.reset(new StringReader(input));
+    wt.setReader(new StringReader(input));
     nsf.reset();
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.toString());
@@ -81,7 +81,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    * "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens
    * out)
    *
-   * @return Map<String,String>
    */
   public void testCommonGramsQueryFilter() throws Exception {
     Analyzer a = new Analyzer() {
@@ -319,7 +318,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
      }
    };
 
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
 
    Analyzer b = new Analyzer() {
 
@@ -331,6 +330,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
      }
    };
 
-    checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
   }
 }
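Note: these tests reuse a single tokenizer/filter chain across inputs by pointing the tokenizer at a fresh Reader with setReader, then resetting the consuming stream before pulling tokens again. A hedged sketch of that reuse loop on this branch's API (WhitespaceTokenizer stands in for the test's tokenizer):

import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ReuseDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tok =
        new WhitespaceTokenizer(Version.LUCENE_50, new StringReader("first input"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);

    tok.reset();
    while (tok.incrementToken()) System.out.println(term);
    tok.end();

    // Reuse the same instance: rewire the input, then reset() before consuming.
    tok.setReader(new StringReader("second input"));
    tok.reset();
    while (tok.incrementToken()) System.out.println(term);
    tok.end();
    tok.close();
  }
}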
@@ -24,7 +24,6 @@ import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -240,7 +239,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
     assertTrue(tf.incrementToken());
     assertEquals("Rind", termAtt.toString());
-    wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
+    wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
     tf.reset();
     assertTrue(tf.incrementToken());
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
@@ -327,7 +326,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(String fieldName, Reader reader) {
-        return new MappingCharFilter(normMap, CharReader.get(reader));
+        return new MappingCharFilter(normMap, reader);
       }
     };
 
@@ -348,7 +347,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
       }
     };
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
 
     InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
     final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
@@ -361,7 +360,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, filter);
       }
     };
-    checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws Exception {
@@ -163,7 +163,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
     filter.reset();
     String highSurEndingUpper = "BogustermBoguster\ud801";
     String highSurEndingLower = "bogustermboguster\ud801";
-    tokenizer.reset(new StringReader(highSurEndingUpper));
+    tokenizer.setReader(new StringReader(highSurEndingUpper));
     assertTokenStreamContents(filter, new String[] {highSurEndingLower});
     assertTrue(filter.hasAttribute(CharTermAttribute.class));
     char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
@@ -191,17 +191,17 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
-    checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
-    checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
 }
 
@@ -1,12 +1,12 @@
 package org.apache.lucene.analysis.core;
 
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.CharBuffer;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockCharFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
@@ -65,10 +65,10 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
     checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
   }
 
-  CharStream wrappedStream = new CharStream() {
+  CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {
 
     @Override
-    public void mark(int readAheadLimit) throws IOException {
+    public void mark(int readAheadLimit) {
       throw new UnsupportedOperationException("mark(int)");
     }
 
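
Both sides of this hunk define a stub whose every method throws, but the new version must call CharFilter's constructor with a wrapped Reader, because CharFilter is now a concrete Reader decorator rather than the old CharStream marker class; correct(int) likewise replaces correctOffset(int) as the subclass hook. A minimal non-throwing CharFilter under the refactored API (a sketch, assuming the 4.x base class shown in this diff with its protected input field):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.CharFilter;

    // Pass-through filter: forwards characters unchanged, so offset
    // correction is the identity function.
    final class PassThroughCharFilter extends CharFilter {
      PassThroughCharFilter(Reader in) {
        super(in); // the wrapped Reader is mandatory now
      }

      @Override
      public int correct(int currentOff) {
        return currentOff; // no chars inserted or deleted
      }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return input.read(cbuf, off, len);
      }
    }
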
@@ -78,53 +78,53 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
     }
 
     @Override
-    public int read() throws IOException {
+    public int read() {
       throw new UnsupportedOperationException("read()");
     }
 
     @Override
-    public int read(char[] cbuf) throws IOException {
+    public int read(char[] cbuf) {
       throw new UnsupportedOperationException("read(char[])");
     }
 
     @Override
-    public int read(CharBuffer target) throws IOException {
+    public int read(CharBuffer target) {
       throw new UnsupportedOperationException("read(CharBuffer)");
     }
 
     @Override
-    public boolean ready() throws IOException {
+    public boolean ready() {
       throw new UnsupportedOperationException("ready()");
     }
 
     @Override
-    public void reset() throws IOException {
+    public void reset() {
       throw new UnsupportedOperationException("reset()");
     }
 
     @Override
-    public long skip(long n) throws IOException {
+    public long skip(long n) {
       throw new UnsupportedOperationException("skip(long)");
     }
 
     @Override
-    public int correctOffset(int currentOff) {
-      throw new UnsupportedOperationException("correctOffset(int)");
+    public int correct(int currentOff) {
+      throw new UnsupportedOperationException("correct(int)");
     }
 
     @Override
-    public void close() throws IOException {
+    public void close() {
       throw new UnsupportedOperationException("close()");
     }
 
     @Override
-    public int read(char[] arg0, int arg1, int arg2) throws IOException {
+    public int read(char[] arg0, int arg1, int arg2) {
       throw new UnsupportedOperationException("read(char[], int, int)");
     }
   };
 
   public void testWrapping() throws Exception {
-    CharStream cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
+    CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
     try {
       cs.mark(1);
       fail();
@@ -178,7 +178,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
       cs.correctOffset(1);
       fail();
     } catch (Exception e) {
-      assertEquals("correctOffset(int)", e.getMessage());
+      assertEquals("correct(int)", e.getMessage());
     }
 
     try {

@@ -315,12 +315,12 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
 }

@@ -74,7 +74,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    for (int i = 0; i < 10000; i++) {
+    for (int i = 0; i < 1000; i++) {
       String s = _TestUtil.randomSimpleString(random);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
                    right.tokenStream("foo", newStringReader(s)));
@@ -94,7 +94,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomSimpleString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@@ -112,7 +112,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    for (int i = 0; i < 10000; i++) {
+    for (int i = 0; i < 1000; i++) {
       String s = _TestUtil.randomHtmlishString(random, 20);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
                    right.tokenStream("foo", newStringReader(s)));
@@ -121,7 +121,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
 
   public void testLetterHtmlishHuge() throws Exception {
     Random random = random();
-    int maxLength = 2048; // this is number of elements, not chars!
+    int maxLength = 1024; // this is number of elements, not chars!
     MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
     left.setMaxTokenLength(255); // match CharTokenizer's max token length
     Analyzer right = new Analyzer() {
@@ -131,7 +131,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomHtmlishString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@@ -149,7 +149,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    for (int i = 0; i < 10000; i++) {
+    for (int i = 0; i < 1000; i++) {
       String s = _TestUtil.randomUnicodeString(random);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
                    right.tokenStream("foo", newStringReader(s)));
@@ -158,7 +158,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
 
   public void testLetterUnicodeHuge() throws Exception {
     Random random = random();
-    int maxLength = 8192; // CharTokenizer.IO_BUFFER_SIZE*2
+    int maxLength = 4300; // CharTokenizer.IO_BUFFER_SIZE + fudge
     MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
     left.setMaxTokenLength(255); // match CharTokenizer's max token length
     Analyzer right = new Analyzer() {
@@ -168,7 +168,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomUnicodeString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),

@@ -127,6 +127,6 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new KeywordAnalyzer(), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new KeywordAnalyzer(), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -44,8 +44,7 @@ import java.util.regex.Pattern;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.EmptyTokenizer;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
@@ -101,7 +100,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
 
   static List<Constructor<? extends Tokenizer>> tokenizers;
   static List<Constructor<? extends TokenFilter>> tokenfilters;
-  static List<Constructor<? extends CharStream>> charfilters;
+  static List<Constructor<? extends CharFilter>> charfilters;
 
   // TODO: fix those and remove
   private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
@@ -170,7 +169,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
     tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
     tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
-    charfilters = new ArrayList<Constructor<? extends CharStream>>();
+    charfilters = new ArrayList<Constructor<? extends CharFilter>>();
     for (final Class<?> c : analysisClasses) {
       final int modifiers = c.getModifiers();
       if (
@@ -179,7 +178,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
         || brokenComponents.contains(c)
         || c.isAnnotationPresent(Deprecated.class)
-        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
+        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
       ) {
         continue;
       }
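
TestRandomChains discovers every analysis component on the classpath reflectively; the hunks above just swap the third component family from CharStream to CharFilter. The discovery idiom looks roughly like this (a sketch using plain reflection; analysisClasses stands in for the class list the test builds with its own getClassesForPackage scanner):

    import java.lang.reflect.Constructor;
    import java.lang.reflect.Modifier;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.CharFilter;

    List<Constructor<? extends CharFilter>> charfilters =
        new ArrayList<Constructor<? extends CharFilter>>();
    for (Class<?> c : analysisClasses) {
      int mod = c.getModifiers();
      if (Modifier.isAbstract(mod) || !Modifier.isPublic(mod)
          || c.isAnnotationPresent(Deprecated.class)
          || !CharFilter.class.isAssignableFrom(c)) {
        continue; // not an instantiable, supported char filter
      }
      for (Constructor<?> ctor : c.getConstructors()) {
        @SuppressWarnings("unchecked")
        Constructor<? extends CharFilter> cast =
            (Constructor<? extends CharFilter>) ctor;
        charfilters.add(cast);
      }
    }
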
@@ -197,10 +196,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         assertTrue(ctor.toGenericString() + " has unsupported parameter types",
           allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
         tokenfilters.add(castConstructor(TokenFilter.class, ctor));
-      } else if (CharStream.class.isAssignableFrom(c)) {
+      } else if (CharFilter.class.isAssignableFrom(c)) {
         assertTrue(ctor.toGenericString() + " has unsupported parameter types",
           allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
-        charfilters.add(castConstructor(CharStream.class, ctor));
+        charfilters.add(castConstructor(CharFilter.class, ctor));
       } else {
         fail("Cannot get here");
       }
@@ -224,7 +223,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   }
 
   @AfterClass
-  public static void afterClass() throws Exception {
+  public static void afterClass() {
     tokenizers = null;
     tokenfilters = null;
     charfilters = null;
@@ -524,7 +523,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
     allowedCharFilterArgs.addAll(argProducers.keySet());
     allowedCharFilterArgs.add(Reader.class);
-    allowedCharFilterArgs.add(CharStream.class);
   }
 
   @SuppressWarnings("unchecked")
@@ -560,8 +558,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       Class<?> paramType = paramTypes[i];
       if (paramType == Reader.class) {
         args[i] = reader;
-      } else if (paramType == CharStream.class) {
-        args[i] = CharReader.get(reader);
       } else {
         args[i] = newRandomArg(random, paramType);
       }
@@ -701,7 +697,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       int numFilters = random.nextInt(3);
       for (int i = 0; i < numFilters; i++) {
         while (true) {
-          final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
+          final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
           final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
           reader = createComponent(ctor, args, descr);
           if (reader != null) {
@@ -760,24 +756,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     }
   }
 
-  // wants charfilter to be a filterreader...
-  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharStream {
+  // do *NOT*, do *NOT* refactor me to be a charfilter: LUCENE-3990
+  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
     boolean readSomething;
-    CharStream in;
 
     CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
-      this.in = CharReader.get(in);
+      super(in);
     }
 
     @Override
-    public int correctOffset(int currentOff) {
-      return in.correctOffset(currentOff);
-    }
-
-    @Override
-    public void close() throws IOException {
-      in.close();
+    public int correct(int currentOff) {
+      return currentOff; // we don't change any offsets
     }
 
     @Override
@@ -798,32 +786,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return in.read(target);
     }
 
-    @Override
-    public void mark(int readAheadLimit) throws IOException {
-      in.mark(readAheadLimit);
-    }
-
-    @Override
-    public boolean markSupported() {
-      return in.markSupported();
-    }
-
-    @Override
-    public int read(char[] cbuf) throws IOException {
-      readSomething = true;
-      return in.read(cbuf);
-    }
-
-    @Override
-    public boolean ready() throws IOException {
-      return in.ready();
-    }
-
-    @Override
-    public void reset() throws IOException {
-      in.reset();
-    }
-
     @Override
     public long skip(long n) throws IOException {
       readSomething = true;

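
The wrapper keeps its readSomething bookkeeping in the read and skip overrides, but delegating overrides such as mark, ready, and reset are dropped since the CharFilter base class (itself a Reader) now supplies them; offset correction collapses to the identity because the wrapper never alters characters. In the refactored base class, the public correctOffset appears to be driven by the correct(int) hook, so correcting an offset through a chain of non-modifying filters would look like this (a hedged sketch reusing the PassThroughCharFilter from the earlier note; the chaining plumbing is an assumption, only the correct(int) hook is from the diff):

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharFilter;

    CharFilter inner = new PassThroughCharFilter(new StringReader("abc"));
    CharFilter outer = new PassThroughCharFilter(inner);
    int orig = outer.correctOffset(2); // identity at every level: still 2
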
@@ -233,13 +233,13 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
 
   // Adds random graph after:
@@ -254,6 +254,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenStream);
       }
     },
-    200*RANDOM_MULTIPLIER, 8192);
+    100*RANDOM_MULTIPLIER, 8192);
   }
 }

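
The second hunk belongs to a test that inserts a random graph of tokens after the StandardTokenizer, stressing consumers that must cope with position-increment/length graphs; only the iteration budget changed. A sketch of that wiring, assumed inside a BaseTokenStreamTestCase subclass (MockGraphTokenFilter is the test framework's graph randomizer; the constructor shape is inferred from these sources):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.MockGraphTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        // splice a random token graph into the chain after the tokenizer
        TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
        return new TokenStreamComponents(tokenizer, tokenStream);
      }
    };
    checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER, 8192);
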
@@ -252,6 +252,6 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -455,12 +455,12 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
   }
 }

@@ -52,6 +52,6 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -51,6 +51,6 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -61,6 +61,6 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,9 +49,22 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

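
This and the parallel German and Finnish hunks below add the same regression: a term placed in an exclusion CharArraySet is run through KeywordMarkerFilter before the stemmer, and checkOneTerm verifies it comes out unstemmed. KeywordMarkerFilter works by setting KeywordAttribute on matching tokens, and keyword-aware stemmers skip tokens carrying it. The same chain outside the mock test fixture might look like this (a sketch: WhitespaceTokenizer substituted for MockTokenizer, and an enclosing test class providing TEST_VERSION_CURRENT and asSet is assumed):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.de.GermanLightStemFilter;
    import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
    import org.apache.lucene.analysis.util.CharArraySet;

    final CharArraySet protectedTerms =
        new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        // mark protected terms so the keyword-aware stemmer leaves them alone
        TokenStream sink = new KeywordMarkerFilter(source, protectedTerms);
        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
      }
    };
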
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "äpfel", "apfel");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
@@ -60,7 +76,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -64,7 +64,7 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -23,9 +23,13 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -58,9 +62,22 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(analyzer, "", new String[] { "" });
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -66,6 +66,6 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -55,6 +55,6 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -54,7 +54,7 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -42,7 +42,7 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   /**

@@ -63,7 +63,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -51,6 +51,6 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -48,7 +48,7 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -51,6 +51,6 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BasqueAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BasqueAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -224,6 +224,6 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new PersianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new PersianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -51,6 +51,6 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new FinnishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new FinnishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
 }

@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,9 +49,22 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {

@@ -164,7 +164,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new FrenchAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** test accent-insensitive */

Some files were not shown because too many files have changed in this diff.