mirror of https://github.com/apache/lucene.git
LUCENE-3892: merge in trunk changes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1363400 13f79535-47bb-0310-9956-ffa450edef68
commit cf36fb9a58
build.xml
@@ -65,7 +65,7 @@
     </subant></sequential>
   </target>

-  <target name="resolve" description="Resolves all dependencies">
+  <target name="resolve" depends="clean-jars" description="Resolves all dependencies">
     <sequential><subant target="resolve" inheritall="false" failonerror="true">
       <fileset dir="lucene" includes="build.xml" />
       <fileset dir="solr" includes="build.xml" />
@@ -116,7 +116,7 @@
     </sequential>
   </target>

-  <target name="eclipse" description="Setup Eclipse configuration" depends="resolve">
+  <target name="eclipse" depends="clean-jars, resolve" description="Setup Eclipse configuration">
     <copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false"/>
     <copy file="dev-tools/eclipse/dot.classpath" tofile=".classpath" overwrite="true"/>
     <mkdir dir=".settings"/>
@@ -129,7 +129,7 @@
     </echo>
   </target>

-  <target name="idea" description="Setup IntelliJ IDEA configuration" depends="resolve">
+  <target name="idea" depends="clean-jars, resolve" description="Setup IntelliJ IDEA configuration">
     <copy todir=".">
       <fileset dir="dev-tools/idea"/>
     </copy>
@@ -138,6 +138,7 @@
     File | Project Structure | Project | Project SDK.
     </echo>
   </target>
+
   <target name="clean-idea"
           description="Removes all IntelliJ IDEA configuration files">
     <delete dir=".idea" failonerror="true"/>
@@ -148,7 +149,7 @@
     </delete>
   </target>

-  <target name="clean" description="Clean Lucene and Solr">
+  <target name="clean" depends="clean-jars" description="Clean Lucene and Solr">
     <delete dir="dist" />
     <sequential>
       <subant target="clean" inheritall="false" failonerror="true">
@@ -175,7 +176,7 @@
     </subant>
   </target>

-  <target name="jar-checksums" description="Recompute SHA1 checksums for all JAR files.">
+  <target name="jar-checksums" depends="resolve" description="Recompute SHA1 checksums for all JAR files.">
     <delete>
       <fileset dir="${basedir}">
         <include name="**/*.jar.sha1"/>

@@ -97,12 +97,14 @@
   <classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
   <classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
   <classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
-  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
+  <classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
   <classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
   <classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
+  <classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
   <classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
+  <classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpcore-4.1.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/httpmime-4.1.3.jar"/>
@@ -115,7 +117,7 @@
   <classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.4.jar"/>
   <classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
-  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
+  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.5.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-continuation-8.1.2.v20120308.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-deploy-8.1.2.v20120308.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-http-8.1.2.v20120308.jar"/>
@@ -170,6 +172,6 @@
   <classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-beanutils-1.7.0.jar"/>
   <classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-collections-3.2.1.jar"/>
   <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-  <classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar"/>
+  <classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar"/>
   <classpathentry kind="output" path="bin"/>
 </classpath>

@@ -2,7 +2,7 @@
   <library name="JUnit">
     <CLASSES>
       <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/junit-4.10.jar!/" />
-      <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.5.0.jar!/" />
+      <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar!/" />
     </CLASSES>
     <JAVADOC />
     <SOURCES />

@@ -0,0 +1,10 @@
+<component name="libraryTable">
+  <library name="Lucene tools library">
+    <CLASSES>
+      <root url="file://$PROJECT_DIR$/lucene/tools/lib" />
+    </CLASSES>
+    <JAVADOC />
+    <SOURCES />
+    <jarDirectory url="file://$PROJECT_DIR$/lucene/tools/lib" recursive="false" />
+  </library>
+</component>

@@ -16,6 +16,11 @@
         <option name="USE_RELATIVE_INDENTS" value="false" />
       </value>
     </option>
+    <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
+    <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="20" />
+    <option name="PACKAGES_TO_USE_IMPORT_ON_DEMAND">
+      <value />
+    </option>
     <ADDITIONAL_INDENT_OPTIONS fileType="groovy">
       <option name="INDENT_SIZE" value="2" />
       <option name="CONTINUATION_INDENT_SIZE" value="4" />

@@ -17,6 +17,7 @@
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+    <orderEntry type="library" name="Lucene tools library" level="project" />
     <orderEntry type="library" name="Ant" level="project"/>
   </component>
 </module>

@@ -89,6 +89,11 @@
       <groupId>com.ibm.icu</groupId>
       <artifactId>icu4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>net.sourceforge.nekohtml</groupId>
+      <artifactId>nekohtml</artifactId>
+      <version>1.9.15</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>

@@ -155,6 +155,11 @@
       <artifactId>commons-codec</artifactId>
       <version>1.6</version>
     </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.2</version>
+    </dependency>
     <dependency>
       <groupId>commons-digester</groupId>
       <artifactId>commons-digester</artifactId>
@@ -293,7 +298,7 @@
     <dependency>
       <groupId>org.apache.zookeeper</groupId>
       <artifactId>zookeeper</artifactId>
-      <version>3.3.4</version>
+      <version>3.3.5</version>
     </dependency>
     <dependency>
       <groupId>org.carrot2</groupId>
@@ -303,7 +308,7 @@
     <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>morfologik-polish</artifactId>
-      <version>1.5.2</version>
+      <version>1.5.3</version>
     </dependency>
     <dependency>
       <groupId>org.codehaus.woodstox</groupId>
@@ -383,7 +388,7 @@
     <dependency>
       <groupId>com.carrotsearch.randomizedtesting</groupId>
       <artifactId>randomizedtesting-runner</artifactId>
-      <version>1.5.0</version>
+      <version>1.6.0</version>
     </dependency>
   </dependencies>
 </dependencyManagement>

@@ -138,6 +138,10 @@
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+    </dependency>
     <dependency>
       <groupId>commons-fileupload</groupId>
       <artifactId>commons-fileupload</artifactId>

@@ -17,8 +17,8 @@ import traceback
 import os
 import sys
 import re
-from HTMLParser import HTMLParser, HTMLParseError
-import urlparse
+from html.parser import HTMLParser, HTMLParseError
+import urllib.parse as urlparse

 reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
 reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
@@ -57,7 +57,7 @@ class FindHyperlinks(HTMLParser):
           pass
         else:
           self.printFile()
-          print ' WARNING: anchor "%s" appears more than once' % name
+          print(' WARNING: anchor "%s" appears more than once' % name)
       else:
         self.anchors.add(name)
     elif href is not None:
@@ -73,8 +73,8 @@ class FindHyperlinks(HTMLParser):

   def printFile(self):
     if not self.printed:
-      print
-      print ' ' + self.baseURL
+      print()
+      print(' ' + self.baseURL)
       self.printed = True

 def parse(baseURL, html):
@@ -85,8 +85,8 @@ def parse(baseURL, html):
     parser.close()
   except HTMLParseError:
     parser.printFile()
-    print ' WARNING: failed to parse:'
-    traceback.print_exc()
+    print(' WARNING: failed to parse %s:' % baseURL)
+    traceback.print_exc(file=sys.stdout)
     failures = True
     return [], []

@@ -104,8 +104,8 @@ def checkAll(dirName):
   global failures

   # Find/parse all HTML files first
-  print
-  print 'Crawl/parse...'
+  print()
+  print('Crawl/parse...')
   allFiles = {}

   if os.path.isfile(dirName):
@@ -128,11 +128,11 @@ def checkAll(dirName):
         # deprecated-list.html can fail to escape generics types
         fullPath = os.path.join(root, f)
         #print ' %s' % fullPath
-        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
+        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f), encoding='UTF-8').read())

   # ... then verify:
-  print
-  print 'Verify...'
+  print()
+  print('Verify...')
   for fullPath, (links, anchors) in allFiles.items():
     #print fullPath
     printed = False
@@ -176,16 +176,16 @@ def checkAll(dirName):
            and os.path.basename(fullPath) != 'Changes.html':
           if not printed:
             printed = True
-            print
-            print fullPath
-          print ' BAD EXTERNAL LINK: %s' % link
+            print()
+            print(fullPath)
+          print(' BAD EXTERNAL LINK: %s' % link)
       elif link.startswith('mailto:'):
         if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1:
           if not printed:
             printed = True
-            print
-            print fullPath
-          print ' BROKEN MAILTO (?): %s' % link
+            print()
+            print(fullPath)
+          print(' BROKEN MAILTO (?): %s' % link)
       elif link.startswith('javascript:'):
         # ok...?
         pass
@@ -200,15 +200,15 @@ def checkAll(dirName):
         if not os.path.exists(link):
           if not printed:
             printed = True
-            print
-            print fullPath
-          print ' BROKEN LINK: %s' % link
+            print()
+            print(fullPath)
+          print(' BROKEN LINK: %s' % link)
         elif anchor is not None and anchor not in allFiles[link][1]:
           if not printed:
             printed = True
-            print
-            print fullPath
-          print ' BROKEN ANCHOR: %s' % origLink
+            print()
+            print(fullPath)
+          print(' BROKEN ANCHOR: %s' % origLink)

   failures = failures or printed

@@ -216,8 +216,8 @@ def checkAll(dirName):

 if __name__ == '__main__':
   if checkAll(sys.argv[1]):
-    print
-    print 'Broken javadocs links were found!'
+    print()
+    print('Broken javadocs links were found!')
     sys.exit(1)
   sys.exit(0)

@@ -210,16 +210,6 @@ def checkSigs(project, urlString, version, tmpDir, isSigned):
   if keysURL is None:
     raise RuntimeError('%s is missing KEYS' % project)

-  if not os.path.exists('%s/apache-rat-0.8.jar' % tmpDir):
-    print ' downloading Apache RAT...'
-    download('apache-rat-incubating-0.8-bin.tar.bz2',
-             'http://archive.apache.org/dist/incubator/rat/binaries/apache-rat-incubating-0.8-bin.tar.bz2',
-             tmpDir)
-    t = tarfile.open('%s/apache-rat-incubating-0.8-bin.tar.bz2' % tmpDir)
-    t.extract('apache-rat-0.8/apache-rat-0.8.jar', '%s/apache-rat-0.8.jar' % tmpDir)
-  else:
-    print ' apache RAT already downloaded...'
-
   print ' get KEYS'
   download('%s.KEYS' % project, keysURL, tmpDir)

@@ -480,9 +470,6 @@ def verifyUnpacked(project, artifact, unpackPath, version, tmpDir):
   print ' run "ant validate"'
   run('%s; ant validate' % javaExe('1.7'), '%s/validate.log' % unpackPath)

-  print ' run "ant rat-sources"'
-  run('%s; ant -lib "%s/apache-rat-0.8.jar/apache-rat-0.8" rat-sources' % (javaExe('1.7'), tmpDir), '%s/rat-sources.log' % unpackPath)
-
   if project == 'lucene':
     print ' run tests w/ Java 6...'
     run('%s; ant test' % javaExe('1.6'), '%s/test.log' % unpackPath)

@@ -7,6 +7,120 @@ http://s.apache.org/luceneversions
 ======================= Lucene 5.0.0 =======================


+======================= Lucene 4.0.0-BETA =======================
+
+New features
+
+* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
+  iteration marks. (Robert Muir, Christian Moen)
+
+* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
+  create automata from a fixed collection of UTF-8 encoded BytesRef
+  (Dawid Weiss, Robert Muir)
+
+* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
+  respect field boundaries in the case of highlighting for multivalued fields.
+  (Martijn van Groningen)
+
+API Changes
+
+* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
+  The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
+  has a different API (carries a list of tags instead of a compound tag). Upgrade
+  of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
+
+* LUCENE-4178: set 'tokenized' to true on FieldType by default, so that if you
+  make a custom FieldType and set indexed = true, its analyzed by the analyzer.
+  (Robert Muir)
+
+* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
+  module and replaced by NekoHTML. HTMLParser interface was cleaned up while
+  changing method signatures. (Uwe Schindler, Robert Muir)
+
+* LUCENE-2191: Rename Tokenizer.reset(Reader) to Tokenizer.setReader(Reader).
+  The purpose of this method was always to set a new Reader on the Tokenizer,
+  reusing the object. But the name was often confused with TokenStream.reset().
+  (Robert Muir)
+
+* LUCENE-4228: Refactored CharFilter to extend java.io.FilterReader. CharFilters
+  filter another reader and you override correct() for offset correction.
+  (Robert Muir)
+
+Optimizations
+
+* LUCENE-4171: Performance improvements to Packed64.
+  (Toke Eskildsen via Adrien Grand)
+
+* LUCENE-4184: Performance improvements to the aligned packed bits impl.
+  (Toke Eskildsen, Adrien Grand)
+
+* LUCENE-4235: Remove enforcing of Filter rewrite for NRQ queries.
+  (Uwe Schindler)
+
+Bug Fixes
+
+* LUCENE-4176: Fix AnalyzingQueryParser to analyze range endpoints as bytes,
+  so that it works correctly with Analyzers that produce binary non-UTF-8 terms
+  such as CollationAnalyzer. (Nattapong Sirilappanich via Robert Muir)
+
+* LUCENE-4209: Fix FSTCompletionLookup to close its sorter, so that it won't
+  leave temp files behind in /tmp. Fix SortedTermFreqIteratorWrapper to not
+  leave temp files behind in /tmp on Windows. Fix Sort to not leave
+  temp files behind when /tmp is a separate volume. (Uwe Schindler, Robert Muir)
+
+* LUCENE-4221: Fix overeager CheckIndex validation for term vector offsets.
+  (Robert Muir)
+
+* LUCENE-4222: TieredMergePolicy.getFloorSegmentMB was returning the
+  size in bytes not MB (Chris Fuller via Mike McCandless)
+
+* LUCENE-3505: Fix bug (Lucene 4.0alpha only) where boolean conjunctions
+  were sometimes scored incorrectly. Conjunctions of only termqueries where
+  at least one term omitted term frequencies (IndexOptions.DOCS_ONLY) would
+  be scored as if all terms omitted term frequencies. (Robert Muir)
+
+* LUCENE-2686, LUCENE-3505: Fixed BooleanQuery scorers to return correct
+  freq(). Added support for scorer navigation API (Scorer.getChildren) to
+  all queries. Made Scorer.freq() abstract.
+  (Koji Sekiguchi, Mike McCandless, Robert Muir)
+
+Build
+
+* LUCENE-4094: Support overriding file.encoding on forked test JVMs
+  (force via -Drandomized.file.encoding=XXX). (Dawid Weiss)
+
+* LUCENE-4189: Test output should include timestamps (start/end for each
+  test/ suite). Added -Dtests.timestamps=[off by default]. (Dawid Weiss)
+
+* LUCENE-4110: Report long periods of forked jvm inactivity (hung tests/ suites).
+  Added -Dtests.heartbeat=[seconds] with the default of 60 seconds.
+  (Dawid Weiss)
+
+* LUCENE-4160: Added a property to quit the tests after a given
+  number of failures has occurred. This is useful in combination
+  with -Dtests.iters=N (you can start N iterations and wait for M
+  failures, in particular M = 1). -Dtests.maxfailures=M. Alternatively,
+  specify -Dtests.failfast=true to skip all tests after the first failure.
+  (Dawid Weiss)
+
+* LUCENE-4115: JAR resolution/ cleanup should be done automatically for ant
+  clean/ eclipse/ resolve (Dawid Weiss)
+
+* LUCENE-4199, LUCENE-4202, LUCENE-4206: Add a new target "check-forbidden-apis"
+  that parses all generated .class files for use of APIs that use default
+  charset, default locale, or default timezone and fail build if violations
+  found. This ensures, that Lucene / Solr is independent on local configuration
+  options. (Uwe Schindler, Robert Muir, Dawid Weiss)
+
+* LUCENE-4217: Add the possibility to run tests with Atlassian Clover
+  loaded from IVY. A development License solely for Apache code was added in
+  the tools/ folder, but is not included in releases. (Uwe Schindler)
+
+Documentation
+
+* LUCENE-4195: Added package documentation and examples for
+  org.apache.lucene.codecs (Alan Woodward via Robert Muir)
+
 ======================= Lucene 4.0.0-ALPHA =======================

 More information about this release, including any errata related to the
@@ -20,7 +134,7 @@ Changes in backwards compatibility policy

 * LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing:

-  - On upgrading to 3.1, if you do not fully reindex your documents,
+  - On upgrading to 4.0, if you do not fully reindex your documents,
     Lucene will emulate the new flex API on top of the old index,
     incurring some performance cost (up to ~10% slowdown, typically).
     To prevent this slowdown, use oal.index.IndexUpgrader
@@ -29,7 +143,7 @@ Changes in backwards compatibility policy
     Mixed flex/pre-flex indexes are perfectly fine -- the two
     emulation layers (flex API on pre-flex index, and pre-flex API on
     flex index) will remap the access as required. So on upgrading to
-    3.1 you can start indexing new documents into an existing index.
+    4.0 you can start indexing new documents into an existing index.
     To get optimal performance, use oal.index.IndexUpgrader
     to upgrade your indexes to latest file format (LUCENE-3082).

@@ -283,6 +397,11 @@ Changes in backwards compatibility policy
   removed, as IndexReaderContext.leaves() is now the preferred way
   to access sub-readers. (Uwe Schindler)

+* LUCENE-4155: oal.util.ReaderUtil, TwoPhaseCommit, TwoPhaseCommitTool
+  classes were moved to oal.index package. oal.util.CodecUtil class was moved
+  to oal.codecs package. oal.util.DummyConcurrentLock was removed
+  (no longer used in Lucene 4.0). (Uwe Schindler)
+
 Changes in Runtime Behavior

 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
@@ -989,6 +1108,11 @@ Optimizations
 * LUCENE-4156: DirectoryTaxonomyWriter.getSize is no longer synchronized.
   (Shai Erera, Sivan Yogev)

+* LUCENE-4163: Improve concurrency of MMapIndexInput.clone() by using
+  the new WeakIdentityMap on top of a ConcurrentHashMap to manage
+  the cloned instances. WeakIdentityMap was extended to support
+  iterating over its keys. (Uwe Schindler)
+
 Bug fixes

 * LUCENE-2803: The FieldCache can miss values if an entry for a reader
@@ -1062,6 +1186,13 @@ Bug fixes
 * LUCENE-4114: Fix int overflow bugs in BYTES_FIXED_STRAIGHT and
   BYTES_FIXED_DEREF doc values implementations (Walt Elder via Mike McCandless).

+* LUCENE-4147: Fixed thread safety issues when rollback() and commit()
+  are called simultaneously. (Simon Willnauer, Mike McCandless)
+
+* LUCENE-4165: Removed closing of the Reader used to read the affix file in
+  HunspellDictionary. Consumers are now responsible for closing all InputStreams
+  once the Dictionary has been instantiated. (Torsten Krah, Uwe Schindler, Chris Male)
+
 Documentation

 * LUCENE-3958: Javadocs corrections for IndexWriter.

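The LUCENE-3832 entry above describes BasicAutomata.makeStringUnion. A minimal sketch of how a caller might use it, assuming the 4.0-era org.apache.lucene.util.automaton API; the sorted-input requirement and getNumberOfStates() are assumptions, not part of this commit:

import java.util.Arrays;
import java.util.Collection;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;

public class StringUnionSketch {
  public static void main(String[] args) {
    // Input is assumed sorted; makeStringUnion builds one minimal automaton
    // accepting exactly these terms, instead of unioning per-term automata.
    Collection<BytesRef> terms = Arrays.asList(
        new BytesRef("bar"), new BytesRef("baz"), new BytesRef("foo"));
    Automaton a = BasicAutomata.makeStringUnion(terms);
    System.out.println("states: " + a.getNumberOfStates());
  }
}
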
@@ -145,7 +145,7 @@ enumeration APIs. Here are the major changes:
   oal.util.ReaderUtil) and then step through those readers yourself,
   if you can (this is how Lucene drives searches).

-  If you pass a SegmentReader to MultiFields.fiels it will simply
+  If you pass a SegmentReader to MultiFields.fields it will simply
   return reader.fields(), so there is no performance hit in that
   case.

@@ -334,7 +334,7 @@ based on document IDs, albeit the per-segment orientation.

 There are still valid use-cases where top-level readers ie. "atomic
 views" on the index are desirable. Let say you want to iterate all terms
-of a complete index for auto-completion or facetting, Lucene provides
+of a complete index for auto-completion or faceting, Lucene provides
 utility wrappers like SlowCompositeReaderWrapper (LUCENE-2597) emulating
 an AtomicReader. Note: using "atomicity emulators" can cause serious
 slowdowns due to the need to merge terms, postings, DocValues, and
@@ -574,7 +574,7 @@ you can now do this:
 Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum
 is both consumer and producer of attributes: MTQ.BoostAttribute is
 added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it.
-The other way round MTQ.TopTermsBooleanQueryRewrite supplys a
+The other way round MTQ.TopTermsBooleanQueryRewrite supplies a
 global AttributeSource to each segments TermsEnum. The TermsEnum is consumer
 and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute).

@@ -594,7 +594,7 @@ you can now do this:
 * LUCENE-1076: TieredMergePolicy is now the default merge policy.
   It's able to merge non-contiguous segments; this may cause problems
   for applications that rely on Lucene's internal document ID
-  assigment. If so, you should instead use LogByteSize/DocMergePolicy
+  assignment. If so, you should instead use LogByteSize/DocMergePolicy
   during indexing.

 * LUCENE-3722: Similarity methods and collection/term statistics now take

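The per-segment advice the migration text gives (reader.fields() versus the MultiFields "atomic view") is easiest to see in code. A sketch under assumed 4.0-era APIs; leaves(), fields() and MultiFields match the text above, everything else is illustrative:

import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;

public class FieldsAccessSketch {
  // Preferred: step through the per-segment readers yourself.
  static void perSegment(IndexReader reader) throws IOException {
    for (AtomicReaderContext ctx : reader.leaves()) {
      Fields fields = ctx.reader().fields(); // no merging, one segment at a time
      // ... enumerate fields/terms for this segment ...
    }
  }

  // "Atomic view" of the whole index: convenient, but merges terms,
  // postings, etc. on the fly -- the slowdown the guide warns about.
  static Fields wholeIndex(IndexReader reader) throws IOException {
    return MultiFields.getFields(reader);
  }
}
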
@@ -61,50 +61,50 @@
           executable="${python.exe}" failonerror="true" logerror="true">
       <arg value="htmlentity.py"/>
     </exec>
+    <fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
   </target>

   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/wikipedia"
-           nobak="on"/>
+    <run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
   </target>

   <target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std31"
-           nobak="on" />
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
   </target>

   <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std31"
-           nobak="on" />
-    <jflex file="src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex"
-           outdir="src/java/org/apache/lucene/analysis/standard/std34"
-           nobak="on" />
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
   </target>

+  <!-- Remove the inappropriate JFlex-generated constructor -->
+  <macrodef name="run-jflex">
+    <attribute name="dir"/>
+    <attribute name="name"/>
+    <sequential>
+      <jflex file="@{dir}/@{name}.jflex"
+             outdir="@{dir}"
+             nobak="on" />
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                     replace="" flags="sg"/>
+    </sequential>
+  </macrodef>
+
   <target name="clean-jflex">
     <delete>
+      <fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">
+        <containsregexp expression="generated.*by.*JFlex"/>
+      </fileset>
       <fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
         <containsregexp expression="generated.*by.*JFlex"/>
       </fileset>

@@ -1,5 +1,7 @@
 package org.apache.lucene.analysis.br;

+import java.util.Locale;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
@@ -21,6 +23,7 @@ package org.apache.lucene.analysis.br;
  * A stemmer for Brazilian Portuguese words.
  */
 public class BrazilianStemmer {
+  private static final Locale locale = new Locale("pt", "BR");

   /**
    * Changed term
@@ -243,7 +246,7 @@ public class BrazilianStemmer {
       return null ;
     }

-    value = value.toLowerCase() ;
+    value = value.toLowerCase(locale) ;
     for (j=0 ; j < value.length() ; j++) {
       if ((value.charAt(j) == 'á') ||
           (value.charAt(j) == 'â') ||

@@ -17,9 +17,10 @@

 package org.apache.lucene.analysis.charfilter;

-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.util.ArrayUtil;

+import java.io.Reader;
 import java.util.Arrays;

 /**
@@ -34,7 +35,7 @@ public abstract class BaseCharFilter extends CharFilter {
   private int diffs[];
   private int size = 0;

-  public BaseCharFilter(CharStream in) {
+  public BaseCharFilter(Reader in) {
     super(in);
   }

@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/18/12 12:24 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/16/12 4:05 PM */

 package org.apache.lucene.analysis.charfilter;

@@ -20,13 +20,13 @@ package org.apache.lucene.analysis.charfilter;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.util.Version;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 5/18/12 12:24 PM from the specification file
- * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
+ * on 7/16/12 4:05 PM from the specification file
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
  */
 public final class HTMLStripCharFilter extends BaseCharFilter {

@@ -30647,7 +30647,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
   /**
    * @param source
    */
-  public HTMLStripCharFilter(CharStream source) {
+  public HTMLStripCharFilter(Reader source) {
     super(source);
     this.zzReader = source;
   }
@@ -30657,7 +30657,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    * @param escapedTags Tags in this set (both start and end tags)
    * will not be filtered out.
    */
-  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
     super(source);
     this.zzReader = source;
     if (null != escapedTags) {

@@ -1,6 +1,6 @@
 package org.apache.lucene.analysis.charfilter;

-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -18,13 +18,13 @@ package org.apache.lucene.analysis.charfilter;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.util.Version;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -173,7 +173,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
   /**
    * @param source
    */
-  public HTMLStripCharFilter(CharStream source) {
+  public HTMLStripCharFilter(Reader source) {
     super(source);
     this.zzReader = source;
   }
@@ -183,7 +183,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
    * @param escapedTags Tags in this set (both start and end tags)
    * will not be filtered out.
    */
-  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
     super(source);
     this.zzReader = source;
     if (null != escapedTags) {

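Since LUCENE-4228 turns CharFilter into a plain java.io.Reader decorator, the new HTMLStripCharFilter(Reader) constructor above can be wrapped and drained like any other Reader. A minimal sketch (output spacing is approximate):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class StripSketch {
  public static void main(String[] args) throws Exception {
    Reader stripped = new HTMLStripCharFilter(new StringReader("<b>bold</b> text"));
    int c;
    while ((c = stripped.read()) != -1) {
      System.out.print((char) c); // tags removed, text content preserved
    }
    stripped.close();
  }
}
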
@@ -21,8 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Map;

-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter; // javadocs
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.RollingCharBuffer;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
@@ -51,8 +50,8 @@ public class MappingCharFilter extends BaseCharFilter {
   private int replacementPointer;
   private int inputOff;

-  /** Default constructor that takes a {@link CharStream}. */
-  public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
+  /** Default constructor that takes a {@link Reader}. */
+  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
     super(in);
     buffer.reset(in);

@@ -66,15 +65,10 @@ public class MappingCharFilter extends BaseCharFilter {
     }
   }

-  /** Easy-use constructor that takes a {@link Reader}. */
-  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
-    this(normMap, CharReader.get(in));
-  }
-
   @Override
   public void reset() throws IOException {
     super.reset();
-    buffer.reset(input);
+    buffer.reset(in);
     replacement = null;
     inputOff = 0;
   }

@@ -205,7 +205,7 @@ public final class CJKBigramFilter extends TokenFilter {
   /**
    * refills buffers with new data from the current token.
    */
-  private void refill() throws IOException {
+  private void refill() {
     // compact buffers to keep them smallish if they become large
     // just a safety check, but technically we only need the last codepoint
     if (bufferLen > 64) {

@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.compound.hyphenation;

 import java.io.File;
+import java.io.PrintStream;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -463,10 +464,10 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
   }

   @Override
-  public void printStats() {
-    System.out.println("Value space size = "
+  public void printStats(PrintStream out) {
+    out.println("Value space size = "
         + Integer.toString(vspace.length()));
-    super.printStats();
+    super.printStats(out);

   }
 }

@@ -40,7 +40,7 @@ import javax.xml.parsers.SAXParserFactory;
  *
  * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
  */
-public class PatternParser extends DefaultHandler implements PatternConsumer {
+public class PatternParser extends DefaultHandler {

   XMLReader parser;

@@ -64,7 +64,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {

   static final int ELEM_HYPHEN = 4;

-  public PatternParser() throws HyphenationException {
+  public PatternParser() {
     token = new StringBuilder();
     parser = createParser();
     parser.setContentHandler(this);
@@ -74,7 +74,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {

   }

-  public PatternParser(PatternConsumer consumer) throws HyphenationException {
+  public PatternParser(PatternConsumer consumer) {
     this();
     this.consumer = consumer;
   }
@@ -402,25 +402,4 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
     return str.toString();

   } // getLocationString(SAXParseException):String
-
-  // PatternConsumer implementation for testing purposes
-  public void addClass(String c) {
-    System.out.println("class: " + c);
-  }
-
-  public void addException(String w, ArrayList<Object> e) {
-    System.out.println("exception: " + w + " : " + e.toString());
-  }
-
-  public void addPattern(String p, String v) {
-    System.out.println("pattern: " + p + " : " + v);
-  }
-
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      PatternParser pp = new PatternParser();
-      pp.setConsumer(pp);
-      pp.parse(args[0]);
-    }
-  }
 }

@@ -17,6 +17,7 @@

 package org.apache.lucene.analysis.compound.hyphenation;

+import java.io.PrintStream;
 import java.util.Enumeration;
 import java.util.Stack;

@@ -633,11 +634,11 @@ public class TernaryTree implements Cloneable {

   }

-  public void printStats() {
-    System.out.println("Number of keys = " + Integer.toString(length));
-    System.out.println("Node count = " + Integer.toString(freenode));
+  public void printStats(PrintStream out) {
+    out.println("Number of keys = " + Integer.toString(length));
+    out.println("Node count = " + Integer.toString(freenode));
     // System.out.println("Array length = " + Integer.toString(eq.length));
-    System.out.println("Key Array length = " + Integer.toString(kv.length()));
+    out.println("Key Array length = " + Integer.toString(kv.length()));

     /*
      * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
@@ -647,8 +648,8 @@ public class TernaryTree implements Cloneable {
      */

   }

-  public static void main(String[] args) throws Exception {
+  /*
+  public static void main(String[] args) {
     TernaryTree tt = new TernaryTree();
     tt.insert("Carlos", 'C');
     tt.insert("Car", 'r');
@@ -658,7 +659,8 @@ public class TernaryTree implements Cloneable {
     System.out.println((char) tt.find("Car"));
     System.out.println((char) tt.find("Carlos"));
     System.out.println((char) tt.find("alto"));
-    tt.printStats();
+    tt.printStats(System.out);
   }
+  */

 }

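Threading a PrintStream through printStats, instead of hard-coding System.out, makes the report capturable. A hypothetical caller, using only methods visible in this diff (insert and printStats):

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.apache.lucene.analysis.compound.hyphenation.TernaryTree;

public class PrintStatsSketch {
  public static void main(String[] args) throws Exception {
    TernaryTree tree = new TernaryTree();
    tree.insert("Carlos", 'C');
    tree.insert("Car", 'r');
    // Capture the stats in memory instead of printing to the console.
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    tree.printStats(new PrintStream(buf, true, "UTF-8"));
    System.out.print(buf.toString("UTF-8"));
  }
}
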
@@ -94,8 +94,8 @@ public final class KeywordTokenizer extends Tokenizer {
   }

   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     this.done = false;
   }
 }

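The rename mirrors the LUCENE-2191 entry in CHANGES.txt above: setReader(Reader) only swaps the input, while reset() keeps its TokenStream meaning. A sketch of reusing one Tokenizer across two inputs, assuming the 4.0-era analysis-common layout:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SetReaderSketch {
  public static void main(String[] args) throws Exception {
    Tokenizer tok = new KeywordTokenizer(new StringReader("first"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();                                // TokenStream-contract reset
    while (tok.incrementToken()) System.out.println(term);
    tok.end();
    tok.close();

    tok.setReader(new StringReader("second"));  // reuse the instance, not reset(Reader)
    tok.reset();
    while (tok.incrementToken()) System.out.println(term);
    tok.end();
    tok.close();
  }
}
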
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */

-import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;

@@ -122,7 +121,7 @@ public final class StopFilter extends FilteringTokenFilter {
    * Returns the next input Token whose term() is not a stop word.
    */
   @Override
-  protected boolean accept() throws IOException {
+  protected boolean accept() {
     return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
   }

@ -48,7 +48,7 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
|
||||||
* When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
|
* When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected boolean accept() throws IOException {
|
protected boolean accept() {
|
||||||
return useWhiteList == stopTypes.contains(typeAttribute.type());
|
return useWhiteList == stopTypes.contains(typeAttribute.type());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
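Dropping "throws IOException" from accept() suggests the FilteringTokenFilter base class now expects the keep/drop decision to be made purely from in-memory attributes. A sketch of a custom subclass against the new signature (the class is hypothetical and the super-constructor arguments are an assumption for this era of the API):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.FilteringTokenFilter;

    // Hypothetical filter: keep only terms of a minimum length.
    public final class MinLengthFilter extends FilteringTokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final int minLen;

      public MinLengthFilter(boolean enablePositionIncrements, TokenStream in, int minLen) {
        super(enablePositionIncrements, in); // constructor shape assumed
        this.minLen = minLen;
      }

      @Override
      protected boolean accept() { // no longer declares IOException
        return termAtt.length() >= minLen;
      }
    }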
@@ -1,4 +1,7 @@
 package org.apache.lucene.analysis.de;
 
+import java.util.Locale;
+
 // This file is encoded in UTF-8
 
 /*
@@ -38,6 +41,8 @@ public class GermanStemmer
    */
   private int substCount = 0;
 
+  private static final Locale locale = new Locale("de", "DE");
+
   /**
    * Stemms the given term to an unique <tt>discriminator</tt>.
    *
@@ -47,7 +52,7 @@ public class GermanStemmer
   protected String stem( String term )
   {
     // Use lowercase for medium stemming.
-    term = term.toLowerCase();
+    term = term.toLowerCase(locale);
     if ( !isStemmable( term ) )
       return term;
     // Reset the StringBuilder.
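The toLowerCase(locale) change guards against surprising default-locale behavior; plain JDK is enough to see why:

    import java.util.Locale;

    public class LowerCaseLocaleDemo {
      public static void main(String[] args) {
        String term = "TITEL";
        // Fixed locale: the same result on every machine.
        System.out.println(term.toLowerCase(new Locale("de", "DE"))); // titel
        // Turkish rules map 'I' to dotless 'i' -- what toLowerCase() with no
        // argument would silently produce if the JVM default were tr_TR.
        System.out.println(term.toLowerCase(new Locale("tr", "TR"))); // tıtel
      }
    }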
@@ -289,7 +289,7 @@ public class KStemmer {
         entry = new DictEntry(exceptionWords[i], true);
         d.put(exceptionWords[i], entry);
       } else {
-        System.out.println("Warning: Entry [" + exceptionWords[i]
+        throw new RuntimeException("Warning: Entry [" + exceptionWords[i]
             + "] already in dictionary 1");
       }
     }
@@ -299,7 +299,7 @@ public class KStemmer {
         entry = new DictEntry(directConflations[i][1], false);
         d.put(directConflations[i][0], entry);
       } else {
-        System.out.println("Warning: Entry [" + directConflations[i][0]
+        throw new RuntimeException("Warning: Entry [" + directConflations[i][0]
             + "] already in dictionary 2");
       }
     }
@@ -309,7 +309,7 @@ public class KStemmer {
         entry = new DictEntry(countryNationality[i][1], false);
         d.put(countryNationality[i][0], entry);
       } else {
-        System.out.println("Warning: Entry [" + countryNationality[i][0]
+        throw new RuntimeException("Warning: Entry [" + countryNationality[i][0]
             + "] already in dictionary 3");
       }
     }
@@ -323,7 +323,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -333,7 +333,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -343,7 +343,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -353,7 +353,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -363,7 +363,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -373,7 +373,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -383,7 +383,7 @@ public class KStemmer {
       if (!d.containsKey(array[i])) {
         d.put(array[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + array[i]
+        throw new RuntimeException("Warning: Entry [" + array[i]
             + "] already in dictionary 4");
       }
     }
@@ -392,7 +392,7 @@ public class KStemmer {
       if (!d.containsKey(KStemData8.data[i])) {
         d.put(KStemData8.data[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + KStemData8.data[i]
+        throw new RuntimeException("Warning: Entry [" + KStemData8.data[i]
             + "] already in dictionary 4");
       }
     }
@@ -401,7 +401,7 @@ public class KStemmer {
       if (!d.containsKey(supplementDict[i])) {
         d.put(supplementDict[i], defaultEntry);
       } else {
-        System.out.println("Warning: Entry [" + supplementDict[i]
+        throw new RuntimeException("Warning: Entry [" + supplementDict[i]
             + "] already in dictionary 5");
       }
     }
@@ -410,7 +410,7 @@ public class KStemmer {
      if (!d.containsKey(properNouns[i])) {
        d.put(properNouns[i], defaultEntry);
      } else {
-        System.out.println("Warning: Entry [" + properNouns[i]
+        throw new RuntimeException("Warning: Entry [" + properNouns[i]
             + "] already in dictionary 6");
       }
     }
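All the KStemmer hunks above apply one pattern: a duplicate entry in a bundled dictionary is a packaging bug, so initialization now fails fast instead of printing a warning nobody reads. A generic JDK-only sketch of the pattern (names are illustrative, not Lucene API):

    import java.util.HashMap;
    import java.util.Map;

    public class FailFastDictDemo {
      static <V> void putOnce(Map<String, V> d, String key, V value) {
        if (d.containsKey(key)) {
          throw new RuntimeException("Warning: Entry [" + key + "] already in dictionary");
        }
        d.put(key, value);
      }

      public static void main(String[] args) {
        Map<String, Integer> d = new HashMap<String, Integer>();
        putOnce(d, "walk", 1);
        putOnce(d, "walk", 2); // now a hard error at initialization time
      }
    }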
@@ -492,10 +492,9 @@ class PorterStemmer
     return dirty;
   }
 
-  /** Test program for demonstrating the Stemmer. It reads a file and
+  /* Test program for demonstrating the Stemmer. It reads a file and
    * stems each word, writing the result to standard out.
    * Usage: Stemmer file-name
-   */
   public static void main(String[] args) {
     PorterStemmer s = new PorterStemmer();
 
@@ -542,6 +541,6 @@ class PorterStemmer
         System.out.println("error reading " + args[i]);
       }
     }
-  }
+  }*/
 }
 
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -134,6 +133,6 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected Reader initReader(String fieldName, Reader reader) {
-    return new PersianCharFilter(CharReader.get(reader));
+    return new PersianCharFilter(reader);
   }
 }
@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.fa;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.charfilter.CharFilter;
+import org.apache.lucene.analysis.CharFilter;
 
 /**
  * CharFilter that replaces instances of Zero-width non-joiner with an
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.charfilter.CharFilter;
  */
 public class PersianCharFilter extends CharFilter {
 
-  public PersianCharFilter(CharStream in) {
+  public PersianCharFilter(Reader in) {
     super(in);
   }
 
@@ -45,4 +45,9 @@ public class PersianCharFilter extends CharFilter {
     }
     return charsRead;
   }
+
+  @Override
+  protected int correct(int currentOff) {
+    return currentOff; // we don't change the length of the string
+  }
 }
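With CharStream gone from its signature, PersianCharFilter wraps any plain Reader, and the CharReader.get(...) adapter seen in PersianAnalyzer above disappears. A minimal sketch (the sample text and demo class are made up):

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.fa.PersianCharFilter;

    public class PersianCharFilterDemo {
      public static void main(String[] args) throws Exception {
        // Zero-width non-joiners (U+200C) in the input get replaced by spaces.
        Reader in = new PersianCharFilter(new StringReader("می\u200Cخورد"));
        int ch;
        while ((ch = in.read()) != -1) {
          System.out.print((char) ch);
        }
        in.close();
      }
    }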
@@ -66,10 +66,11 @@ public class HunspellDictionary {
 
   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
@@ -80,10 +81,11 @@ public class HunspellDictionary {
 
   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @throws IOException Can be thrown while reading from the InputStreams
@@ -95,10 +97,11 @@ public class HunspellDictionary {
 
   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @throws IOException Can be thrown while reading from the InputStreams
@@ -110,10 +113,11 @@ public class HunspellDictionary {
 
   /**
    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
    *
-   * @param affix InputStream for reading the hunspell affix file
-   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
    * @param version Lucene Version
    * @param ignoreCase If true, dictionary matching will be case insensitive
    * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
@@ -194,7 +198,6 @@ public class HunspellDictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       }
     }
-    reader.close();
   }
 
   /**
@@ -252,7 +255,7 @@ public class HunspellDictionary {
     }
 
     String condition = ruleArgs[4];
-    affix.setCondition(condition, String.format(conditionPattern, condition));
+    affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
     affix.setCrossProduct(crossProduct);
 
     List<HunspellAffix> list = affixes.get(affix.getAppend());
@@ -376,7 +379,7 @@ public class HunspellDictionary {
       Arrays.sort(wordForm.getFlags());
       entry = line.substring(0, flagSep);
       if(ignoreCase) {
-        entry = entry.toLowerCase(Locale.ENGLISH);
+        entry = entry.toLowerCase(Locale.ROOT);
       }
     }
 
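The javadoc edits above shift stream ownership to the caller: HunspellDictionary no longer closes what it is given (and, per the removed reader.close(), no longer half-closes it either). A usage sketch under that contract (file names are placeholders; the three-argument constructor is taken from the javadoc shown above, and the Version constant is assumed):

    import java.io.FileInputStream;
    import java.io.InputStream;
    import org.apache.lucene.analysis.hunspell.HunspellDictionary;
    import org.apache.lucene.util.Version;

    public class HunspellLoadDemo {
      public static void main(String[] args) throws Exception {
        InputStream affix = new FileInputStream("en_US.aff");
        InputStream dic = new FileInputStream("en_US.dic");
        try {
          HunspellDictionary dictionary =
              new HunspellDictionary(affix, dic, Version.LUCENE_50);
          // ... hand the dictionary to a stemmer, etc. ...
        } finally {
          affix.close(); // the caller closes; the constructor won't
          dic.close();
        }
      }
    }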
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -298,13 +299,12 @@ public class HunspellStemmer {
 
   // ================================================= Entry Point ===================================================
 
-  /**
+  /*
    * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
    *
    * @param args Program arguments. Should contain location of affix file and location of dic file
    * @throws IOException Can be thrown while reading from the files
    * @throws ParseException Can be thrown while parsing the files
-   */
   public static void main(String[] args) throws IOException, ParseException {
     boolean ignoreCase = false;
     int offset = 0;
@@ -330,7 +330,7 @@
 
     HunspellStemmer stemmer = new HunspellStemmer(dictionary);
 
-    Scanner scanner = new Scanner(System.in);
+    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
 
     System.out.print("> ");
     while (scanner.hasNextLine()) {
@@ -346,12 +346,10 @@
     }
   }
 
-  /**
    * Prints the results of the stemming of a word
    *
    * @param originalWord Word that has been stemmed
    * @param stems Stems of the word
-   */
   private static void printStemResults(String originalWord, List<Stem> stems) {
     StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
 
@@ -381,13 +379,12 @@
     System.out.println(builder);
   }
 
-  /**
    * Simple utility to check if the given String has any text
    *
    * @param str String to check if it has any text
    * @return {@code true} if the String has text, {@code false} otherwise
-   */
   private static boolean hasText(String str) {
     return str != null && str.length() > 0;
   }
+  */
 }
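The Scanner change makes the previously implicit platform charset explicit. A plain-JDK sketch of the same idiom:

    import java.nio.charset.Charset;
    import java.util.Scanner;

    public class ScannerCharsetDemo {
      public static void main(String[] args) {
        // Same behavior as new Scanner(System.in), but the charset
        // decision is now visible and auditable.
        Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
        System.out.print("> ");
        while (scanner.hasNextLine()) {
          System.out.println(scanner.nextLine());
          System.out.print("> ");
        }
      }
    }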
@@ -19,15 +19,13 @@ package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.TokenStream;
 
-import java.io.IOException;
-
 /**
  * An always exhausted token stream.
  */
 public final class EmptyTokenStream extends TokenStream {
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public final boolean incrementToken() {
     return false;
   }
 
@@ -17,9 +17,6 @@
 
 package org.apache.lucene.analysis.miscellaneous;
 
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -43,7 +40,7 @@ public final class KeepWordFilter extends FilteringTokenFilter {
   }
 
   @Override
-  public boolean accept() throws IOException {
+  public boolean accept() {
     return words.contains(termAtt.buffer(), 0, termAtt.length());
   }
 }
@@ -17,10 +17,7 @@ package org.apache.lucene.analysis.miscellaneous;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
@@ -48,7 +45,7 @@ public final class LengthFilter extends FilteringTokenFilter {
   }
 
   @Override
-  public boolean accept() throws IOException {
+  public boolean accept() {
     final int len = termAtt.length();
     return (len >= min && len <= max);
   }
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.miscellaneous;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@@ -46,7 +44,7 @@ public final class SingleTokenTokenStream extends TokenStream {
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public final boolean incrementToken() {
     if (exhausted) {
       return false;
     } else {
@@ -58,7 +56,7 @@ public final class SingleTokenTokenStream extends TokenStream {
   }
 
   @Override
-  public void reset() throws IOException {
+  public void reset() {
     exhausted = false;
   }
 
@@ -23,7 +23,6 @@ import java.io.StringReader;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 /**
@@ -54,7 +53,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
   private final String replacement;
   private Reader transformedInput;
 
-  public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
+  public PatternReplaceCharFilter(Pattern pattern, String replacement, Reader in) {
     super(in);
     this.pattern = pattern;
     this.replacement = replacement;
@@ -64,15 +63,28 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
   public int read(char[] cbuf, int off, int len) throws IOException {
     // Buffer all input on the first call.
     if (transformedInput == null) {
+      fill();
+    }
+
+    return transformedInput.read(cbuf, off, len);
+  }
+
+  private void fill() throws IOException {
     StringBuilder buffered = new StringBuilder();
     char [] temp = new char [1024];
-    for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
+    for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
       buffered.append(temp, 0, cnt);
     }
     transformedInput = new StringReader(processPattern(buffered).toString());
   }
 
-    return transformedInput.read(cbuf, off, len);
+  @Override
+  public int read() throws IOException {
+    if (transformedInput == null) {
+      fill();
+    }
+
+    return transformedInput.read();
   }
 
   @Override
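After this refactor both read() overloads share one lazy fill() that buffers and rewrites the whole input up front. A usage sketch (package name as in the analysis module; the pattern and input are made up):

    import java.io.Reader;
    import java.io.StringReader;
    import java.util.regex.Pattern;
    import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;

    public class PatternReplaceDemo {
      public static void main(String[] args) throws Exception {
        Reader r = new PatternReplaceCharFilter(
            Pattern.compile("\\d+"), "#", new StringReader("abc123def"));
        int ch;
        while ((ch = r.read()) != -1) { // exercises the new single-char read()
          System.out.print((char) ch);  // prints: abc#def
        }
      }
    }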
@@ -84,7 +84,7 @@ public final class PatternTokenizer extends Tokenizer {
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
+  public boolean incrementToken() {
     if (index >= str.length()) return false;
     clearAttributes();
     if (group >= 0) {
@@ -130,14 +130,14 @@
   }
 
   @Override
-  public void end() throws IOException {
+  public void end() {
     final int ofs = correctOffset(str.length());
     offsetAtt.setOffset(ofs, ofs);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     fillBuffer(str, input);
     matcher.reset(str);
     index = 0;
@@ -132,7 +132,7 @@ public abstract class RSLPStemmerBase {
       super(suffix, min, replacement);
       for (int i = 0; i < exceptions.length; i++) {
         if (!exceptions[i].endsWith(suffix))
-          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+          throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
       }
       this.exceptions = new CharArraySet(Version.LUCENE_50,
           Arrays.asList(exceptions), false);
@@ -156,7 +156,7 @@ public abstract class RSLPStemmerBase {
       super(suffix, min, replacement);
       for (int i = 0; i < exceptions.length; i++) {
         if (!exceptions[i].endsWith(suffix))
-          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+          throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
       }
       this.exceptions = new char[exceptions.length][];
       for (int i = 0; i < exceptions.length; i++)
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.sinks;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.util.Date;
+import java.util.Locale;
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeSource;
@@ -37,10 +38,12 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
   protected CharTermAttribute termAtt;
 
   /**
-   * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
+   * Uses {@link java.text.DateFormat#getDateInstance(int, Locale)
+   * DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as
+   * the {@link java.text.DateFormat} object.
    */
   public DateRecognizerSinkFilter() {
-    this(DateFormat.getDateInstance());
+    this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT));
   }
 
   public DateRecognizerSinkFilter(DateFormat dateFormat) {
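DateFormat.getDateInstance() depends on the JVM's default locale; pinning Locale.ROOT gives the sink filter one stable default everywhere. A JDK-only sketch:

    import java.text.DateFormat;
    import java.util.Date;
    import java.util.Locale;

    public class DateFormatLocaleDemo {
      public static void main(String[] args) {
        DateFormat df = DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT);
        // Locale-neutral rendering, e.g. "2012 Jul 8", regardless of the
        // machine's default locale.
        System.out.println(df.format(new Date()));
      }
    }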
@@ -212,7 +212,7 @@ public final class TeeSinkTokenFilter extends TokenFilter {
     }
 
     @Override
-    public final boolean incrementToken() throws IOException {
+    public final boolean incrementToken() {
       // lazy init the iterator
       if (it == null) {
         it = cachedStates.iterator();
@@ -228,7 +228,7 @@
     }
 
     @Override
-    public final void end() throws IOException {
+    public final void end() {
       if (finalState != null) {
         restoreState(finalState);
       }
@@ -114,9 +114,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
         src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }

@@ -175,8 +175,8 @@ public final class ClassicTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
 
 package org.apache.lucene.analysis.standard;
 
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
-* on 9/30/11 12:10 PM from the specification file
-* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+* on 08.07.12 16:59 from the specification file
+* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
 
@@ -383,15 +383,7 @@ public final void getText(CharTermAttribute t) {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  ClassicTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
 
   /**
   * Unpacks the compressed character translation table.
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
 
-// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
+// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
 
@@ -115,9 +115,9 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }

@@ -183,8 +183,8 @@ public final class StandardTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:10 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
 
 package org.apache.lucene.analysis.standard;
 
-/*
+/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -759,15 +759,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  public StandardTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
 
   /**
   * Unpacks the compressed character translation table.
@@ -104,9 +104,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
-      protected void reset(final Reader reader) throws IOException {
+      protected void setReader(final Reader reader) throws IOException {
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
-        super.reset(reader);
+        super.setReader(reader);
       }
     };
   }

@@ -162,8 +162,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(reader);
   }
 }
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/18/12 12:05 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
 
 package org.apache.lucene.analysis.standard;
 
@@ -3844,15 +3844,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  public UAX29URLEmailTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
 
   /**
   * Unpacks the compressed character translation table.
@@ -1,6 +1,6 @@
 package org.apache.lucene.analysis.standard;
 
-/**
+/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -92,7 +92,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder {
     return analyze(analyzer, text, reuse);
   }
 
-  private void addInternal(CharsRef synset[], int size) throws IOException {
+  private void addInternal(CharsRef synset[], int size) {
     if (size <= 1) {
       return; // nothing to do
     }
@@ -650,7 +650,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
   }
 
   /**
-   * Empty {@link UnmodifiableCharArrayMap} optimized for speed.
+   * Empty {@link org.apache.lucene.analysis.util.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
    * Contains checks will always return <code>false</code> or throw
    * NPE if necessary.
    */
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.util;
 * limitations under the License.
 */
 
-import org.apache.lucene.analysis.CharStream;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharFilter;
 
 /**
- * Abstract parent class for analysis factories that create {@link CharStream}
+ * Abstract parent class for analysis factories that create {@link CharFilter}
 * instances.
 */
 public abstract class CharFilterFactory extends AbstractAnalysisFactory {
 
-  public abstract CharStream create(CharStream input);
+  public abstract CharFilter create(Reader input);
 }
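Factories now produce CharFilters from plain Readers. A sketch of a concrete factory against the new contract (this factory class is hypothetical; PersianCharFilter is the Reader-based filter from earlier in this commit):

    import java.io.Reader;
    import org.apache.lucene.analysis.CharFilter;
    import org.apache.lucene.analysis.fa.PersianCharFilter;
    import org.apache.lucene.analysis.util.CharFilterFactory;

    public class PersianCharFilterFactory extends CharFilterFactory {
      @Override
      public CharFilter create(Reader input) {
        return new PersianCharFilter(input); // was: create(CharStream input)
      }
    }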
@@ -162,8 +162,8 @@ public abstract class CharTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void setReader(Reader input) throws IOException {
+    super.setReader(input);
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;
@@ -325,13 +325,13 @@ public final class WikipediaTokenizer extends Tokenizer {
   }
 
   @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
+  public void setReader(Reader reader) throws IOException {
+    super.setReader(reader);
     scanner.yyreset(input);
   }
 
   @Override
-  public void end() throws IOException {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
     this.offsetAtt.setOffset(finalOffset, finalOffset);
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
 
 package org.apache.lucene.analysis.wikipedia;
 
-/*
+/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
-* on 1/22/12 10:26 PM from the specification file
-* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+* on 08.07.12 17:00 from the specification file
+* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
 */
 class WikipediaTokenizerImpl {
 
@@ -519,15 +519,7 @@ final void reset() {
     this.zzReader = in;
   }
 
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  WikipediaTokenizerImpl(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
 
   /**
   * Unpacks the compressed character translation table.
@@ -435,7 +435,7 @@ public abstract class SnowballProgram {
         bra > ket ||
         ket > limit)
     {
-      System.err.println("faulty slice operation");
+      throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
       // FIXME: report error somehow.
     /*
       fprintf(stderr, "faulty slice operation:\n");
@@ -24,7 +24,7 @@
 For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
 </p>
 <p>
-  This module contains concrete components ({@link org.apache.lucene.analysis.charfilter.CharFilter}s,
+  This module contains concrete components ({@link org.apache.lucene.analysis.CharFilter}s,
   {@link org.apache.lucene.analysis.Tokenizer}s, and ({@link org.apache.lucene.analysis.TokenFilter}s) for
   analyzing different types of content. It also provides a number of {@link org.apache.lucene.analysis.Analyzer}s
   for different languages that you can use to get started quickly.
@@ -96,6 +96,6 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new ArabicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }

@@ -76,6 +76,6 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BulgarianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
@@ -162,7 +162,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BrazilianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
@@ -58,6 +58,6 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
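The test tweaks above all follow one recipe: a cheaper default iteration count (1000) scaled by RANDOM_MULTIPLIER, so nightly builds raise the multiplier instead of every test hard-coding 10000. A sketch mirroring the diff (test framework names as shown there; the class name is made up):

    import org.apache.lucene.analysis.BaseTokenStreamTestCase;
    import org.apache.lucene.analysis.ca.CatalanAnalyzer;

    public class TestCatalanAnalyzerRandom extends BaseTokenStreamTestCase {
      /** blast some random strings through the analyzer */
      public void testRandomStrings() throws Exception {
        checkRandomData(random(), new CatalanAnalyzer(TEST_VERSION_CURRENT),
            1000 * RANDOM_MULTIPLIER);
      }
    }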
@@ -29,7 +29,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
@@ -46,7 +45,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(String fieldName, Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
+        return new HTMLStripCharFilter(reader);
       }
     };
   }
@@ -60,7 +59,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "\nthis is some text\n here is a link and " +
             "another link. " +
             "This is an entity: & plus a <. Here is an &. ";
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     char [] goldArray = gold.toCharArray();
@@ -79,7 +78,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
   //Some sanity checks, but not a full-fledged check
   public void testHTML() throws Exception {
     InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     while ((ch = reader.read()) != -1){
@@ -96,7 +95,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
   public void testMSWord14GeneratedHTML() throws Exception {
     InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8"));
     String gold = "This is a test";
     StringBuilder builder = new StringBuilder();
     int ch = 0;
@@ -117,7 +116,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = "\u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -132,7 +131,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -147,7 +146,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String gold = " <junk/> ! @ and ’";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -161,7 +160,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -346,7 +345,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
     for (int i = 0 ; i < testGold.length ; i += 2) {
       String test = testGold[i];
       String gold = testGold[i + 1];
-      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      Reader reader = new HTMLStripCharFilter(new StringReader(test));
       StringBuilder builder = new StringBuilder();
       int ch = 0;
       while ((ch = reader.read()) != -1){
@@ -370,7 +369,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     testBuilder.append("-->foo");
     String gold = "foo";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
     int ch = 0;
     StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -388,7 +387,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||||
testBuilder.append("?>");
|
testBuilder.append("?>");
|
||||||
gold = "";
|
gold = "";
|
||||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
|
||||||
ch = 0;
|
ch = 0;
|
||||||
builder = new StringBuilder();
|
builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -406,7 +405,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||||
testBuilder.append("/>");
|
testBuilder.append("/>");
|
||||||
gold = "";
|
gold = "";
|
||||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
|
||||||
ch = 0;
|
ch = 0;
|
||||||
builder = new StringBuilder();
|
builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -430,7 +429,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private void processBuffer(String test, String assertMsg) throws IOException {
|
private void processBuffer(String test, String assertMsg) throws IOException {
|
||||||
// System.out.println("-------------------processBuffer----------");
|
// System.out.println("-------------------processBuffer----------");
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -448,7 +447,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
String test = "<!--- three dashes, still a valid comment ---> ";
|
String test = "<!--- three dashes, still a valid comment ---> ";
|
||||||
String gold = " ";
|
String gold = " ";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -464,7 +463,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
|
||||||
public void doTestOffsets(String in) throws Exception {
|
public void doTestOffsets(String in) throws Exception {
|
||||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
int off = 0; // offset in the reader
|
int off = 0; // offset in the reader
|
||||||
int strOff = -1; // offset in the original string
|
int strOff = -1; // offset in the original string
|
||||||
|
@ -491,7 +490,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
static void assertLegalOffsets(String in) throws Exception {
|
static void assertLegalOffsets(String in) throws Exception {
|
||||||
int length = in.length();
|
int length = in.length();
|
||||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
HTMLStripCharFilter reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(in)));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
int off = 0;
|
int off = 0;
|
||||||
while ((ch = reader.read()) != -1) {
|
while ((ch = reader.read()) != -1) {
|
||||||
|
@ -508,12 +507,12 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandom() throws Exception {
|
public void testRandom() throws Exception {
|
||||||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
int numRounds = RANDOM_MULTIPLIER * 1000;
|
||||||
checkRandomData(random(), newTestAnalyzer(), numRounds);
|
checkRandomData(random(), newTestAnalyzer(), numRounds);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandomHugeStrings() throws Exception {
|
public void testRandomHugeStrings() throws Exception {
|
||||||
int numRounds = RANDOM_MULTIPLIER * 200;
|
int numRounds = RANDOM_MULTIPLIER * 100;
|
||||||
checkRandomData(random(), newTestAnalyzer(), numRounds, 8192);
|
checkRandomData(random(), newTestAnalyzer(), numRounds, 8192);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -526,7 +525,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
||||||
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
||||||
String gold = "onetwo";
|
String gold = "onetwo";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -540,7 +539,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
||||||
gold = "one\ntwo";
|
gold = "one\ntwo";
|
||||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
ch = 0;
|
ch = 0;
|
||||||
builder = new StringBuilder();
|
builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -557,7 +556,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testScriptQuotes() throws Exception {
|
public void testScriptQuotes() throws Exception {
|
||||||
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
||||||
String gold = "one\ntwo";
|
String gold = "one\ntwo";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -572,7 +571,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
||||||
gold = "hello\n";
|
gold = "hello\n";
|
||||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
ch = 0;
|
ch = 0;
|
||||||
builder = new StringBuilder();
|
builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -591,7 +590,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
String gold = "one<script no-value-attr></script>two";
|
String gold = "one<script no-value-attr></script>two";
|
||||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
|
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
Reader reader = new HTMLStripCharFilter
|
||||||
(CharReader.get(new StringReader(test)), escapedTags);
|
(new StringReader(test), escapedTags);
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -612,7 +611,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
+ "-->\n"
|
+ "-->\n"
|
||||||
+ "</style>two";
|
+ "</style>two";
|
||||||
String gold = "one\ntwo";
|
String gold = "one\ntwo";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -631,7 +630,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
String gold = "one<style type=\"text/css\"></style>two";
|
String gold = "one<style type=\"text/css\"></style>two";
|
||||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
|
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
Reader reader = new HTMLStripCharFilter
|
||||||
(CharReader.get(new StringReader(test)), escapedTags);
|
(new StringReader(test), escapedTags);
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -656,7 +655,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||||
String test = testGold[i];
|
String test = testGold[i];
|
||||||
String gold = testGold[i + 1];
|
String gold = testGold[i + 1];
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
while ((ch = reader.read()) != -1){
|
while ((ch = reader.read()) != -1){
|
||||||
|
@ -671,7 +670,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
||||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
|
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
|
||||||
Reader reader = new HTMLStripCharFilter
|
Reader reader = new HTMLStripCharFilter
|
||||||
(CharReader.get(new StringReader(test)), escapedTags);
|
(new StringReader(test), escapedTags);
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -688,7 +687,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testInlineTagsNoSpace() throws Exception {
|
public void testInlineTagsNoSpace() throws Exception {
|
||||||
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
||||||
String gold = "onetwo2e.three";
|
String gold = "onetwo2e.three";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -705,7 +704,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testCDATA() throws Exception {
|
public void testCDATA() throws Exception {
|
||||||
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
|
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
|
||||||
String gold = "one<one><two>three<four></four></two></one>two";
|
String gold = "one<one><two>three<four></four></two></one>two";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -720,7 +719,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
|
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
|
||||||
gold = "onetwo<![CDATA[three]]>fourfive";
|
gold = "onetwo<![CDATA[three]]>fourfive";
|
||||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
ch = 0;
|
ch = 0;
|
||||||
builder = new StringBuilder();
|
builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -737,7 +736,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testUppercaseCharacterEntityVariants() throws Exception {
|
public void testUppercaseCharacterEntityVariants() throws Exception {
|
||||||
String test = " "-©>><<®&";
|
String test = " "-©>><<®&";
|
||||||
String gold = " \"-\u00A9>><<\u00AE&";
|
String gold = " \"-\u00A9>><<\u00AE&";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -754,7 +753,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
||||||
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
||||||
String gold = "onetwo";
|
String gold = "onetwo";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -771,7 +770,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testSupplementaryCharsInTags() throws Exception {
|
public void testSupplementaryCharsInTags() throws Exception {
|
||||||
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
||||||
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
||||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
Reader reader = new HTMLStripCharFilter(new StringReader(test));
|
||||||
int ch = 0;
|
int ch = 0;
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -822,7 +821,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reader reader = new HTMLStripCharFilter
|
Reader reader = new HTMLStripCharFilter
|
||||||
(CharReader.get(new StringReader(text.toString())));
|
(new StringReader(text.toString()));
|
||||||
while (reader.read() != -1);
|
while (reader.read() != -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
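All of the HTMLStripCharFilterTest hunks above are the same mechanical migration: HTMLStripCharFilter now accepts a plain java.io.Reader, so the old CharReader.get(...) adapter call is simply dropped. A minimal before/after sketch, assuming Lucene 4.x analyzers-common; the package name and input string are assumptions for illustration, not part of this commit:

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

    public class HtmlStripMigration {
      public static void main(String[] args) throws Exception {
        // Old style: new HTMLStripCharFilter(CharReader.get(new StringReader(html)))
        // New style: CharFilter extends Reader, so the Reader is passed straight in.
        Reader stripped = new HTMLStripCharFilter(new StringReader("one<b>two</b>"));
        StringBuilder out = new StringBuilder();
        int ch;
        while ((ch = stripped.read()) != -1) {
          out.append((char) ch);
        }
        stripped.close();
        System.out.println(out); // markup removed, character offsets still correctable
      }
    }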
@@ -29,8 +29,7 @@ import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -60,7 +59,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 }

 public void testReaderReset() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
 char[] buf = new char[10];
 int len = cs.read(buf, 0, 10);
 assertEquals( 1, len );
@@ -76,55 +75,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 }

 public void testNothingChange() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
 }

 public void test1to1() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
 }

 public void test1to2() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
 }

 public void test1to3() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
 }

 public void test2to4() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
 }

 public void test2to1() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
 }

 public void test3to1() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
 }

 public void test4to2() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
 }

 public void test5to0() throws Exception {
-CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
 }
@@ -149,7 +148,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 //
 public void testTokenStream() throws Exception {
 String testString = "h i j k ll cccc bbb aa";
-CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
+CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts,
 new String[]{"i","i","jj","kkk","llll","cc","b","a"},
@@ -171,8 +170,8 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 // h,8,9 => i,8,9
 public void testChained() throws Exception {
 String testString = "aaaa ll h";
-CharStream cs = new MappingCharFilter( normMap,
-new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
+CharFilter cs = new MappingCharFilter( normMap,
+new MappingCharFilter( normMap, new StringReader( testString ) ) );
 TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
 assertTokenStreamContents(ts,
 new String[]{"a","llllllll","i"},
@@ -193,7 +192,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {

 @Override
 protected Reader initReader(String fieldName, Reader reader) {
-return new MappingCharFilter(normMap, CharReader.get(reader));
+return new MappingCharFilter(normMap, reader);
 }
 };

@@ -219,7 +218,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {

 @Override
 protected Reader initReader(String fieldName, Reader reader) {
-return new MappingCharFilter(map, CharReader.get(reader));
+return new MappingCharFilter(map, reader);
 }
 };

@@ -229,7 +228,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {

 //@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
 public void testRandomMaps() throws Exception {
-int numIterations = atLeast(10);
+int numIterations = atLeast(3);
 for (int i = 0; i < numIterations; i++) {
 final NormalizeCharMap map = randomMap();
 Analyzer analyzer = new Analyzer() {
@@ -241,7 +240,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {

 @Override
 protected Reader initReader(String fieldName, Reader reader) {
-return new MappingCharFilter(map, CharReader.get(reader));
+return new MappingCharFilter(map, reader);
 }
 };
 int numRounds = 100;
@@ -270,7 +269,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {

 public void testRandomMaps2() throws Exception {
 final Random random = random();
-final int numIterations = atLeast(10);
+final int numIterations = atLeast(3);
 for(int iter=0;iter<numIterations;iter++) {

 if (VERBOSE) {
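The TestMappingCharFilter hunks record the type-level half of this change: CharStream is gone, and CharFilter itself extends java.io.Reader, so char filters nest directly without an adapter. A sketch of chained filtering under the new signatures; the mapping rule is illustrative, and the builder-style NormalizeCharMap construction is an assumption about the 4.x API of this period rather than something shown in the diff:

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharFilter;
    import org.apache.lucene.analysis.charfilter.MappingCharFilter;
    import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

    public class ChainedCharFilters {
      public static void main(String[] args) throws Exception {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("h", "i"); // illustrative 1-to-1 mapping, as in test1to1 above
        NormalizeCharMap normMap = builder.build();
        // CharFilter is-a Reader, so one filter wraps another with no CharReader.get:
        CharFilter cs = new MappingCharFilter(normMap,
            new MappingCharFilter(normMap, new StringReader("h")));
        int ch;
        while ((ch = cs.read()) != -1) {
          System.out.print((char) ch); // "h" -> "i"; the outer pass maps nothing further
        }
        cs.close();
      }
    }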
@@ -23,7 +23,6 @@ import java.util.Random;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -216,7 +215,7 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {

 @Override
 protected Reader initReader(String fieldName, Reader reader) {
-return new MappingCharFilter(norm, CharReader.get(reader));
+return new MappingCharFilter(norm, reader);
 }
 };

@@ -272,13 +271,13 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {

 /** blast some random strings through the analyzer */
 public void testRandomStrings() throws Exception {
-checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), new CJKAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
 }

 /** blast some random strings through the analyzer */
 public void testRandomHugeStrings() throws Exception {
 Random random = random();
-checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
 }

 public void testEmptyTerm() throws IOException {
@@ -0,0 +1,67 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
+Analyzer analyzer = new Analyzer() {
+@Override
+protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+return new TokenStreamComponents(t, new CJKBigramFilter(t));
+}
+};
+
+public void testHuge() throws Exception {
+assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
++ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
++ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+new String[] {
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
+}
+);
+}
+
+public void testHanOnly() throws Exception {
+Analyzer a = new Analyzer() {
+@Override
+protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
+}
+};
+assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+}
+}
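TestCJKBigramFilter is new in this change and exercises the two CJKBigramFilter constructors: bigram all CJK runs, or restrict bigramming to selected scripts via bit flags. The flags can also be combined, which the test does not show; a small sketch of that, where the HAN | HIRAGANA combination and the analyzer wiring are illustrative assumptions:

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.cjk.CJKBigramFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    public class HanHiraganaBigramAnalyzer {
      // Bigram Han and Hiragana characters; Katakana and Hangul stay unigrams.
      static Analyzer create(final Version matchVersion) {
        return new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(matchVersion, reader);
            return new TokenStreamComponents(t,
                new CJKBigramFilter(t, CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA));
          }
        };
      }
    }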
@@ -63,7 +63,7 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
 }

 public void testRandomData() throws IOException {
-checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
 }

 public void testEmptyTerm() throws IOException {
@@ -48,7 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 assertTrue(cgf.incrementToken());
 assertEquals("the_s", term.toString());

-wt.reset(new StringReader(input));
+wt.setReader(new StringReader(input));
 cgf.reset();
 assertTrue(cgf.incrementToken());
 assertEquals("How", term.toString());
@@ -66,7 +66,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 assertTrue(nsf.incrementToken());
 assertEquals("the_s", term.toString());

-wt.reset(new StringReader(input));
+wt.setReader(new StringReader(input));
 nsf.reset();
 assertTrue(nsf.incrementToken());
 assertEquals("How_the", term.toString());
@@ -81,7 +81,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 * "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens
 * out)
 *
-* @return Map<String,String>
 */
 public void testCommonGramsQueryFilter() throws Exception {
 Analyzer a = new Analyzer() {
@@ -319,7 +318,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 }
 };

-checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

 Analyzer b = new Analyzer() {

@@ -331,6 +330,6 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 }
 };

-checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
 }
 }
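The wt.reset(new StringReader(...)) to wt.setReader(...) edits here, and the matching ones in the compound-word and TestAnalyzers hunks below, reflect the Tokenizer reuse contract after this merge: new input is installed with setReader(Reader), and the no-argument reset() then reinitializes the stream. A minimal reuse sketch under that contract, using the same MockTokenizer these tests use; the input strings are illustrative:

    import java.io.StringReader;
    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenizerReuse {
      public static void main(String[] args) throws Exception {
        MockTokenizer wt = new MockTokenizer(new StringReader("first run"),
            MockTokenizer.WHITESPACE, false);
        CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
        wt.reset();
        while (wt.incrementToken()) {
          System.out.println(term); // "first", "run"
        }
        // Reuse on new input: install the Reader, then reset (was: wt.reset(reader)).
        wt.setReader(new StringReader("second run"));
        wt.reset();
        while (wt.incrementToken()) {
          System.out.println(term); // "second", "run"
        }
        wt.end();
        wt.close();
      }
    }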
@@ -24,7 +24,6 @@ import java.util.Arrays;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -240,7 +239,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
 assertTrue(tf.incrementToken());
 assertEquals("Rind", termAtt.toString());
-wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
+wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
 tf.reset();
 assertTrue(tf.incrementToken());
 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
@@ -327,7 +326,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {

 @Override
 protected Reader initReader(String fieldName, Reader reader) {
-return new MappingCharFilter(normMap, CharReader.get(reader));
+return new MappingCharFilter(normMap, reader);
 }
 };

@@ -348,7 +347,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
 }
 };
-checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
 final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
@@ -361,7 +360,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 return new TokenStreamComponents(tokenizer, filter);
 }
 };
-checkRandomData(random(), b, 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
 }

 public void testEmptyTerm() throws Exception {
@@ -163,7 +163,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
 filter.reset();
 String highSurEndingUpper = "BogustermBoguster\ud801";
 String highSurEndingLower = "bogustermboguster\ud801";
-tokenizer.reset(new StringReader(highSurEndingUpper));
+tokenizer.setReader(new StringReader(highSurEndingUpper));
 assertTokenStreamContents(filter, new String[] {highSurEndingLower});
 assertTrue(filter.hasAttribute(CharTermAttribute.class));
 char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
@@ -191,17 +191,17 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {

 /** blast some random strings through the analyzer */
 public void testRandomStrings() throws Exception {
-checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
+checkRandomData(random(), new SimpleAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
+checkRandomData(random(), new StopAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
 }

 /** blast some random large strings through the analyzer */
 public void testRandomHugeStrings() throws Exception {
 Random random = random();
-checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
-checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
-checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
+checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
+checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
 }
 }
@@ -1,12 +1,12 @@
 package org.apache.lucene.analysis.core;

-import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.CharBuffer;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockCharFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
@@ -65,10 +65,10 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
 checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
 }

-CharStream wrappedStream = new CharStream() {
+CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {

 @Override
-public void mark(int readAheadLimit) throws IOException {
+public void mark(int readAheadLimit) {
 throw new UnsupportedOperationException("mark(int)");
 }

@@ -78,53 +78,53 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
 }

 @Override
-public int read() throws IOException {
+public int read() {
 throw new UnsupportedOperationException("read()");
 }

 @Override
-public int read(char[] cbuf) throws IOException {
+public int read(char[] cbuf) {
 throw new UnsupportedOperationException("read(char[])");
 }

 @Override
-public int read(CharBuffer target) throws IOException {
+public int read(CharBuffer target) {
 throw new UnsupportedOperationException("read(CharBuffer)");
 }

 @Override
-public boolean ready() throws IOException {
+public boolean ready() {
 throw new UnsupportedOperationException("ready()");
 }

 @Override
-public void reset() throws IOException {
+public void reset() {
 throw new UnsupportedOperationException("reset()");
 }

 @Override
-public long skip(long n) throws IOException {
+public long skip(long n) {
 throw new UnsupportedOperationException("skip(long)");
 }

 @Override
-public int correctOffset(int currentOff) {
-throw new UnsupportedOperationException("correctOffset(int)");
+public int correct(int currentOff) {
+throw new UnsupportedOperationException("correct(int)");
 }

 @Override
-public void close() throws IOException {
+public void close() {
 throw new UnsupportedOperationException("close()");
 }

 @Override
-public int read(char[] arg0, int arg1, int arg2) throws IOException {
+public int read(char[] arg0, int arg1, int arg2) {
 throw new UnsupportedOperationException("read(char[], int, int)");
 }
 };

 public void testWrapping() throws Exception {
-CharStream cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
+CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
 try {
 cs.mark(1);
 fail();
@@ -178,7 +178,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
 cs.correctOffset(1);
 fail();
 } catch (Exception e) {
-assertEquals("correctOffset(int)", e.getMessage());
+assertEquals("correct(int)", e.getMessage());
 }

 try {
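The TestBugInSomething hunks show the offset-correction side of the CharStream removal: custom filters now extend CharFilter and, instead of overriding correctOffset(int), implement the correct(int) hook that correctOffset(int) delegates to; CharFilter also takes the wrapped Reader in its constructor, which is why the anonymous class above gains the CharFilter(new StringReader("bogus")) super-call. A sketch of a pass-through filter under the new contract; the class name and the fixed offset shift are illustrative:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.CharFilter;

    public class ShiftedCharFilter extends CharFilter {
      private final int shift; // pretend this many chars were stripped before the stream

      public ShiftedCharFilter(Reader in, int shift) {
        super(in); // CharFilter stores the wrapped Reader in the protected field "input"
        this.shift = shift;
      }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return input.read(cbuf, off, len); // pass characters through unchanged
      }

      @Override
      protected int correct(int currentOff) {
        // correctOffset(currentOff) on the base class routes through here
        return currentOff + shift;
      }
    }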
@@ -315,12 +315,12 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {

 /** blast some random strings through the analyzer */
 public void testRandomStrings() throws Exception {
-checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
 }

 /** blast some random large strings through the analyzer */
 public void testRandomHugeStrings() throws Exception {
 Random random = random();
-checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
 }
 }
@ -74,7 +74,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
|
||||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for (int i = 0; i < 10000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
String s = _TestUtil.randomSimpleString(random);
|
String s = _TestUtil.randomSimpleString(random);
|
||||||
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
|
assertEquals(s, left.tokenStream("foo", newStringReader(s)),
|
                   right.tokenStream("foo", newStringReader(s)));
@@ -94,7 +94,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomSimpleString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@@ -112,7 +112,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    for (int i = 0; i < 10000; i++) {
+    for (int i = 0; i < 1000; i++) {
       String s = _TestUtil.randomHtmlishString(random, 20);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
                    right.tokenStream("foo", newStringReader(s)));
@@ -121,7 +121,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
 
   public void testLetterHtmlishHuge() throws Exception {
     Random random = random();
-    int maxLength = 2048; // this is number of elements, not chars!
+    int maxLength = 1024; // this is number of elements, not chars!
     MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
     left.setMaxTokenLength(255); // match CharTokenizer's max token length
     Analyzer right = new Analyzer() {
@@ -131,7 +131,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomHtmlishString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
@@ -149,7 +149,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    for (int i = 0; i < 10000; i++) {
+    for (int i = 0; i < 1000; i++) {
       String s = _TestUtil.randomUnicodeString(random);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
                    right.tokenStream("foo", newStringReader(s)));
@@ -158,7 +158,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
 
   public void testLetterUnicodeHuge() throws Exception {
     Random random = random();
-    int maxLength = 8192; // CharTokenizer.IO_BUFFER_SIZE*2
+    int maxLength = 4300; // CharTokenizer.IO_BUFFER_SIZE + fudge
     MockAnalyzer left = new MockAnalyzer(random, jvmLetter, false);
     left.setMaxTokenLength(255); // match CharTokenizer's max token length
     Analyzer right = new Analyzer() {
@@ -168,7 +168,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    int numIterations = atLeast(100);
+    int numIterations = atLeast(50);
    for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomUnicodeString(random, maxLength);
       assertEquals(s, left.tokenStream("foo", newStringReader(s)),
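The TestDuelingAnalyzers hunks above all follow one pattern: generate a random string, run it through two analyzers that are expected to tokenize identically, and assert that the two token streams agree; the numbers being changed (atLeast(100) → atLeast(50), 10000 → 1000 iterations, smaller maxLength) only scale that randomized workload down. A minimal sketch of the comparison loop, written for a BaseTokenStreamTestCase subclass and assuming only the helpers visible in the hunks (random(), atLeast(), _TestUtil.randomSimpleString); the two MockAnalyzer instances are hypothetical stand-ins for the real left/right pair:

    public void testDuelingSketch() throws Exception {
      Random random = random();
      // stand-ins: in the real test, "left" and "right" are two different
      // implementations that should produce identical tokens
      Analyzer left = new MockAnalyzer(random);
      Analyzer right = new MockAnalyzer(random);
      int numIterations = atLeast(50);  // the framework may scale this up, e.g. for nightly runs
      for (int i = 0; i < numIterations; i++) {
        String s = _TestUtil.randomSimpleString(random, 20);
        TokenStream ts1 = left.tokenStream("foo", new StringReader(s));
        TokenStream ts2 = right.tokenStream("foo", new StringReader(s));
        CharTermAttribute term1 = ts1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = ts2.addAttribute(CharTermAttribute.class);
        ts1.reset();
        ts2.reset();
        while (ts1.incrementToken()) {        // walk both streams in lock-step
          assertTrue(ts2.incrementToken());
          assertEquals(term1.toString(), term2.toString());
        }
        assertFalse(ts2.incrementToken());    // right must be exhausted too
        ts1.end(); ts2.end();
        ts1.close(); ts2.close();
      }
    }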
TestKeywordAnalyzer.java
@@ -127,6 +127,6 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new KeywordAnalyzer(), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new KeywordAnalyzer(), 1000*RANDOM_MULTIPLIER);
   }
 }
TestRandomChains.java
@@ -44,8 +44,7 @@ import java.util.regex.Pattern;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.EmptyTokenizer;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
@@ -101,7 +100,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
 
   static List<Constructor<? extends Tokenizer>> tokenizers;
   static List<Constructor<? extends TokenFilter>> tokenfilters;
-  static List<Constructor<? extends CharStream>> charfilters;
+  static List<Constructor<? extends CharFilter>> charfilters;
 
   // TODO: fix those and remove
   private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
@@ -170,7 +169,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
     tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
     tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
-    charfilters = new ArrayList<Constructor<? extends CharStream>>();
+    charfilters = new ArrayList<Constructor<? extends CharFilter>>();
     for (final Class<?> c : analysisClasses) {
       final int modifiers = c.getModifiers();
       if (
@@ -179,7 +178,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
         || brokenComponents.contains(c)
         || c.isAnnotationPresent(Deprecated.class)
-        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
+        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
       ) {
         continue;
       }
@@ -197,10 +196,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         assertTrue(ctor.toGenericString() + " has unsupported parameter types",
           allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
         tokenfilters.add(castConstructor(TokenFilter.class, ctor));
-      } else if (CharStream.class.isAssignableFrom(c)) {
+      } else if (CharFilter.class.isAssignableFrom(c)) {
         assertTrue(ctor.toGenericString() + " has unsupported parameter types",
           allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
-        charfilters.add(castConstructor(CharStream.class, ctor));
+        charfilters.add(castConstructor(CharFilter.class, ctor));
       } else {
         fail("Cannot get here");
       }
@@ -224,7 +223,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   }
 
   @AfterClass
-  public static void afterClass() throws Exception {
+  public static void afterClass() {
     tokenizers = null;
     tokenfilters = null;
     charfilters = null;
@@ -524,7 +523,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
     allowedCharFilterArgs.addAll(argProducers.keySet());
     allowedCharFilterArgs.add(Reader.class);
-    allowedCharFilterArgs.add(CharStream.class);
   }
 
   @SuppressWarnings("unchecked")
@@ -560,8 +558,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       Class<?> paramType = paramTypes[i];
       if (paramType == Reader.class) {
         args[i] = reader;
-      } else if (paramType == CharStream.class) {
-        args[i] = CharReader.get(reader);
       } else {
         args[i] = newRandomArg(random, paramType);
       }
@@ -701,7 +697,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       int numFilters = random.nextInt(3);
       for (int i = 0; i < numFilters; i++) {
         while (true) {
-          final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
+          final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
           final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
           reader = createComponent(ctor, args, descr);
           if (reader != null) {
@@ -760,24 +756,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       }
     }
 
-  // wants charfilter to be a filterreader...
-  // do *NOT*, do *NOT* refactor me to be a charfilter: LUCENE-3990
-  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharStream {
+  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
     boolean readSomething;
-    CharStream in;
 
     CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
-      this.in = CharReader.get(in);
+      super(in);
     }
 
     @Override
-    public int correctOffset(int currentOff) {
-      return in.correctOffset(currentOff);
-    }
-
-    @Override
-    public void close() throws IOException {
-      in.close();
+    public int correct(int currentOff) {
+      return currentOff; // we don't change any offsets
     }
 
     @Override
@@ -798,32 +786,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return in.read(target);
     }
 
-    @Override
-    public void mark(int readAheadLimit) throws IOException {
-      in.mark(readAheadLimit);
-    }
-
-    @Override
-    public boolean markSupported() {
-      return in.markSupported();
-    }
-
     @Override
     public int read(char[] cbuf) throws IOException {
       readSomething = true;
       return in.read(cbuf);
     }
 
-    @Override
-    public boolean ready() throws IOException {
-      return in.ready();
-    }
-
-    @Override
-    public void reset() throws IOException {
-      in.reset();
-    }
-
     @Override
     public long skip(long n) throws IOException {
       readSomething = true;
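The wrapper rewrite above is the heart of the CharStream → CharFilter migration: instead of extending CharStream, holding the wrapped stream in its own field and forwarding close/mark/reset by hand, the class now extends CharFilter, hands the wrapped Reader to super(in), and implements correct(int) (the identity here, since this wrapper rewrites no text) while the superclass supplies the Reader plumbing. A stripped-down sketch of that contract, using only what these hunks show (super(in), the inherited in field, a public correct(int) override); the class name is illustrative:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.CharFilter;

    // pass-through filter: rewrites nothing, so offset correction is the identity
    class IdentityCharFilter extends CharFilter {
      IdentityCharFilter(Reader in) {
        super(in);                        // the superclass keeps the wrapped Reader
      }

      @Override
      public int correct(int currentOff) {
        return currentOff;                // no inserts or deletes, offsets map 1:1
      }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return in.read(cbuf, off, len);   // delegate to the wrapped Reader
      }
    }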
TestStandardAnalyzer.java
@@ -233,13 +233,13 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
   }
 
   // Adds random graph after:
@@ -254,6 +254,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenStream);
       }
     },
-    200*RANDOM_MULTIPLIER, 8192);
+    100*RANDOM_MULTIPLIER, 8192);
   }
 }
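The same shrink recurs in each analyzer test below: the int passed to checkRandomData is the number of random strings pushed through the analyzer (multiplied by RANDOM_MULTIPLIER so the test runner can scale it), and the overload with a trailing int additionally caps how large each generated string may get, 8192 in the "huge" variants. In sketch form, assuming the BaseTokenStreamTestCase helper used throughout this diff, with analyzer standing in for whichever analyzer a given test builds:

    /** blast some random strings through the analyzer */
    public void testRandomStrings() throws Exception {
      // ~1000 short random texts, scaled by the runner's multiplier
      checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER);
    }

    /** blast some random large strings through the analyzer */
    public void testRandomHugeStrings() throws Exception {
      // fewer iterations, but each random text may be up to 8192 chars long
      checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER, 8192);
    }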
TestUAX29URLEmailAnalyzer.java
@@ -252,6 +252,6 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new UAX29URLEmailAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestUAX29URLEmailTokenizer.java
@@ -455,12 +455,12 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Random random = random();
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
   }
 }
TestCzechAnalyzer.java
@@ -52,6 +52,6 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new CzechAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestDanishAnalyzer.java
@@ -51,6 +51,6 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new DanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestGermanAnalyzer.java
@@ -61,6 +61,6 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new GermanAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestGermanLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,9 +49,22 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
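The new testKeyword above (repeated for the other German stem filters and the Finnish one further down) pins down the keyword-exclusion contract: KeywordMarkerFilter flags every token found in the CharArraySet via the standard KeywordAttribute, and a keyword-aware stemmer must then pass such tokens through unchanged, which checkOneTerm(a, "sängerinnen", "sängerinnen") verifies. A minimal keyword-aware filter showing the consuming side of that contract; the "stemming" step here is a stand-in (naive lowercasing), not the real GermanLightStemFilter logic:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

    // rewrites only tokens that KeywordMarkerFilter has not flagged as keywords
    final class SketchStemFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

      SketchStemFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        if (!keywordAtt.isKeyword()) {
          // a real stemmer would rewrite the term buffer here; as a stand-in,
          // lowercase the term in place
          for (int i = 0; i < termAtt.length(); i++) {
            termAtt.buffer()[i] = Character.toLowerCase(termAtt.buffer()[i]);
          }
        }
        return true;
      }
    }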
TestGermanMinimalStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "äpfel", "apfel");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
@@ -60,7 +76,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestGermanNormalizationFilter.java
@@ -64,7 +64,7 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestGermanStemFilter.java
@@ -23,9 +23,13 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -58,9 +62,22 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(analyzer, "", new String[] { "" });
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
GreekAnalyzerTest.java
@@ -66,6 +66,6 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new GreekAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestEnglishAnalyzer.java
@@ -55,6 +55,6 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new EnglishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestEnglishMinimalStemFilter.java
@@ -54,7 +54,7 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestKStemmer.java
@@ -42,7 +42,7 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   /**
TestPorterStemFilter.java
@@ -63,7 +63,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestSpanishAnalyzer.java
@@ -51,6 +51,6 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new SpanishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestSpanishLightStemFilter.java
@@ -48,7 +48,7 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestBasqueAnalyzer.java
@@ -51,6 +51,6 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new BasqueAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new BasqueAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestPersianAnalyzer.java
@@ -224,6 +224,6 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new PersianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new PersianAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 }
TestFinnishAnalyzer.java
@@ -51,6 +51,6 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new FinnishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new FinnishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
 }
TestFinnishLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,9 +49,22 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
   }
 
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
   }
 
   public void testEmptyTerm() throws IOException {
TestFrenchAnalyzer.java
@@ -164,7 +164,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random(), new FrenchAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
   }
 
   /** test accent-insensitive */
Some files were not shown because too many files have changed in this diff.