mirror of https://github.com/apache/lucene.git
LUCENE-3312: Merge up to trunk and fix basic Javadocs merge conflicts. The new classes now need method descriptions, mainly oal.index.StorableField(Type)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3312@1379200 13f79535-47bb-0310-9956-ffa450edef68
commit 50d2639308

@@ -1,49 +0,0 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project name="clover" basedir=".">
  <import file="lucene/common-build.xml"/>

  <!--
    Run after Junit tests.

    This target is in a separate file, as it needs to include common-build.xml,
    but must run from top-level!
  -->
  <target name="generate-clover-reports" depends="clover">
    <fail unless="run.clover">Clover not enabled!</fail>
    <mkdir dir="${clover.report.dir}"/>
    <fileset dir="." id="clover.test.result.files">
      <include name="*/build/**/test/TEST-*.xml"/>
      <exclude name="lucene/build/backwards/**"/>
    </fileset>
    <clover-report>
      <current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
        <format type="html" filter="assert"/>
        <testresults refid="clover.test.result.files"/>
      </current>
      <current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
        <format type="xml" filter="assert"/>
        <testresults refid="clover.test.result.files"/>
      </current>
    </clover-report>
    <echo>You can find the merged Lucene/Solr Clover report in '${clover.report.dir}'.</echo>
  </target>

</project>
build.xml (39 changed lines)

@@ -74,7 +74,7 @@
</pathconvert>
<fail if="validate.patternsFound">The following files contain @author tags or nocommits:${line.separator}${validate.patternsFound}</fail>
</target>

<target name="rat-sources" description="Runs rat across all sources and tests">
  <sequential><subant target="rat-sources" inheritall="false" failonerror="true">
    <fileset dir="lucene" includes="build.xml" />

@@ -248,15 +248,21 @@
  </sequential>
</target>

<target name="check-svn-working-copy">
  <subant target="check-svn-working-copy" inheritall="false" failonerror="true">
    <fileset dir="." includes="extra-targets.xml" />
  </subant>
</target>

<!-- Calls only generate-clover-reports on Lucene, as Solr's is just a clone with other target; the database itsself is fixed -->
<target name="generate-clover-reports">
  <subant target="generate-clover-reports" inheritall="false" failonerror="true">
    <fileset dir="." includes="build-clover.xml" />
    <fileset dir="." includes="extra-targets.xml" />
  </subant>
</target>

<!-- Jenkins tasks -->
<target name="jenkins-hourly" depends="clean,test,validate,-jenkins-javadocs-lint,-svn-status"/>
<target name="jenkins-hourly" depends="clean,test,validate,-jenkins-javadocs-lint,check-svn-working-copy"/>

<target name="jenkins-clover">
  <antcall target="-jenkins-clover">

@@ -280,31 +286,4 @@
<target name="-jenkins-javadocs-lint" unless="-disable.javadocs-lint">
  <antcall target="javadocs-lint"/>
</target>

<!-- define here, as common-build is not included! -->
<property name="svn.exe" value="svn" />

<target name="-svn-status">
  <exec executable="${svn.exe}" dir="." failonerror="true">
    <arg value="status"/>
    <redirector outputproperty="svn.status.output">
      <outputfilterchain>
        <linecontainsregexp>
          <regexp pattern="^\?" />
        </linecontainsregexp>
        <tokenfilter>
          <replaceregex pattern="^........" replace="* " />
          <replacestring from="${file.separator}" to="/" />
        </tokenfilter>
      </outputfilterchain>
    </redirector>
  </exec>
  <fail message="Source checkout is dirty after running tests!!! Offending files:${line.separator}${svn.status.output}">
    <condition>
      <not>
        <equals arg1="${svn.status.output}" arg2=""/>
      </not>
    </condition>
  </fail>
</target>
</project>

@@ -95,7 +95,6 @@
<classpathentry kind="src" path="solr/contrib/velocity/src/test"/>
<classpathentry kind="src" path="solr/contrib/velocity/src/test-files"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/ant-1.8.2.jar"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/ant-junit-1.8.2.jar"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/junit-4.10.jar"/>
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-49.1.jar"/>

@@ -2,7 +2,6 @@
<library name="Ant">
  <CLASSES>
    <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/ant-1.8.2.jar!/" />
    <root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/ant-junit-1.8.2.jar!/" />
  </CLASSES>
  <JAVADOC />
  <SOURCES />

@@ -1,9 +1,9 @@
<component name="libraryTable">
  <library name="HSQLDB">
    <CLASSES>
      <root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-1.8.0.10.jar!/" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
  </library>
<component name="libraryTable">
  <library name="HSQLDB">
    <CLASSES>
      <root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-1.8.0.10.jar!/" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
  </library>
</component>

@@ -52,11 +52,6 @@
  <artifactId>ant</artifactId>
  <scope>test</scope>
</dependency>
<dependency>
  <groupId>org.apache.ant</groupId>
  <artifactId>ant-junit</artifactId>
  <scope>test</scope>
</dependency>
<dependency>
  <groupId>com.carrotsearch.randomizedtesting</groupId>
  <artifactId>randomizedtesting-runner</artifactId>

@@ -51,10 +51,6 @@
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
</dependency>
<dependency>
  <groupId>org.apache.ant</groupId>
  <artifactId>ant-junit</artifactId>
</dependency>
<dependency>
  <groupId>com.carrotsearch.randomizedtesting</groupId>
  <artifactId>randomizedtesting-runner</artifactId>

@@ -227,11 +227,6 @@
  <artifactId>ant</artifactId>
  <version>1.8.2</version>
</dependency>
<dependency>
  <groupId>org.apache.ant</groupId>
  <artifactId>ant-junit</artifactId>
  <version>1.8.2</version>
</dependency>
<dependency>
  <groupId>org.apache.commons</groupId>
  <artifactId>commons-compress</artifactId>

@@ -20,7 +20,225 @@ import re
reHREF = re.compile('<a.*?>(.*?)</a>', re.IGNORECASE)

reMarkup = re.compile('<.*?>')
reDivBlock = re.compile('<div class="block">(.*?)</div>', re.IGNORECASE)
reCaption = re.compile('<caption><span>(.*?)</span>', re.IGNORECASE)
reTDLastNested = re.compile('^<td class="colLast"><code><strong><a href="[^>]*\.([^>]*?)\.html" title="class in[^>]*">', re.IGNORECASE)
reTDLast = re.compile('^<td class="colLast"><code><strong><a href="[^>]*#([^>]*?)">', re.IGNORECASE)
reColOne = re.compile('^<td class="colOne"><code><strong><a href="[^>]*#([^>]*?)">', re.IGNORECASE)

# the Method detail section at the end
reMethodDetail = re.compile('^<h3>Method Detail</h3>$', re.IGNORECASE)
reMethodDetailAnchor = re.compile('^(?:</a>)?<a name="([^>]*?)">$', re.IGNORECASE)
reMethodOverridden = re.compile('^<dt><strong>(Specified by:|Overrides:)</strong></dt>$', re.IGNORECASE)

reTag = re.compile("(?i)<(\/?\w+)((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>")

def verifyHTML(s):

  stack = []
  upto = 0
  while True:
    m = reTag.search(s, upto)
    if m is None:
      break
    tag = m.group(1)
    upto = m.end(0)

    if tag[:1] == '/':
      justTag = tag[1:]
    else:
      justTag = tag

    if justTag.lower() in ('br', 'li', 'p', 'col'):
      continue

    if tag[:1] == '/':
      if len(stack) == 0:
        raise RuntimeError('saw closing "%s" without opening <%s...>' % (m.group(0), tag[1:]))
      elif stack[-1][0] != tag[1:].lower():
        raise RuntimeError('closing "%s" does not match opening "%s"' % (m.group(0), stack[-1][1]))
      stack.pop()
    else:
      stack.append((tag.lower(), m.group(0)))

  if len(stack) != 0:
    raise RuntimeError('"%s" was never closed' % stack[-1][1])

def cleanHTML(s):
  s = reMarkup.sub('', s)
  s = s.replace('&nbsp;', ' ')
  s = s.replace('&lt;', '<')
  s = s.replace('&gt;', '>')
  s = s.replace('&amp;', '&')
  return s.strip()

reH3 = re.compile('^<h3>(.*?)</h3>', re.IGNORECASE | re.MULTILINE)
reH4 = re.compile('^<h4>(.*?)</h4>', re.IGNORECASE | re.MULTILINE)

def checkClassDetails(fullPath):
  """
  Checks for invalid HTML in the full javadocs under each field/method.
  """

  # TODO: only works with java7 generated javadocs now!
  with open(fullPath, encoding='UTF-8') as f:
    desc = None
    cat = None
    item = None
    errors = []
    for line in f.readlines():
      m = reH3.search(line)
      if m is not None:
        if desc is not None:
          # Have to fake <ul> context because we pulled a fragment out "across" two <ul>s:
          desc = ''.join(desc)
          if True or cat == 'Constructor Detail':
            idx = desc.find('</div>')
            if idx == -1:
              # Ctor missing javadocs ... checkClassSummaries catches it
              desc = None
              continue
            desc = desc[:idx+6]
          else:
            desc = '<ul>%s</ul>' % ''.join(desc)
          #print('  VERIFY %s: %s: %s' % (cat, item, desc))
          try:
            verifyHTML(desc)
          except RuntimeError as re:
            #print('    FAILED: %s' % re)
            errors.append((cat, item, str(re)))
          desc = None
        cat = m.group(1)
        continue

      m = reH4.search(line)
      if m is not None:
        if desc is not None:
          # Have to fake <ul> context because we pulled a fragment out "across" two <ul>s:
          desc = '<ul>%s</ul>' % ''.join(desc)
          #print('  VERIFY %s: %s: %s' % (cat, item, desc))
          try:
            verifyHTML(desc)
          except RuntimeError as re:
            #print('    FAILED: %s' % re)
            errors.append((cat, item, str(re)))
        item = m.group(1)
        desc = []
        continue

      if desc is not None:
        desc.append(line)

  if len(errors) != 0:
    print()
    print(fullPath)
    for cat, item, message in errors:
      print('  broken details HTML: %s: %s: %s' % (cat, item, message))
    return True
  else:
    return False

def checkClassSummaries(fullPath):

  # TODO: only works with java7 generated javadocs now!
  f = open(fullPath, encoding='UTF-8')

  missing = []
  broken = []
  inThing = False
  lastCaption = None
  lastItem = None

  desc = None

  foundMethodDetail = False
  lastMethodAnchor = None

  for line in f.readlines():
    m = reMethodDetail.search(line)
    if m is not None:
      foundMethodDetail = True
      continue

    # prune methods that are just @Overrides of other interface/classes,
    # they should be specified elsewhere, if they are e.g. jdk or
    # external classes we cannot inherit their docs anyway
    if foundMethodDetail:
      m = reMethodDetailAnchor.search(line)
      if m is not None:
        lastMethodAnchor = m.group(1)
        continue
      m = reMethodOverridden.search(line)
      if m is not None and ('Methods', lastMethodAnchor) in missing:
        #print('removing @overridden method: %s' % lastMethodAnchor)
        missing.remove(('Methods', lastMethodAnchor))

    m = reCaption.search(line)
    if m is not None:
      lastCaption = m.group(1)
      #print('  caption %s' % lastCaption)
    m = reTDLastNested.search(line)
    if m is not None:
      # nested classes
      lastItem = m.group(1)
      #print('  item %s' % lastItem)
    else:
      m = reTDLast.search(line)
      if m is not None:
        # methods etc
        lastItem = m.group(1)
      else:
        # ctors etc
        m = reColOne.search(line)
        if m is not None:
          lastItem = m.group(1)
          #print('  item %s' % lastItem)

    lineLower = line.strip().lower()

    if lineLower.find('<tr class="') != -1:
      inThing = True
      hasDesc = False
      continue

    if inThing:
      if lineLower.find('</tr>') != -1:
        if not hasDesc:
          missing.append((lastCaption, lastItem))
        inThing = False
        continue
      else:
        if line.find('<div class="block">') != -1:
          desc = []
        if desc is not None:
          desc.append(line)
          if line.find('</div>') != -1:
            desc = ''.join(desc)

            try:
              verifyHTML(desc)
            except RuntimeError as e:
              broken.append((lastCaption, lastItem, str(e)))
              #print('FAIL: %s: %s: %s: %s' % (lastCaption, lastItem, e, desc))

            desc = desc.replace('<div class="block">', '')
            desc = desc.replace('</div>', '')
            desc = desc.strip()
            hasDesc = len(desc) > 0

            desc = None
  f.close()
  if len(missing) > 0 or len(broken) > 0:
    print()
    print(fullPath)
    for (caption, item) in missing:
      print('  missing %s: %s' % (caption, item))
    for (caption, item, why) in broken:
      print('  broken HTML: %s: %s: %s' % (caption, item, why))
    return True
  else:
    return False

def checkSummary(fullPath):
  printed = False
  f = open(fullPath, encoding='UTF-8')

@@ -84,8 +302,8 @@ def checkPackageSummaries(root, level='class'):
  True if there are problems.
  """

  if level != 'class' and level != 'package':
    print('unsupported level: %s, must be "class" or "package"' % level)
  if level != 'class' and level != 'package' and level != 'method':
    print('unsupported level: %s, must be "class" or "package" or "method"' % level)
    sys.exit(1)

  #for dirPath, dirNames, fileNames in os.walk('%s/lucene/build/docs/api' % root):

@@ -99,8 +317,13 @@ def checkPackageSummaries(root, level='class'):
    sys.exit(1)

  anyMissing = False
  for dirPath, dirNames, fileNames in os.walk(root):
  if not os.path.isdir(root):
    checkClassSummaries(root)
    checkClassDetails(root)
    sys.exit(0)

  for dirPath, dirNames, fileNames in os.walk(root):

    if dirPath.find('/all/') != -1:
      # These are dups (this is a bit risk, eg, root IS this /all/ directory..)
      continue

@@ -108,6 +331,16 @@ def checkPackageSummaries(root, level='class'):
    if 'package-summary.html' in fileNames:
      if level != 'package' and checkSummary('%s/package-summary.html' % dirPath):
        anyMissing = True
    for fileName in fileNames:
      fullPath = '%s/%s' % (dirPath, fileName)
      if not fileName.startswith('package-') and fileName.endswith('.html') and os.path.isfile(fullPath):
        if level == 'method':
          if checkClassSummaries(fullPath):
            anyMissing = True
        # always look for broken html, regardless of level supplied
        if checkClassDetails(fullPath):
          anyMissing = True

    if 'overview-summary.html' in fileNames:
      if checkSummary('%s/overview-summary.html' % dirPath):
        anyMissing = True

@@ -116,7 +349,7 @@ def checkPackageSummaries(root, level='class'):

if __name__ == '__main__':
  if len(sys.argv) < 2 or len(sys.argv) > 3:
    print('usage: %s <dir> [class|package]' % sys.argv[0])
    print('usage: %s <dir> [class|package|method]' % sys.argv[0])
    sys.exit(1)
  if len(sys.argv) == 2:
    level = 'class'

@@ -0,0 +1,114 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project name="extra-targets" basedir=".">
  <description>
    This file is designed for importing into a main build file, and not intended
    for standalone use.
  </description>

  <import file="lucene/common-build.xml"/>

  <!--
    Run after Junit tests.

    This target is in a separate file, as it needs to include common-build.xml,
    but must run from top-level!
  -->
  <target name="generate-clover-reports" depends="clover">
    <fail unless="run.clover">Clover not enabled!</fail>
    <mkdir dir="${clover.report.dir}"/>
    <fileset dir="." id="clover.test.result.files">
      <include name="*/build/**/test/TEST-*.xml"/>
      <exclude name="lucene/build/backwards/**"/>
    </fileset>
    <clover-report>
      <current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
        <format type="html" filter="assert"/>
        <testresults refid="clover.test.result.files"/>
      </current>
      <current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
        <format type="xml" filter="assert"/>
        <testresults refid="clover.test.result.files"/>
      </current>
    </clover-report>
    <echo>You can find the merged Lucene/Solr Clover report in '${clover.report.dir}'.</echo>
  </target>

  <target xmlns:ivy="antlib:org.apache.ivy.ant" name="check-svn-working-copy" depends="ivy-availability-check,ivy-fail,ivy-configure">
    <ivy:cachepath organisation="org.tmatesoft.svnkit" module="svnkit" revision="1.7.5-v1"
      inline="true" conf="default" type="jar" transitive="true" pathid="svnkit.classpath"/>
    <script language="javascript" classpathref="svnkit.classpath" taskname="svn"><![CDATA[
      importClass(java.io.File);
      importClass(java.util.TreeSet);
      importPackage(org.tmatesoft.svn.core);
      importPackage(org.tmatesoft.svn.core.wc);
      var manager = SVNClientManager.newInstance();
      var statusClient = manager.getStatusClient();
      var wcClient = manager.getWCClient();

      var basedir = new File(project.getProperty("basedir")).getAbsoluteFile();
      var baseLen = basedir.toString().length();
      var convertRelative = function(file) {
        return file.getAbsolutePath().substring(baseLen + 1).replace(File.separatorChar, '/');
      }

      var missingProps = new TreeSet(), unversioned = new TreeSet();

      self.log("Getting all versioned and unversioned files...");
      statusClient.doStatus(basedir, SVNRevision.WORKING, SVNDepth.fromRecurse(true), false, true, false, false, new ISVNStatusHandler({
        handleStatus: function(status) {
          var nodeStatus = status.getNodeStatus();
          if (nodeStatus == SVNStatusType.STATUS_UNVERSIONED) {
            unversioned.add(convertRelative(status.getFile()));
          } else if (status.getKind() == SVNNodeKind.FILE && nodeStatus != SVNStatusType.STATUS_DELETED) {
            missingProps.add(convertRelative(status.getFile()));
          }
        }
      }), null);

      self.log("Filtering files with existing svn:eol-style...");
      wcClient.doGetProperty(basedir, "svn:eol-style", SVNRevision.WORKING, SVNRevision.WORKING, true, new ISVNPropertyHandler({
        handleProperty: function(file, prop) {
          missingProps.remove(convertRelative(file));
        }
      }));

      self.log("Filtering files with binary svn:mime-type...");
      wcClient.doGetProperty(basedir, "svn:mime-type", SVNRevision.WORKING, SVNRevision.WORKING, true, new ISVNPropertyHandler({
        handleProperty: function(file, prop) {
          prop = SVNPropertyValue.getPropertyAsString(prop.getValue());
          if (prop.startsWith("application/") || prop.startsWith("image/")) {
            missingProps.remove(convertRelative(file));
          }
        }
      }));

      var convertSet2String = function(set) {
        return set.isEmpty() ? null : ("* " + set.toArray().join(project.getProperty("line.separator") + "* "))
      };
      project.setProperty("svn.checkprops.failed", convertSet2String(missingProps));
      project.setProperty("svn.unversioned.failed", convertSet2String(unversioned));
    ]]></script>
    <fail if="svn.checkprops.failed"
      message="The following files are missing svn:eol-style (or binary svn:mime-type):${line.separator}${svn.checkprops.failed}"/>
    <fail if="svn.unversioned.failed"
      message="Source checkout is dirty after running tests!!! Offending files:${line.separator}${svn.unversioned.failed}"/>
  </target>
</project>
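The <script> task above drives SVNKit's Java API from JavaScript. The same status walk, rendered directly in Java as a sketch (it assumes the svnkit 1.7.x jars resolved by the ivy:cachepath above are on the classpath; the class name is illustrative):

import java.io.File;
import org.tmatesoft.svn.core.SVNDepth;
import org.tmatesoft.svn.core.SVNException;
import org.tmatesoft.svn.core.wc.ISVNStatusHandler;
import org.tmatesoft.svn.core.wc.SVNClientManager;
import org.tmatesoft.svn.core.wc.SVNRevision;
import org.tmatesoft.svn.core.wc.SVNStatus;
import org.tmatesoft.svn.core.wc.SVNStatusType;

final class SvnStatusSketch {
  // Lists unversioned files under basedir, mirroring the JavaScript above.
  static void printUnversioned(File basedir) throws SVNException {
    SVNClientManager manager = SVNClientManager.newInstance();
    manager.getStatusClient().doStatus(basedir, SVNRevision.WORKING,
        SVNDepth.fromRecurse(true), false, true, false, false,
        new ISVNStatusHandler() {
          public void handleStatus(SVNStatus status) {
            if (status.getNodeStatus() == SVNStatusType.STATUS_UNVERSIONED) {
              System.out.println("* " + status.getFile());
            }
          }
        }, null);
  }
}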
@@ -66,35 +66,13 @@ system.

NOTE: the ~ character represents your user account home directory.

Step 3) Install JavaCC

Building the Lucene distribution from the source does not require the JavaCC
parser generator, but if you wish to regenerate any of the pre-generated
parser pieces, you will need to install JavaCC. Version 4.1 is tested to
work correctly.

http://javacc.dev.java.net

Follow the download links and download the zip file to a temporary
location on your file system.

After JavaCC is installed, create a build.properties file
(as in step 2), and add the line

javacc.home=/javacc

where this points to the root directory of your javacc installation
(the directory that contains bin/lib/javacc.jar).

Step 4) Run ant
Step 3) Run ant

Assuming you have ant in your PATH and have set ANT_HOME to the
location of your ant installation, typing "ant" at the shell prompt
and command prompt should run ant. Ant will by default look for the
"build.xml" file in your current directory, and compile Lucene.

To rebuild any of the JavaCC-based parsers, run "ant javacc".

For further information on Lucene, go to:
http://lucene.apache.org/

@@ -23,6 +23,11 @@ New Features
* LUCENE-4323: Added support for an absolute maximum CFS segment size
  (in MiB) to LogMergePolicy and TieredMergePolicy.
  (Alexey Lef via Uwe Schindler)

* LUCENE-4339: Allow deletes against 3.x segments for easier upgrading.
  Lucene3x Codec is still otherwise read-only, you should not set it
  as the default Codec on IndexWriter, because it cannot write new segments.
  (Mike McCandless, Robert Muir)

API Changes

@@ -77,6 +82,10 @@ API Changes
  fields in a stored document, has been replaced with the simpler
  StoredFieldVisitor API. (Mike McCandless)

* LUCENE-4343: Made Tokenizer.setReader final. This is a setter that should
  not be overriden by subclasses: per-stream initialization should happen
  in reset(). (Robert Muir)

Bug Fixes

* LUCENE-4297: BooleanScorer2 would multiply the coord() factor

@@ -110,8 +119,18 @@ Bug Fixes
  containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir,
  Mike McCandless)

* LUCENE-4224: Add in-order scorer to query time joining and the
  out-of-order scorer throws an UOE. (Martijn van Groningen, Robert Muir)

* LUCENE-4333: Fixed NPE in TermGroupFacetCollector when faceting on mv fields.
  (Jesse MacVicar, Martijn van Groningen)

Optimizations

* LUCENE-4322: Decrease lucene-core JAR size. The core JAR size had increased a
  lot because of generated code introduced in LUCENE-4161 and LUCENE-3892.
  (Adrien Grand)

* LUCENE-4317: Improve reuse of internal TokenStreams and StringReader
  in oal.document.Field. (Uwe Schindler, Chris Male, Robert Muir)

@@ -883,7 +902,7 @@ API Changes

* LUCENE-3866: IndexReaderContext.leaves() is now the preferred way to access
  atomic sub-readers of any kind of IndexReader (for AtomicReaders it returns
  itsself as only leaf with docBase=0). (Uwe Schindler)
  itself as only leaf with docBase=0). (Uwe Schindler)

New features

@@ -2306,7 +2325,7 @@ Changes in backwards compatibility policy
  (Mike McCandless, Shai Erera)

* LUCENE-3084: MergePolicy.OneMerge.segments was changed from
  SegmentInfos to a List<SegmentInfo>. SegmentInfos itsself was changed
  SegmentInfos to a List<SegmentInfo>. SegmentInfos itself was changed
  to no longer extend Vector<SegmentInfo> (to update code that is using
  Vector-API, use the new asList() and asSet() methods returning unmodifiable
  collections; modifying SegmentInfos is now only possible through

@@ -7547,7 +7566,7 @@ Infrastructure
11. Fixed bugs in GermanAnalyzer (gschwarz)

1.2 RC2:
1.2 RC2
 - added sources to distribution
 - removed broken build scripts and libraries from distribution
 - SegmentsReader: fixed potential race condition

@@ -7562,7 +7581,8 @@ Infrastructure
 - JDK 1.1 compatibility fix: disabled lock files for JDK 1.1,
   since they rely on a feature added in JDK 1.2.

1.2 RC1 (first Apache release):
1.2 RC1
 - first Apache release
 - packages renamed from com.lucene to org.apache.lucene
 - license switched from LGPL to Apache
 - ant-only build -- no more makefiles

@@ -7573,7 +7593,8 @@ Infrastructure
 - Analyzers can choose tokenizer based on field name
 - misc bug fixes.

1.01b (last Sourceforge release)
1.01b
 . last Sourceforge release
 . a few bug fixes
 . new Query Parser
 . new prefix query (search for "foo*" matches "food")

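For the LUCENE-4323 entry above, a hedged sketch of capping compound-file segments; the setter name is taken from the issue, and the 64 MiB threshold is an arbitrary example:

import org.apache.lucene.index.TieredMergePolicy;

final class CfsCapSketch {
  static TieredMergePolicy newPolicy() {
    TieredMergePolicy mp = new TieredMergePolicy();
    mp.setMaxCFSSegmentSizeMB(64.0); // segments above 64 MiB stay non-compound
    return mp;
  }
}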
@@ -318,9 +318,9 @@ FieldCache, use them with care!

The method IndexReader#getSequentialSubReaders() was moved to CompositeReader
(see LUCENE-2858, LUCENE-3733) and made protected. It is solely used by
CompositeReader itsself to build its reader tree. To get all atomic leaves
CompositeReader itself to build its reader tree. To get all atomic leaves
of a reader, use IndexReader#leaves(), which also provides the doc base
of each leave. Readers that are already atomic return itsself as leaf with
of each leave. Readers that are already atomic return itself as leaf with
doc base 0. To emulate Lucene 3.x getSequentialSubReaders(),
use getContext().children().

@@ -626,3 +626,8 @@ you can now do this:
  method, StoredFieldVisitor has a needsField method: if that method
  returns true then the field will be loaded and the appropriate
  type-specific method will be invoked with that fields's value.

* LUCENE-4122: Removed the Payload class and replaced with BytesRef.
  PayloadAttribute's name is unchanged, it just uses the BytesRef
  class to refer to the payload bytes/start offset/end offset
  (or null if there is no payload).

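To make the LUCENE-4122 note concrete, a hedged sketch of a token filter that attaches a payload through the unchanged PayloadAttribute, which now carries a BytesRef (the filter itself is hypothetical, not part of this commit):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

// Hypothetical filter that tags every token with a one-byte payload.
final class ConstantPayloadFilter extends TokenFilter {
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  ConstantPayloadFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    payloadAtt.setPayload(new BytesRef(new byte[] { 42 })); // a BytesRef now, not a Payload
    return true;
  }
}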
@@ -94,8 +94,7 @@ public final class KeywordTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
  public void reset() throws IOException {
    this.done = false;
  }
}

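The hunk above is the recurring shape of this commit: Tokenizer.setReader is now final (LUCENE-4343), so per-stream initialization moves into reset(). The same pattern repeats in the tokenizer files below. A minimal sketch of a tokenizer written against the new contract; the class and its single-token behavior are hypothetical, mirroring KeywordTokenizer's shape:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical tokenizer that emits the whole input as a single token.
public final class WholeInputTokenizer extends Tokenizer {
  private boolean done = false; // per-stream state
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public WholeInputTokenizer(Reader input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    clearAttributes();
    done = true;
    int c;
    while ((c = input.read()) != -1) { // 'input' is the protected Reader from Tokenizer
      termAtt.append((char) c);
    }
    return termAtt.length() > 0;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false; // re-initialize per-stream state here; setReader(Reader) is final now
  }
}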
@@ -78,9 +78,6 @@ public final class PatternTokenizer extends Tokenizer {
    if (group >= 0 && group > matcher.groupCount()) {
      throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
    }
    fillBuffer(str, input);
    matcher.reset(str);
    index = 0;
  }

  @Override

@@ -136,8 +133,7 @@ public final class PatternTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
  public void reset() throws IOException {
    fillBuffer(str, input);
    matcher.reset(str);
    index = 0;

@@ -175,8 +175,7 @@ public final class ClassicTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader reader) throws IOException {
    super.setReader(reader);
    scanner.yyreset(reader);
  public void reset() throws IOException {
    scanner.yyreset(input);
  }
}

@@ -183,8 +183,7 @@ public final class StandardTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader reader) throws IOException {
    super.setReader(reader);
    scanner.yyreset(reader);
  public void reset() throws IOException {
    scanner.yyreset(input);
  }
}

@@ -162,8 +162,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader reader) throws IOException {
    super.setReader(reader);
    scanner.yyreset(reader);
  public void reset() throws IOException {
    scanner.yyreset(input);
  }
}

@@ -37,6 +37,15 @@ import java.util.regex.PatternSyntaxException;
/**
 * Abstract parent class for analysis factories {@link TokenizerFactory},
 * {@link TokenFilterFactory} and {@link CharFilterFactory}.
 * <p>
 * The typical lifecycle for a factory consumer is:
 * <ol>
 *   <li>Create factory via its a no-arg constructor
 *   <li>Set version emulation by calling {@link #setLuceneMatchVersion(Version)}
 *   <li>Calls {@link #init(Map)} passing arguments as key-value mappings.
 *   <li>(Optional) If the factory uses resources such as files, {@link ResourceLoaderAware#inform(ResourceLoader)} is called to initialize those resources.
 *   <li>Consumer calls create() to obtain instances.
 * </ol>
 */
public abstract class AbstractAnalysisFactory {

@@ -46,6 +55,9 @@ public abstract class AbstractAnalysisFactory {
  /** the luceneVersion arg */
  protected Version luceneMatchVersion = null;

  /**
   * Initialize this factory via a set of key-value pairs.
   */
  public void init(Map<String,String> args) {
    this.args = args;
  }

@@ -104,6 +116,9 @@ public abstract class AbstractAnalysisFactory {
    return Boolean.parseBoolean(s);
  }

  /**
   * Compiles a pattern for the value of the specified argument key <code>name</code>
   */
  protected Pattern getPattern(String name) {
    try {
      String pat = args.get(name);

@@ -118,6 +133,10 @@ public abstract class AbstractAnalysisFactory {
    }
  }

  /**
   * Returns as {@link CharArraySet} from wordFiles, which
   * can be a comma-separated list of filenames
   */
  protected CharArraySet getWordSet(ResourceLoader loader,
      String wordFiles, boolean ignoreCase) throws IOException {
    assureMatchVersion();

@@ -137,6 +156,9 @@ public abstract class AbstractAnalysisFactory {
    return words;
  }

  /**
   * Returns the resource's lines (with content treated as UTF-8)
   */
  protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
    return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
  }

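A sketch of the consumer lifecycle listed in the new javadoc above, assuming a concrete factory such as StandardTokenizerFactory from analyzers-common; the exact create() signature is assumed from this era of the API:

import java.io.Reader;
import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;

final class FactoryLifecycleSketch {
  static Tokenizer newTokenizer(Reader reader) {
    TokenizerFactory factory = new StandardTokenizerFactory(); // 1. no-arg constructor
    factory.setLuceneMatchVersion(Version.LUCENE_40);          // 2. version emulation
    factory.init(new HashMap<String,String>());                // 3. key-value arguments
    // 4. (optional) ResourceLoaderAware.inform(loader) for file-backed resources
    return factory.create(reader);                             // 5. obtain instances
  }
}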
@@ -78,7 +78,8 @@ public abstract class CharTokenizer extends Tokenizer {
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
  // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;

@@ -162,8 +163,7 @@ public abstract class CharTokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
  public void reset() throws IOException {
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;

@@ -27,5 +27,9 @@ import java.io.IOException;
 */
public interface ResourceLoaderAware {

  /**
   * Initializes this component with the provided ResourceLoader
   * (used for loading classes, files, etc).
   */
  void inform(ResourceLoader loader) throws IOException;
}

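A hypothetical factory illustrating the inform() hook documented above: resources are loaded when the ResourceLoader is handed in, here via the getWordSet helper whose javadocs this commit adds. The resource name and the StopFilter wiring are illustrative only:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SimpleStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private CharArraySet stopWords;

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    // "stopwords.txt" is an illustrative resource name
    stopWords = getWordSet(loader, "stopwords.txt", true);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new StopFilter(luceneMatchVersion, input, stopWords);
  }
}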
@@ -19,6 +19,9 @@ package org.apache.lucene.analysis.util;

/** Some commonly-used stemming functions */
public class StemmerUtil {
  /** no instance */
  private StemmerUtil() {}

  /**
   * Returns true if the character array starts with the suffix.
   *

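A small usage sketch for the helpers StemmerUtil collects; the endsWith signature is assumed from this era of the class:

import org.apache.lucene.analysis.util.StemmerUtil;

final class StemSketch {
  static boolean hasIngSuffix(String word) {
    char[] buf = word.toCharArray();
    return StemmerUtil.endsWith(buf, buf.length, "ing"); // true for "walking"
  }
}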
@@ -36,7 +36,10 @@ import org.apache.lucene.util.Version;
 */
public class WordlistLoader {

  private static final int INITITAL_CAPACITY = 16;
  private static final int INITIAL_CAPACITY = 16;

  /** no instance */
  private WordlistLoader() {}

  /**
   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting

@@ -74,7 +77,7 @@ public class WordlistLoader {
   * @return A {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
    return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
    return getWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
  }

  /**

@@ -89,7 +92,7 @@ public class WordlistLoader {
   * @return A CharArraySet with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
    return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
    return getWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
  }

  /**

@@ -171,7 +174,7 @@ public class WordlistLoader {
   * @return A {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
    return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
    return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
  }

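A small usage sketch of the getWordSet variant shown above (the in-memory word list is illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

final class WordlistSketch {
  // Builds a CharArraySet from an in-memory list, one word per line.
  static CharArraySet loadWords() throws IOException {
    return WordlistLoader.getWordSet(new StringReader("foo\nbar\nbaz"), Version.LUCENE_40);
  }
}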
@@ -318,18 +318,12 @@ public final class WikipediaTokenizer extends Tokenizer {
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    scanner.yyreset(input);
    tokens = null;
    scanner.reset();
    first = true;
  }

  @Override
  public void setReader(Reader reader) throws IOException {
    super.setReader(reader);
    scanner.yyreset(input);
  }

  @Override
  public void end() {
    // set final offset

@@ -39,6 +39,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);

    CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(cgf.incrementToken());

@@ -61,6 +62,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

    CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.toString());
    assertTrue(nsf.incrementToken());

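This and the following test hunks all add a reset() call before the first incrementToken(), matching the consumer workflow the new Analyzer javadocs in this commit point to. A sketch of a complete, well-behaved consumer (the helper class and field name are illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class ConsumeSketch {
  // Prints every token the analyzer produces for the given text.
  static void dumpTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}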
@@ -235,6 +235,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

    CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
    tf.reset();
    assertTrue(tf.incrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
    assertTrue(tf.incrementToken());

@@ -256,6 +257,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      assertTrue("Custom attribute value was lost", retAtt.getRetain());
    }

@@ -80,6 +80,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {

  void verifyPayload(TokenStream ts) throws IOException {
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    for(byte b=1;;b++) {
      boolean hasNext = ts.incrementToken();
      if (!hasNext) break;

@@ -66,6 +66,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));

@@ -83,6 +84,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));

@@ -111,6 +111,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0)
        out.append(' ');

@@ -45,7 +45,8 @@ public final class ICUTokenizer extends Tokenizer {
  /** true length of text in the buffer */
  private int length = 0;
  /** length in buffer that can be evaluated safely, up to a safe end point */
  private int usableLength = 0;
  // note: usableLength is -1 here to best-effort AIOOBE consumers that don't call reset()
  private int usableLength = -1;
  /** accumulated offset of previous buffers for this reader, for offsetAtt */
  private int offset = 0;

@@ -101,12 +102,6 @@ public final class ICUTokenizer extends Tokenizer {
    breaker.setText(buffer, 0, 0);
    length = usableLength = offset = 0;
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
    reset();
  }

  @Override
  public void end() {

@@ -244,15 +244,9 @@ public final class JapaneseTokenizer extends Tokenizer {
    this.dotOut = dotOut;
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
    buffer.reset(input);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    buffer.reset(input);
    resetState();
  }

@@ -112,16 +112,9 @@ public final class SentenceTokenizer extends Tokenizer {

  @Override
  public void reset() throws IOException {
    super.reset();
    tokenStart = tokenEnd = 0;
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
    reset();
  }

  @Override
  public void end() {
    // set final offset

@@ -220,7 +220,7 @@ public class Row {
   * Character.
   *
   * @param way the Character associated with the desired Cell
   * @return the reference, or -1 if the Cell is <tt>null,/tt>
   * @return the reference, or -1 if the Cell is <tt>null</tt>
   */
  public int getRef(Character way) {
    Cell c = at(way);

@@ -80,8 +80,7 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
  }

  @Override
  public void setReader(Reader input) throws IOException {
    super.setReader(input);
  public void reset() throws IOException {
    iterator = null;
  }

@@ -248,6 +248,10 @@
    <!-- spatial: problems -->
    <check-missing-javadocs dir="build/docs/suggest" level="class"/>
    <check-missing-javadocs dir="build/docs/test-framework" level="class"/>

    <!-- too much to fix for now, but enforce full javadocs for key packages -->
    <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
    <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
  </sequential>
</target>

@@ -452,16 +456,6 @@
  <sign-artifacts-macro artifacts.dir="${dist.dir}"/>
</target>

<!-- ================================================================== -->
<!-- Build the JavaCC files into the source tree -->
<!-- ================================================================== -->

<target name="javacc" depends="javacc-check">
  <subant target="javacc" failonerror="true" inheritall="false">
    <fileset dir="${common.dir}/queryparser" includes="build.xml"/>
  </subant>
</target>

<target name="build-modules" depends="compile-test"
        description="Builds all additional modules and their tests">
  <modules-crawl target="build-artifacts-and-tests"/>

@@ -480,24 +474,6 @@
  <modules-crawl target="test" failonerror="true"/>
</target>

<!--
 compile changes.txt into an html file
-->
<macrodef name="build-changes">
  <attribute name="changes.src.dir" default="${changes.src.dir}"/>
  <attribute name="changes.target.dir" default="${changes.target.dir}"/>
  <sequential>
    <mkdir dir="@{changes.target.dir}"/>
    <exec executable="perl" input="CHANGES.txt" output="@{changes.target.dir}/Changes.html"
          failonerror="true" logError="true">
      <arg value="@{changes.src.dir}/changes2html.pl"/>
    </exec>
    <copy todir="@{changes.target.dir}">
      <fileset dir="@{changes.src.dir}" includes="*.css"/>
    </copy>
  </sequential>
</macrodef>

<target name="changes-to-html">
  <build-changes changes.src.dir="${changes.src.dir}" changes.target.dir="${changes.target.dir}" />
</target>

@@ -173,7 +173,6 @@
<property name="m2.repository.url" value="file://${maven.dist.dir}"/>
<property name="m2.repository.private.key" value="${user.home}/.ssh/id_dsa"/>

<property name="javacc.home" location="${common.dir}"/>
<property name="jflex.home" location="${common.dir}"/>

<path id="jflex.classpath">

@@ -185,12 +184,6 @@
  </fileset>
</path>

<path id="javacc.classpath">
  <fileset dir="${javacc.home}/">
    <include name="bin/lib/*.jar"/>
  </fileset>
</path>

<property name="backwards.dir" location="backwards"/>
<property name="build.dir.backwards" location="${build.dir}/backwards"/>

@@ -261,23 +254,6 @@
  <delete file="velocity.log"/>
</target>

<!-- TODO: maybe make JavaCC checking specific to only the projects
     that use it (Lucene core and queryparsers)
-->
<target name="javacc-uptodate-check">
  <uptodate property="javacc.files.uptodate">
    <srcfiles dir="${src.dir}" includes="**/*.jj" />
    <mapper type="glob" from="*.jj" to="*.java"/>
  </uptodate>
</target>

<target name="javacc-notice" depends="javacc-uptodate-check" unless="javacc.files.uptodate">
  <echo>
    One or more of the JavaCC .jj files is newer than its corresponding
    .java file. Run the "javacc" target to regenerate the artifacts.
  </echo>
</target>

<target name="init" depends="resolve">
  <!-- currently empty -->
</target>

@@ -391,36 +367,6 @@
  </echo>
</target>

<target name="javacc-check">
  <available property="javacc.present" classname="org.javacc.parser.Main">
    <classpath refid="javacc.classpath"/>
  </available>
  <fail unless="javacc.present">
    ##################################################################
    JavaCC not found.
    JavaCC Home: ${javacc.home}

    Please download and install JavaCC 4.1 from:

    <http://javacc.dev.java.net>

    Then, create a build.properties file either in your home
    directory, or within the Lucene directory and set the javacc.home
    property to the path where JavaCC is installed. For example,
    if you installed JavaCC in /usr/local/java/javacc-4.1, then set the
    javacc.home property to:

    javacc.home=/usr/local/java/javacc-4.1

    If you get an error like the one below, then you have not installed
    things correctly. Please check all your paths and try again.

    java.lang.NoClassDefFoundError: org.javacc.parser.Main
    ##################################################################
  </fail>

</target>

<target name="jflex-check">
  <available property="jflex.present" classname="jflex.anttask.JFlexTask">
    <classpath refid="jflex.classpath"/>

@@ -508,6 +454,9 @@
  <attribute name="spec.version"/>
  <attribute name="manifest.file" default="${manifest.file}"/>
  <sequential>
    <!-- If possible, include the svnversion -->
    <exec dir="." executable="${svnversion.exe}" outputproperty="svnversion" failifexecutionfails="false"/>

    <manifest file="@{manifest.file}">
      <!--
      http://java.sun.com/j2se/1.5.0/docs/guide/jar/jar.html#JAR%20Manifest

@@ -558,12 +507,6 @@
  <attribute name="manifest.file" default="${manifest.file}"/>
  <element name="nested" optional="true" implicit="true"/>
  <sequential>
    <!-- If possible, include the svnversion -->
    <exec dir="." executable="${svnversion.exe}"
          outputproperty="svnversion" failifexecutionfails="false">
      <arg value="."/>
    </exec>

    <build-manifest title="@{title}"
                    implementation.title="@{implementation.title}"
                    spec.version="@{spec.version}"

@@ -850,7 +793,14 @@
    <sysproperty key="tests.multiplier" value="@{tests.multiplier}"/>

    <!-- Temporary directory in the cwd. -->
    <sysproperty key="tempDir" value="."/>
    <sysproperty key="tempDir" value="." />
    <sysproperty key="java.io.tmpdir" value="." />

    <!-- Restrict access to certain Java features and install security manager: -->
    <sysproperty key="tests.sandbox.dir" value="${build.dir}" />
    <sysproperty key="clover.db.dir" value="${clover.db.dir}" />
    <sysproperty key="java.security.manager" value="java.lang.SecurityManager" />
    <sysproperty key="java.security.policy" value="${common.dir}/tools/junit4/tests.policy" />

    <sysproperty key="lucene.version" value="${dev.version}"/>

@@ -1381,31 +1331,11 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
      <!-- <compilerarg line="-Xmaxwarns 10000000"/>
      <compilerarg line="-Xmaxerrs 10000000"/> -->
      <!-- for generics in Java 1.5: -->
      <compilerarg line="${javac.args}"/>
      <compilerarg line="${javac.args}"/>
    </javac>
  </sequential>
</macrodef>

<macrodef name="invoke-javacc">
  <attribute name="target"/>
  <attribute name="outputDir"/>
  <sequential>
    <mkdir dir="@{outputDir}"/>
    <javacc
      target="@{target}"
      outputDirectory="@{outputDir}"
      debugTokenManager="${javacc.debug.tokenmgr}"
      debugParser="${javacc.debug.parser}"
      debuglookahead="${javacc.debug.lookahead}"
      javacchome="${javacc.home}"
      jdkversion="${javac.source}"
    />
    <fixcrlf srcdir="@{outputDir}" includes="*.java" encoding="UTF-8">
      <containsregexp expression="Generated.*By.*JavaCC"/>
    </fixcrlf>
  </sequential>
</macrodef>

<property name="failonjavadocwarning" value="true"/>
<macrodef name="invoke-javadoc">
  <element name="sources" optional="yes"/>

@@ -1547,10 +1477,10 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
    description="Populates properties svn.URL and svn.Revision using 'svn info'.">
  <attribute name="directory"/>
  <sequential>
    <exec dir="." executable="${svnversion.exe}" outputproperty="svn.ver"/>
    <exec dir="@{directory}" executable="${svnversion.exe}" outputproperty="svn.ver"/>
    <fail message="A subversion checkout is required for this target">
      <condition>
        <equals arg1="${svn.ver}" arg2="exported"/>
        <matches pattern="(exported|unversioned.*)" string="${svn.ver}" casesensitive="false"/>
      </condition>
    </fail>
    <exec dir="@{directory}" executable="${svn.exe}" outputproperty="svn.info" failonerror="true">

@@ -1697,7 +1627,7 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
  <element name="nested" optional="false" implicit="true"/>
  <sequential>
    <copy todir="@{todir}" flatten="@{flatten}" overwrite="@{overwrite}" verbose="true"
          preservelastmodified="false" encoding="UTF-8" outputencoding="UTF-8"
          preservelastmodified="false" encoding="UTF-8" outputencoding="UTF-8" taskname="pegdown"
    >
      <filterchain>
        <tokenfilter>

@@ -1757,4 +1687,22 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
    </sequential>
  </macrodef>

  <!--
   compile changes.txt into an html file
  -->
  <macrodef name="build-changes">
    <attribute name="changes.src.dir" default="${changes.src.dir}"/>
    <attribute name="changes.target.dir" default="${changes.target.dir}"/>
    <sequential>
      <mkdir dir="@{changes.target.dir}"/>
      <exec executable="perl" input="CHANGES.txt" output="@{changes.target.dir}/Changes.html"
            failonerror="true" logError="true">
        <arg value="@{changes.src.dir}/changes2html.pl"/>
      </exec>
      <copy todir="@{changes.target.dir}">
        <fileset dir="@{changes.src.dir}" includes="*.css"/>
      </copy>
    </sequential>
  </macrodef>

</project>

@@ -38,8 +38,6 @@
  <pathelement path="${java.class.path}"/>
</path>

<target name="compile-core" depends="jflex-notice, javacc-notice, common.compile-core"/>

<target name="test-core" depends="common.test"/>

<target name="javadocs-core" depends="javadocs"/>

@@ -20,6 +20,7 @@ package org.apache.lucene.analysis;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.CloseableThreadLocal;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;

@@ -67,14 +68,26 @@ import java.util.Map;
 * Analysis integration with Apache UIMA.
 * </ul>
 */
public abstract class Analyzer {
public abstract class Analyzer implements Closeable {

  private final ReuseStrategy reuseStrategy;

  /**
   * Create a new Analyzer, reusing the same set of components per-thread
   * across calls to {@link #tokenStream(String, Reader)}.
   */
  public Analyzer() {
    this(new GlobalReuseStrategy());
  }

  /**
   * Expert: create a new Analyzer with a custom {@link ReuseStrategy}.
   * <p>
   * NOTE: if you just want to reuse on a per-field basis, its easier to
   * use a subclass of {@link AnalyzerWrapper} such as
   * <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html">
   * PerFieldAnalyerWrapper</a> instead.
   */
  public Analyzer(ReuseStrategy reuseStrategy) {
    this.reuseStrategy = reuseStrategy;
  }

@@ -93,20 +106,25 @@ public abstract class Analyzer {
      Reader reader);

  /**
   * Creates a TokenStream that is allowed to be re-use from the previous time
   * that the same thread called this method. Callers that do not need to use
   * more than one TokenStream at the same time from this analyzer should use
   * this method for better performance.
   * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
   * the contents of <code>reader</code>.
   * <p>
   * This method uses {@link #createComponents(String, Reader)} to obtain an
   * instance of {@link TokenStreamComponents}. It returns the sink of the
   * components and stores the components internally. Subsequent calls to this
   * method will reuse the previously stored components after resetting them
   * through {@link TokenStreamComponents#setReader(Reader)}.
   * </p>
   * <p>
   * <b>NOTE:</b> After calling this method, the consumer must follow the
   * workflow described in {@link TokenStream} to properly consume its contents.
   * See the {@link org.apache.lucene.analysis Analysis package documentation} for
   * some examples demonstrating this.
   *
   * @param fieldName the name of the field the created TokenStream is used for
   * @param reader the reader the streams source reads from
   * @return TokenStream for iterating the analyzed content of <code>reader</code>
   * @throws AlreadyClosedException if the Analyzer is closed.
   * @throws IOException if an i/o error occurs.
   */
  public final TokenStream tokenStream(final String fieldName,
                                       final Reader reader) throws IOException {

@@ -123,6 +141,13 @@ public abstract class Analyzer {

  /**
   * Override this if you want to add a CharFilter chain.
   * <p>
   * The default implementation returns <code>reader</code>
   * unchanged.
   *
   * @param fieldName IndexableField name being indexed
   * @param reader original Reader
   * @return reader, optionally decorated with CharFilter(s)
   */
  protected Reader initReader(String fieldName, Reader reader) {
    return reader;

@@ -139,7 +164,8 @@ public abstract class Analyzer {
   * exact PhraseQuery matches, for instance, across IndexableField instance boundaries.
   *
   * @param fieldName IndexableField name being indexed.
   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}.
   *         This value must be {@code >= 0}.
   */
  public int getPositionIncrementGap(String fieldName) {
    return 0;

@@ -152,7 +178,8 @@ public abstract class Analyzer {
   * produced at least one token for indexing.
   *
   * @param fieldName the field just indexed
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}.
   *         This value must be {@code >= 0}.
   */
  public int getOffsetGap(String fieldName) {
    return 1;

@ -171,7 +198,14 @@ public abstract class Analyzer {
|
|||
* {@link Analyzer#tokenStream(String, Reader)}.
|
||||
*/
|
||||
public static class TokenStreamComponents {
|
||||
/**
|
||||
* Original source of the tokens.
|
||||
*/
|
||||
protected final Tokenizer source;
|
||||
/**
|
||||
* Sink tokenstream, such as the outer tokenfilter decorating
|
||||
* the chain. This can be the source if there are no filters.
|
||||
*/
|
||||
protected final TokenStream sink;
|
||||
|
||||
/**
|
||||
|
@ -235,10 +269,13 @@ public abstract class Analyzer {
|
|||
* Strategy defining how TokenStreamComponents are reused per call to
|
||||
* {@link Analyzer#tokenStream(String, java.io.Reader)}.
|
||||
*/
|
||||
public static abstract class ReuseStrategy {
|
||||
public static abstract class ReuseStrategy implements Closeable {
|
||||
|
||||
private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
|
||||
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
|
||||
public ReuseStrategy() {}
|
||||
|
||||
/**
|
||||
* Gets the reusable TokenStreamComponents for the field with the given name
|
||||
*
|
||||
|
@ -262,6 +299,7 @@ public abstract class Analyzer {
|
|||
* Returns the currently stored value
|
||||
*
|
||||
* @return Currently stored value or {@code null} if no value is stored
|
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed.
|
||||
*/
|
||||
protected final Object getStoredValue() {
|
||||
try {
|
||||
|
@ -279,6 +317,7 @@ public abstract class Analyzer {
|
|||
* Sets the stored value
|
||||
*
|
||||
* @param storedValue Value to store
|
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed.
|
||||
*/
|
||||
protected final void setStoredValue(Object storedValue) {
|
||||
try {
|
||||
|
@ -296,8 +335,10 @@ public abstract class Analyzer {
|
|||
* Closes the ReuseStrategy, freeing any resources
|
||||
*/
|
||||
public void close() {
|
||||
storedValue.close();
|
||||
storedValue = null;
|
||||
if (storedValue != null) {
|
||||
storedValue.close();
|
||||
storedValue = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -306,17 +347,16 @@ public abstract class Analyzer {
|
|||
* every field.
|
||||
*/
|
||||
public final static class GlobalReuseStrategy extends ReuseStrategy {
|
||||
|
||||
/** Creates a new instance, with empty per-thread values */
|
||||
public GlobalReuseStrategy() {}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public TokenStreamComponents getReusableComponents(String fieldName) {
|
||||
return (TokenStreamComponents) getStoredValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
|
||||
setStoredValue(components);
|
||||
}
|
||||
|
@ -328,19 +368,18 @@ public abstract class Analyzer {
|
|||
*/
|
||||
public static class PerFieldReuseStrategy extends ReuseStrategy {
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
/** Creates a new instance, with empty per-thread-per-field values */
|
||||
public PerFieldReuseStrategy() {}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public TokenStreamComponents getReusableComponents(String fieldName) {
|
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
|
||||
return componentsPerField != null ? componentsPerField.get(fieldName) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
|
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
|
||||
if (componentsPerField == null) {
|
||||
|
|
|
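The reuse contract documented above implies a fixed consumer workflow: reset, iterate, end, close. A minimal sketch of that workflow (the StandardAnalyzer instance and the "text" field name are illustrative assumptions, not part of this change):

  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); // assumes analyzers-common on the classpath
  TokenStream stream = analyzer.tokenStream("text", new StringReader("some content"));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  try {
    stream.reset();                     // mandatory before the first incrementToken()
    while (stream.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    stream.end();                       // records final state such as the end offset
  } finally {
    stream.close();                     // releases the stream so its components can be reused
  }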
org/apache/lucene/analysis/AnalyzerWrapper.java

@@ -61,25 +61,16 @@ public abstract class AnalyzerWrapper extends Analyzer {
   */
  protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components);

  /**
   * {@inheritDoc}
   */
  @Override
  protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) {
    return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader));
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public final int getPositionIncrementGap(String fieldName) {
    return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public final int getOffsetGap(String fieldName) {
    return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
org/apache/lucene/analysis/CachingTokenFilter.java

@@ -38,6 +38,11 @@ public final class CachingTokenFilter extends TokenFilter {
  private Iterator<AttributeSource.State> iterator = null;
  private AttributeSource.State finalState;

  /**
   * Create a new CachingTokenFilter around <code>input</code>,
   * caching its token attributes, which can be replayed again
   * after a call to {@link #reset()}.
   */
  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

@@ -67,6 +72,13 @@ public final class CachingTokenFilter extends TokenFilter {
    }
  }

  /**
   * Rewinds the iterator to the beginning of the cached list.
   * <p>
   * Note that this does not call reset() on the wrapped tokenstream ever, even
   * the first time. You should reset() the inner tokenstream before wrapping
   * it with CachingTokenFilter.
   */
  @Override
  public void reset() {
    if (cache != null) {
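Per the reset() note above, a two-pass consumer resets the inner stream once and afterwards rewinds only the cache. A sketch (the analyzer variable and "body" field name are assumptions):

  TokenStream inner = analyzer.tokenStream("body", new StringReader("one two three"));
  inner.reset();                                   // reset the inner stream before wrapping
  CachingTokenFilter cached = new CachingTokenFilter(inner);
  while (cached.incrementToken()) { /* first pass fills the cache */ }
  cached.reset();                                  // rewinds the cache, not the inner stream
  while (cached.incrementToken()) { /* second pass replays the cached states */ }
  cached.end();
  cached.close();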
org/apache/lucene/analysis/CharFilter.java

@@ -33,6 +33,9 @@ import java.io.Reader;
 * You can optionally provide more efficient implementations of additional methods
 * like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
 * but this is not required.
 * <p>
 * For examples and integration with {@link Analyzer}, see the
 * {@link org.apache.lucene.analysis Analysis package documentation}.
 */
// the way java.io.FilterReader should work!
public abstract class CharFilter extends Reader {

@@ -52,6 +55,10 @@ public abstract class CharFilter extends Reader {

  /**
   * Closes the underlying input stream.
   * <p>
   * <b>NOTE:</b>
   * The default implementation closes the input Reader, so
   * be sure to call <code>super.close()</code> when overriding this method.
   */
  @Override
  public void close() throws IOException {
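To illustrate the contract above, a minimal length-preserving CharFilter might look like the following sketch (the class is hypothetical; because it maps characters 1:1, correct() can return the offset unchanged):

  import java.io.IOException;
  import java.io.Reader;
  import org.apache.lucene.analysis.CharFilter;

  // Hypothetical example: lowercases input without changing its length,
  // so no offset correction is necessary.
  public final class LowerCasingCharFilter extends CharFilter {

    public LowerCasingCharFilter(Reader input) {
      super(input);
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      final int n = input.read(cbuf, off, len);
      for (int i = off; i < off + n; i++) { // n == -1 at EOF, so the loop body is skipped
        cbuf[i] = Character.toLowerCase(cbuf[i]);
      }
      return n;
    }

    @Override
    protected int correct(int currentOff) {
      return currentOff; // no characters were inserted or removed
    }
  }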
org/apache/lucene/analysis/NumericTokenStream.java

@@ -144,6 +144,12 @@ public final class NumericTokenStream extends TokenStream {
    private long value = 0L;
    private int valueSize = 0, shift = 0, precisionStep = 0;
    private BytesRef bytes = new BytesRef();

    /**
     * Creates, but does not yet initialize this attribute instance
     * @see #init(long, int, int, int)
     */
    public NumericTermAttributeImpl() {}

    public BytesRef getBytesRef() {
      return bytes;
org/apache/lucene/analysis/Token.java

@@ -176,8 +176,8 @@ public class Token extends CharTermAttributeImpl
   * instead use the char[] termBuffer methods to set the
   * term text.
   * @param text term text
   * @param start start offset
   * @param end end offset
   * @param start start offset in the source text
   * @param end end offset in the source text
   */
  public Token(String text, int start, int end) {
    checkOffsets(start, end);

@@ -191,8 +191,8 @@ public class Token extends CharTermAttributeImpl
   * speed you should instead use the char[] termBuffer
   * methods to set the term text.
   * @param text term text
   * @param start start offset
   * @param end end offset
   * @param start start offset in the source text
   * @param end end offset in the source text
   * @param typ token type
   */
  public Token(String text, int start, int end, String typ) {

@@ -208,9 +208,9 @@ public class Token extends CharTermAttributeImpl
   * offsets, & type. <b>NOTE:</b> for better indexing
   * speed you should instead use the char[] termBuffer
   * methods to set the term text.
   * @param text
   * @param start
   * @param end
   * @param text term text
   * @param start start offset in the source text
   * @param end end offset in the source text
   * @param flags token type bits
   */
  public Token(String text, int start, int end, int flags) {

@@ -225,11 +225,11 @@ public class Token extends CharTermAttributeImpl
   * Constructs a Token with the given term buffer (offset
   * & length), start and end
   * offsets
   * @param startTermBuffer
   * @param termBufferOffset
   * @param termBufferLength
   * @param start
   * @param end
   * @param startTermBuffer buffer containing term text
   * @param termBufferOffset the index in the buffer of the first character
   * @param termBufferLength number of valid characters in the buffer
   * @param start start offset in the source text
   * @param end end offset in the source text
   */
  public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
    checkOffsets(start, end);

@@ -238,31 +238,9 @@ public class Token extends CharTermAttributeImpl
    endOffset = end;
  }

  /** Set the position increment. This determines the position of this token
   * relative to the previous Token in a {@link TokenStream}, used in phrase
   * searching.
   *
   * <p>The default value is one.
   *
   * <p>Some common uses for this are:<ul>
   *
   * <li>Set it to zero to put multiple terms in the same position. This is
   * useful if, e.g., a word has multiple stems. Searches for phrases
   * including either stem will match. In this case, all but the first stem's
   * increment should be set to zero: the increment of the first instance
   * should be one. Repeating a token with an increment of zero can also be
   * used to boost the scores of matches on that token.
   *
   * <li>Set it to values greater than one to inhibit exact phrase matches.
   * If, for example, one does not want phrases to match across removed stop
   * words, then one could build a stop word filter that removes stop words and
   * also sets the increment to the number of stop words removed before each
   * non-stop word. Then exact phrase queries will only match when the terms
   * occur with no intervening stop words.
   *
   * </ul>
   * @param positionIncrement the distance from the prior term
   * @see org.apache.lucene.index.DocsAndPositionsEnum
  /**
   * {@inheritDoc}
   * @see PositionIncrementAttribute
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)

@@ -271,93 +249,101 @@ public class Token extends CharTermAttributeImpl
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
  /**
   * {@inheritDoc}
   * @see PositionIncrementAttribute
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  /** Set the position length.
   * @see PositionLengthAttribute */
  /**
   * {@inheritDoc}
   * @see PositionLengthAttribute
   */
  @Override
  public void setPositionLength(int positionLength) {
    this.positionLength = positionLength;
  }

  /** Get the position length.
   * @see PositionLengthAttribute */
  /**
   * {@inheritDoc}
   * @see PositionLengthAttribute
   */
  @Override
  public int getPositionLength() {
    return positionLength;
  }

  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to {@link #length}, as the term text may have been altered by a
      stemmer or some other filter. */
  /**
   * {@inheritDoc}
   * @see OffsetAttribute
   */
  public final int startOffset() {
    return startOffset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  /**
   * {@inheritDoc}
   * @see OffsetAttribute
   */
  public final int endOffset() {
    return endOffset;
  }

  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset() */
  /**
   * {@inheritDoc}
   * @see OffsetAttribute
   */
  public void setOffset(int startOffset, int endOffset) {
    checkOffsets(startOffset, endOffset);
    this.startOffset = startOffset;
    this.endOffset = endOffset;
  }

  /** Returns this Token's lexical type. Defaults to "word". */
  /**
   * {@inheritDoc}
   * @see TypeAttribute
   */
  public final String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  /**
   * {@inheritDoc}
   * @see TypeAttribute
   */
  public final void setType(String type) {
    this.type = type;
  }

  /**
   * <p/>
   *
   * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   *
   * @return The bits
   * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
   * {@inheritDoc}
   * @see FlagsAttribute
   */
  public int getFlags() {
    return flags;
  }

  /**
   * @see #getFlags()
   * {@inheritDoc}
   * @see FlagsAttribute
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  /**
   * Returns this Token's payload.
   */
   * {@inheritDoc}
   * @see PayloadAttribute
   */
  public BytesRef getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
  /**
   * {@inheritDoc}
   * @see PayloadAttribute
   */
  public void setPayload(BytesRef payload) {
    this.payload = payload;

@@ -551,8 +537,8 @@ public class Token extends CharTermAttributeImpl

  /**
   * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
   * @param prototype
   * @param newTerm
   * @param prototype existing Token
   * @param newTerm new term text
   */
  public void reinit(Token prototype, String newTerm) {
    setEmpty().append(newTerm);

@@ -566,10 +552,10 @@ public class Token extends CharTermAttributeImpl

  /**
   * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
   * @param prototype
   * @param newTermBuffer
   * @param offset
   * @param length
   * @param prototype existing Token
   * @param newTermBuffer buffer containing new term text
   * @param offset the index in the buffer of the first character
   * @param length number of valid characters in the buffer
   */
  public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
    copyBuffer(newTermBuffer, offset, length);
org/apache/lucene/analysis/TokenFilter.java

@@ -34,21 +34,37 @@ public abstract class TokenFilter extends TokenStream {
    this.input = input;
  }

  /** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
   * input TokenStream.<p/>
   * <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
  /**
   * {@inheritDoc}
   * <p>
   * <b>NOTE:</b>
   * The default implementation chains the call to the input TokenStream, so
   * be sure to call <code>super.end()</code> first when overriding this method.
   */
  @Override
  public void end() throws IOException {
    input.end();
  }

  /** Close the input TokenStream. */
  /**
   * {@inheritDoc}
   * <p>
   * <b>NOTE:</b>
   * The default implementation chains the call to the input TokenStream, so
   * be sure to call <code>super.close()</code> when overriding this method.
   */
  @Override
  public void close() throws IOException {
    input.close();
  }

  /** Reset the filter as well as the input TokenStream. */
  /**
   * {@inheritDoc}
   * <p>
   * <b>NOTE:</b>
   * The default implementation chains the call to the input TokenStream, so
   * be sure to call <code>super.reset()</code> when overriding this method.
   */
  @Override
  public void reset() throws IOException {
    input.reset();
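A stateful filter that honors the chaining contract spelled out above could be sketched as follows (the class and its counter are hypothetical):

  import java.io.IOException;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;

  // Hypothetical example: counts tokens and clears the count on reuse.
  public final class TokenCountingFilter extends TokenFilter {
    private int count;

    public TokenCountingFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        count++;
        return true;
      }
      return false;
    }

    @Override
    public void reset() throws IOException {
      super.reset(); // chain to the input TokenStream first, per the contract above
      count = 0;     // then clear this filter's own state
    }
  }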
org/apache/lucene/analysis/TokenStream.java

@@ -170,12 +170,8 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
   * This method is called by a consumer before it begins consumption using
   * {@link #incrementToken()}.
   * <p/>
   * Resets this stream to the beginning. As all TokenStreams must be reusable,
   * any implementations which have state that needs to be reset between usages
   * of the TokenStream, must implement this method. Note that if your TokenStream
   * caches tokens and feeds them back again after a reset, it is imperative
   * that you clone the tokens when you store them away (on the first pass) as
   * well as when you return them (on future passes after {@link #reset()}).
   * Resets this stream to a clean state. Stateful implementations must implement
   * this method so that they can be reused, just as if they had been created fresh.
   */
  public void reset() throws IOException {}
org/apache/lucene/analysis/Tokenizer.java

@@ -54,7 +54,13 @@ public abstract class Tokenizer extends TokenStream {
    this.input = input;
  }

  /** By default, closes the input Reader. */
  /**
   * {@inheritDoc}
   * <p>
   * <b>NOTE:</b>
   * The default implementation closes the input Reader, so
   * be sure to call <code>super.close()</code> when overriding this method.
   */
  @Override
  public void close() throws IOException {
    if (input != null) {

@@ -76,12 +82,18 @@ public abstract class Tokenizer extends TokenStream {
    return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
  }

  /** Expert: Reset the tokenizer to a new reader. Typically, an
  /** Expert: Set a new reader on the Tokenizer. Typically, an
   * analyzer (in its tokenStream method) will use
   * this to re-use a previously created tokenizer. */
  public void setReader(Reader input) throws IOException {
  public final void setReader(Reader input) throws IOException {
    assert input != null: "input must not be null";
    this.input = input;
    assert setReaderTestPoint();
  }

  // only used by assert, for testing
  boolean setReaderTestPoint() {
    return true;
  }
}
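The setReader change above is what enables tokenizer reuse. A sketch of reusing one instance across two inputs (WhitespaceTokenizer from analyzers-common is an assumption here):

  Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("first text"));
  // ... consume: tok.reset(), tok.incrementToken() loop, tok.end(), tok.close() ...
  tok.setReader(new StringReader("second text")); // reuse the same instance
  tok.reset();                                    // then consume again as before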
org/apache/lucene/analysis/package.html

@@ -817,5 +817,30 @@ As a small hint, this is how the new Attribute class could begin:

  ...
</pre>
<h4>Adding a CharFilter chain</h4>
Analyzers take Java {@link java.io.Reader}s as input. Of course you can wrap your Readers with {@link java.io.FilterReader}s
to manipulate content, but this would have the big disadvantage that character offsets might be inconsistent with your original
text.
<p>
{@link org.apache.lucene.analysis.CharFilter} is designed to allow you to pre-process input like a FilterReader would, but also
preserve the original offsets associated with those characters. This way mechanisms like highlighting still work correctly.
CharFilters can be chained.
<p>
Example:
<pre class="prettyprint">
public class MyAnalyzer extends Analyzer {

  {@literal @Override}
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    return new TokenStreamComponents(new MyTokenizer(reader));
  }

  {@literal @Override}
  protected Reader initReader(String fieldName, Reader reader) {
    // wrap the Reader in a CharFilter chain.
    return new SecondCharFilter(new FirstCharFilter(reader));
  }
}
</pre>
</body>
</html>
org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java

@@ -38,7 +38,11 @@ public interface CharTermAttribute extends Attribute, CharSequence, Appendable {
   * #resizeBuffer(int)} to increase it. After
   * altering the buffer be sure to call {@link
   * #setLength} to record the number of valid
   * characters that were placed into the termBuffer. */
   * characters that were placed into the termBuffer.
   * <p>
   * <b>NOTE</b>: The returned buffer may be larger than
   * the valid {@link #length()}.
   */
  public char[] buffer();

  /** Grows the termBuffer to at least size newSize, preserving the
org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java

@@ -26,14 +26,15 @@ import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;

/**
 * The term text of a Token.
 */
/** Default implementation of {@link CharTermAttribute}. */
public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttribute, TermToBytesRefAttribute, Cloneable {
  private static int MIN_BUFFER_SIZE = 10;

  private char[] termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
  private int termLength = 0;

  /** Initialize this attribute with empty term text */
  public CharTermAttributeImpl() {}

  public final void copyBuffer(char[] buffer, int offset, int length) {
    growTermBuffer(length);
org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java

@@ -22,22 +22,23 @@ import org.apache.lucene.util.Attribute;

/**
 * This attribute can be used to pass different flags down the {@link Tokenizer} chain,
 * eg from one TokenFilter to another one.
 * e.g. from one TokenFilter to another one.
 * <p>
 * This is completely distinct from {@link TypeAttribute}, although they do share similar purposes.
 * The flags can be used to encode information about the token for use by other
 * {@link org.apache.lucene.analysis.TokenFilter}s.
 * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
 */
public interface FlagsAttribute extends Attribute {
  /**
   * <p/>
   *
   * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   *
   * Get the bitset for any bits that have been set.
   * @return The bits
   * @see #getFlags()
   */
  public int getFlags();

  /**
   * Set the flags to a new bitset.
   * @see #getFlags()
   */
  public void setFlags(int flags);
org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java

@@ -19,30 +19,17 @@ package org.apache.lucene.analysis.tokenattributes;

import org.apache.lucene.util.AttributeImpl;

/**
 * This attribute can be used to pass different flags down the tokenizer chain,
 * eg from one TokenFilter to another one.
 * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
 */
/** Default implementation of {@link FlagsAttribute}. */
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable {
  private int flags = 0;

  /**
   * <p/>
   *
   * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   *
   * @return The bits
   */
  /** Initialize this attribute with no bits set */
  public FlagsAttributeImpl() {}

  public int getFlags() {
    return flags;
  }

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }
org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java

@@ -30,20 +30,22 @@ import org.apache.lucene.util.Attribute;
public interface KeywordAttribute extends Attribute {

  /**
   * Returns <code>true</code> iff the current token is a keyword, otherwise
   * <code>false</code>/
   * Returns <code>true</code> if the current token is a keyword, otherwise
   * <code>false</code>
   *
   * @return <code>true</code> iff the current token is a keyword, otherwise
   *         <code>false</code>/
   * @return <code>true</code> if the current token is a keyword, otherwise
   *         <code>false</code>
   * @see #setKeyword(boolean)
   */
  public boolean isKeyword();

  /**
   * Marks the current token as keyword iff set to <code>true</code>.
   * Marks the current token as keyword if set to <code>true</code>.
   *
   * @param isKeyword
   *          <code>true</code> iff the current token is a keyword, otherwise
   *          <code>true</code> if the current token is a keyword, otherwise
   *          <code>false</code>.
   * @see #isKeyword()
   */
  public void setKeyword(boolean isKeyword);
}
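A stemming filter that respects this attribute, as described in the interface docs, could be sketched like so (the naive plural stemmer is hypothetical; KeywordMarkerFilter in analyzers-common is one way tokens get marked as keywords):

  import java.io.IOException;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

  // Hypothetical example: drops a trailing 's' unless the token is marked as a keyword.
  public final class NaivePluralStemFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

    public NaivePluralStemFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }
      final int len = termAtt.length();
      if (!keywordAtt.isKeyword() && len > 1 && termAtt.charAt(len - 1) == 's') {
        termAtt.setLength(len - 1); // naive stemming; skipped for keyword-marked tokens
      }
      return true;
    }
  }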
org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java

@@ -17,19 +17,15 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;

/**
 * This attribute can be used to mark a token as a keyword. Keyword aware
 * {@link TokenStream}s can decide to modify a token based on the return value
 * of {@link #isKeyword()} if the token is modified. Stemming filters for
 * instance can use this attribute to conditionally skip a term if
 * {@link #isKeyword()} returns <code>true</code>.
 */
/** Default implementation of {@link KeywordAttribute}. */
public final class KeywordAttributeImpl extends AttributeImpl implements
    KeywordAttribute {
  private boolean keyword;

  /** Initialize this attribute with the keyword value as false. */
  public KeywordAttributeImpl() {}

  @Override
  public void clear() {

@@ -57,24 +53,10 @@ public final class KeywordAttributeImpl extends AttributeImpl implements
    return keyword == other.keyword;
  }

  /**
   * Returns <code>true</code> iff the current token is a keyword, otherwise
   * <code>false</code>/
   *
   * @return <code>true</code> iff the current token is a keyword, otherwise
   *         <code>false</code>/
   */
  public boolean isKeyword() {
    return keyword;
  }

  /**
   * Marks the current token as keyword iff set to <code>true</code>.
   *
   * @param isKeyword
   *          <code>true</code> iff the current token is a keyword, otherwise
   *          <code>false</code>.
   */
  public void setKeyword(boolean isKeyword) {
    keyword = isKeyword;
  }
org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java

@@ -23,22 +23,34 @@ import org.apache.lucene.util.Attribute;
 * The start and end character offset of a Token.
 */
public interface OffsetAttribute extends Attribute {
  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to termText.length(), as the term text may have been altered by a
      stemmer or some other filter. */
  /**
   * Returns this Token's starting offset, the position of the first character
   * corresponding to this token in the source text.
   * <p>
   * Note that the difference between {@link #endOffset()} and <code>startOffset()</code>
   * may not be equal to termText.length(), as the term text may have been altered by a
   * stemmer or some other filter.
   * @see #setOffset(int, int)
   */
  public int startOffset();

  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset() */
  /**
   * Set the starting and ending offset.
   * @throws IllegalArgumentException If <code>startOffset</code> or <code>endOffset</code>
   *         are negative, or if <code>startOffset</code> is greater than
   *         <code>endOffset</code>
   * @see #startOffset()
   * @see #endOffset()
   */
  public void setOffset(int startOffset, int endOffset);

  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  /**
   * Returns this Token's ending offset, one greater than the position of the
   * last character corresponding to this token in the source text. The length
   * of the token in the source text is (<code>endOffset()</code> - {@link #startOffset()}).
   * @see #setOffset(int, int)
   */
  public int endOffset();
}
org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java

@@ -19,26 +19,18 @@ package org.apache.lucene.analysis.tokenattributes;

import org.apache.lucene.util.AttributeImpl;

/**
 * The start and end character offset of a Token.
 */
/** Default implementation of {@link OffsetAttribute}. */
public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable {
  private int startOffset;
  private int endOffset;

  /** Initialize this attribute with startOffset and endOffset of 0. */
  public OffsetAttributeImpl() {}

  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to termText.length(), as the term text may have been altered by a
      stemmer or some other filter. */
  public int startOffset() {
    return startOffset;
  }

  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset() */
  public void setOffset(int startOffset, int endOffset) {

    // TODO: we could assert that this is set-once, ie,

@@ -56,10 +48,6 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
    this.endOffset = endOffset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  public int endOffset() {
    return endOffset;
  }
org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java

@@ -17,20 +17,34 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import org.apache.lucene.index.DocsAndPositionsEnum; // javadocs
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.BytesRef;

/**
 * The payload of a Token.
 * The payload of a Token.
 * <p>
 * The payload is stored in the index at each position, and can
 * be used to influence scoring when using Payload-based queries
 * in the {@link org.apache.lucene.search.payloads} and
 * {@link org.apache.lucene.search.spans} packages.
 * <p>
 * NOTE: because the payload will be stored at each position, it's usually
 * best to use the minimum number of bytes necessary. Some codec implementations
 * may optimize payload storage when all payloads have the same length.
 *
 * @see DocsAndPositionsEnum
 */
public interface PayloadAttribute extends Attribute {
  /**
   * Returns this Token's payload.
   * @see #setPayload(BytesRef)
   */
  public BytesRef getPayload();

  /**
   * Sets this Token's payload.
   * @see #getPayload()
   */
  public void setPayload(BytesRef payload);
}
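A filter that attaches payloads, in the spirit of the size advice above, might be sketched as follows (the class and the one-byte marker value are illustrative):

  import java.io.IOException;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
  import org.apache.lucene.util.BytesRef;

  // Hypothetical example: stores a single marker byte at every position.
  public final class MarkerPayloadFilter extends TokenFilter {
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

    public MarkerPayloadFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }
      payloadAtt.setPayload(new BytesRef(new byte[] { 1 })); // one byte, keeping payloads small
      return true;
    }
  }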
org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java

@@ -20,9 +20,7 @@ package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;

/**
 * The payload of a Token.
 */
/** Default implementation of {@link PayloadAttribute}. */
public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable {
  private BytesRef payload;

@@ -38,16 +36,10 @@ public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttrib
    this.payload = payload;
  }

  /**
   * Returns this Token's payload.
   */
  public BytesRef getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(BytesRef payload) {
    this.payload = payload;
  }
org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java

@@ -49,11 +49,14 @@ public interface PositionIncrementAttribute extends Attribute {
  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   * @throws IllegalArgumentException if <code>positionIncrement</code>
   *         is negative.
   * @see #getPositionIncrement()
   */
  public void setPositionIncrement(int positionIncrement);

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   * @see #setPositionIncrement(int)
   */
  public int getPositionIncrement();
}
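The classic zero-increment use case (stacking a synonym on the same position, as the removed Token javadoc described) can be sketched with capture/restore of attribute state. The filter and the color/colour pair are illustrative:

  import java.io.IOException;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  import org.apache.lucene.util.AttributeSource;

  // Hypothetical example: emits "colour" at the same position as "color".
  public final class ColourSynonymFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private AttributeSource.State pending;

    public ColourSynonymFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (pending != null) {
        restoreState(pending);             // copy offsets, type, etc. from the original token
        pending = null;
        termAtt.setEmpty().append("colour");
        posIncAtt.setPositionIncrement(0); // zero increment: same position as "color"
        return true;
      }
      if (!input.incrementToken()) {
        return false;
      }
      if (termAtt.toString().equals("color")) {
        pending = captureState();
      }
      return true;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      pending = null;
    }
  }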
org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java

@@ -17,40 +17,15 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;

/** Determines the position of this token
 * relative to the previous Token in a {@link TokenStream}, used in phrase
 * searching.
 *
 * <p>The default value is one.
 *
 * <p>Some common uses for this are:<ul>
 *
 * <li>Set it to zero to put multiple terms in the same position. This is
 * useful if, e.g., a word has multiple stems. Searches for phrases
 * including either stem will match. In this case, all but the first stem's
 * increment should be set to zero: the increment of the first instance
 * should be one. Repeating a token with an increment of zero can also be
 * used to boost the scores of matches on that token.
 *
 * <li>Set it to values greater than one to inhibit exact phrase matches.
 * If, for example, one does not want phrases to match across removed stop
 * words, then one could build a stop word filter that removes stop words and
 * also sets the increment to the number of stop words removed before each
 * non-stop word. Then exact phrase queries will only match when the terms
 * occur with no intervening stop words.
 *
 * </ul>
 */
/** Default implementation of {@link PositionIncrementAttribute}. */
public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable {
  private int positionIncrement = 1;

  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   */
  /** Initialize this attribute with position increment of 1 */
  public PositionIncrementAttributeImpl() {}

  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0) {
      throw new IllegalArgumentException

@@ -59,9 +34,6 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }
org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java

@@ -26,11 +26,20 @@ import org.apache.lucene.util.Attribute;
 * produced by decompounding, word splitting/joining,
 * synonym filtering, etc.
 *
 * <p>The default value is one. */
 * <p>NOTE: this is optional, and most analyzers
 * don't change the default value (1). */

public interface PositionLengthAttribute extends Attribute {
  /** @param positionLength how many positions this token
   *  spans. */
  /**
   * Set the position length of this Token.
   * <p>
   * The default value is one.
   * @param positionLength how many positions this token
   *        spans.
   * @throws IllegalArgumentException if <code>positionLength</code>
   *         is zero or negative.
   * @see #getPositionLength()
   */
  public void setPositionLength(int positionLength);

  /** Returns the position length of this Token.
org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java

@@ -19,13 +19,13 @@ package org.apache.lucene.analysis.tokenattributes;

import org.apache.lucene.util.AttributeImpl;

/** See {@link PositionLengthAttribute}. */
/** Default implementation of {@link PositionLengthAttribute}. */
public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
  private int positionLength = 1;

  /** @param positionLength how many positions this token
   *  spans. NOTE: this is optional, and most analyzers
   *  don't change the default value (1). */
  /** Initializes this attribute with position length of 1. */
  public PositionLengthAttributeImpl() {}

  public void setPositionLength(int positionLength) {
    if (positionLength < 1) {
      throw new IllegalArgumentException

@@ -34,9 +34,6 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
    this.positionLength = positionLength;
  }

  /** Returns the position length of this Token.
   * @see #setPositionLength
   */
  public int getPositionLength() {
    return positionLength;
  }
org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java

@@ -56,7 +56,7 @@ public interface TermToBytesRefAttribute extends Attribute {
   * Updates the bytes {@link #getBytesRef()} to contain this term's
   * final encoding, and returns its hashcode.
   * @return the hashcode as defined by {@link BytesRef#hashCode}:
   * <pre>
   * <pre class="prettyprint">
   *  int hash = 0;
   *  for (int i = termBytes.offset; i < termBytes.offset+termBytes.length; i++) {
   *    hash = 31*hash + termBytes.bytes[i];
org/apache/lucene/analysis/tokenattributes/TypeAttribute.java

@@ -27,10 +27,15 @@ public interface TypeAttribute extends Attribute {
  /** the default type */
  public static final String DEFAULT_TYPE = "word";

  /** Returns this Token's lexical type. Defaults to "word". */
  /**
   * Returns this Token's lexical type. Defaults to "word".
   * @see #setType(String)
   */
  public String type();

  /** Set the lexical type.
      @see #type() */
  /**
   * Set the lexical type.
   * @see #type()
   */
  public void setType(String type);
}
org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java

@@ -19,27 +19,24 @@ package org.apache.lucene.analysis.tokenattributes;

import org.apache.lucene.util.AttributeImpl;

/**
 * A Token's lexical type. The Default value is "word".
 */
/** Default implementation of {@link TypeAttribute}. */
public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable {
  private String type;

  /** Initialize this attribute with {@link TypeAttribute#DEFAULT_TYPE} */
  public TypeAttributeImpl() {
    this(DEFAULT_TYPE);
  }

  /** Initialize this attribute with <code>type</code> */
  public TypeAttributeImpl(String type) {
    this.type = type;
  }

  /** Returns this Token's lexical type. Defaults to "word". */
  public String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public void setType(String type) {
    this.type = type;
  }
org/apache/lucene/codecs/Codec.java

@@ -41,6 +41,14 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {

  private final String name;

  /**
   * Creates a new codec.
   * <p>
   * The provided name will be written into the index segment: in order
   * for the segment to be read, this class should be registered with Java's
   * SPI mechanism (registered in META-INF/ of your jar file, etc).
   * @param name must be all ascii alphanumeric, and less than 128 characters in length.
   */
  public Codec(String name) {
    NamedSPILoader.checkServiceName(name);
    this.name = name;

@@ -118,6 +126,10 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
    defaultCodec = codec;
  }

  /**
   * returns the codec's name. Subclasses can override to provide
   * more detail (such as parameters).
   */
  @Override
  public String toString() {
    return name;
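Registration itself happens outside the code: the services file below is the META-INF mechanism the constructor javadoc refers to (the implementing class name is hypothetical).

  # META-INF/services/org.apache.lucene.codecs.Codec
  com.example.MyCodec

At read time the name written into the segment is resolved back through the loader:

  Codec codec = Codec.forName("MyCodec"); // assumes the services entry above is on the classpath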
org/apache/lucene/codecs/PostingsFormat.java

@@ -18,14 +18,24 @@ package org.apache.lucene.codecs;
 */

import java.io.IOException;
import java.util.ServiceLoader;
import java.util.Set;

import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; // javadocs
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.util.NamedSPILoader;

/**
 * Encodes/decodes terms, postings, and proximity data.
 * <p>
 * Note, when extending this class, the name ({@link #getName}) may be
 * written into the index in certain configurations. In order for the segment
 * to be read, the name must resolve to your implementation via {@link #forName(String)}.
 * This method uses Java's
 * {@link ServiceLoader Service Provider Interface} to resolve codec names.
 * <p>
 * @see ServiceLoader
 * @lucene.experimental */
public abstract class PostingsFormat implements NamedSPILoader.NamedSPI {

@@ -38,11 +48,21 @@ public abstract class PostingsFormat implements NamedSPILoader.NamedSPI {
   */
  private final String name;

  /**
   * Creates a new postings format.
   * <p>
   * The provided name will be written into the index segment in some configurations
   * (such as when using {@link PerFieldPostingsFormat}): in such configurations,
   * for the segment to be read, this class should be registered with Java's
   * SPI mechanism (registered in META-INF/ of your jar file, etc).
   * @param name must be all ascii alphanumeric, and less than 128 characters in length.
   */
  protected PostingsFormat(String name) {
    NamedSPILoader.checkServiceName(name);
    this.name = name;
  }

  /** Returns this posting format's name */
  @Override
  public final String getName() {
    return name;
org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java

@@ -1,485 +1,485 @@
package org.apache.lucene.codecs.bloom;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FuzzySet;
import org.apache.lucene.util.FuzzySet.ContainsResult;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.hash.MurmurHash2;

/**
 * <p>
 * A {@link PostingsFormat} useful for low doc-frequency fields such as primary
 * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
 * for reads in segments known to have no record of the key. A choice of
 * delegate PostingsFormat is used to record all other Postings data.
 * </p>
 * <p>
 * A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
 * settings on a per-field basis. The default configuration is
 * {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
 * values using {@link MurmurHash2}. This should be suitable for most purposes.
 * </p>
 * <p>
 * The format of the blm file is as follows:
 * </p>
 * <ul>
 * <li>BloomFilter (.blm) --> Header, DelegatePostingsFormatName,
 * NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
 * <li>Filter --> FieldNumber, FuzzySet</li>
 * <li>FuzzySet --> See {@link FuzzySet#serialize(DataOutput)}</li>
 * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 * <li>DelegatePostingsFormatName --> {@link DataOutput#writeString(String)
 * String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
 * <li>NumFilteredFields --> {@link DataOutput#writeInt Uint32}</li>
 * <li>FieldNumber --> {@link DataOutput#writeInt Uint32} The number of the
 * field in this segment</li>
 * </ul>
 * @lucene.experimental
 */
public class BloomFilteringPostingsFormat extends PostingsFormat {

  public static final String BLOOM_CODEC_NAME = "BloomFilter";
  public static final int BLOOM_CODEC_VERSION = 1;

  /** Extension of Bloom Filters file */
  static final String BLOOM_EXTENSION = "blm";

  BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
  private PostingsFormat delegatePostingsFormat;

  /**
   * Creates Bloom filters for a selection of fields created in the index. This
   * is recorded as a set of Bitsets held as a segment summary in an additional
   * "blm" file. This PostingsFormat delegates to a choice of delegate
   * PostingsFormat for encoding all other postings data.
   *
   * @param delegatePostingsFormat
   *          The PostingsFormat that records all the non-bloom filter data i.e.
   *          postings info.
   * @param bloomFilterFactory
   *          The {@link BloomFilterFactory} responsible for sizing BloomFilters
   *          appropriately
   */
  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
      BloomFilterFactory bloomFilterFactory) {
    super(BLOOM_CODEC_NAME);
    this.delegatePostingsFormat = delegatePostingsFormat;
    this.bloomFilterFactory = bloomFilterFactory;
  }

  /**
   * Creates Bloom filters for a selection of fields created in the index. This
   * is recorded as a set of Bitsets held as a segment summary in an additional
   * "blm" file. This PostingsFormat delegates to a choice of delegate
   * PostingsFormat for encoding all other postings data. This choice of
   * constructor defaults to the {@link DefaultBloomFilterFactory} for
   * configuring per-field BloomFilters.
   *
   * @param delegatePostingsFormat
   *          The PostingsFormat that records all the non-bloom filter data i.e.
   *          postings info.
   */
  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
    this(delegatePostingsFormat, new DefaultBloomFilterFactory());
  }

  // Used only by core Lucene at read-time via Service Provider instantiation -
  // do not use at Write-time in application code.
  public BloomFilteringPostingsFormat() {
    super(BLOOM_CODEC_NAME);
  }

  public FieldsConsumer fieldsConsumer(SegmentWriteState state)
      throws IOException {
    if (delegatePostingsFormat == null) {
      throw new UnsupportedOperationException("Error - " + getClass().getName()
          + " has been constructed without a choice of PostingsFormat");
    }
    return new BloomFilteredFieldsConsumer(
        delegatePostingsFormat.fieldsConsumer(state), state,
        delegatePostingsFormat);
  }

  public FieldsProducer fieldsProducer(SegmentReadState state)
      throws IOException {
    return new BloomFilteredFieldsProducer(state);
  }

  public class BloomFilteredFieldsProducer extends FieldsProducer {
    private FieldsProducer delegateFieldsProducer;
    HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();

    public BloomFilteredFieldsProducer(SegmentReadState state)
        throws IOException {

      String bloomFileName = IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
      IndexInput bloomIn = null;
      try {
        bloomIn = state.dir.openInput(bloomFileName, state.context);
        CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
            BLOOM_CODEC_VERSION);
        // // Load the hash function used in the BloomFilter
        // hashFunction = HashFunction.forName(bloomIn.readString());
        // Load the delegate postings format
        PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
            .readString());

        this.delegateFieldsProducer = delegatePostingsFormat
            .fieldsProducer(state);
        int numBlooms = bloomIn.readInt();
        for (int i = 0; i < numBlooms; i++) {
          int fieldNum = bloomIn.readInt();
          FuzzySet bloom = FuzzySet.deserialize(bloomIn);
          FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
          bloomsByFieldName.put(fieldInfo.name, bloom);
        }
      } finally {
        IOUtils.close(bloomIn);
      }

    }

    public Iterator<String> iterator() {
      return delegateFieldsProducer.iterator();
    }

    public void close() throws IOException {
      delegateFieldsProducer.close();
    }

    public Terms terms(String field) throws IOException {
      FuzzySet filter = bloomsByFieldName.get(field);
      if (filter == null) {
        return delegateFieldsProducer.terms(field);
      } else {
        Terms result = delegateFieldsProducer.terms(field);
        if (result == null) {
          return null;
        }
        return new BloomFilteredTerms(result, filter);
      }
    }

    public int size() {
      return delegateFieldsProducer.size();
    }

    class BloomFilteredTerms extends Terms {
      private Terms delegateTerms;
      private FuzzySet filter;

      public BloomFilteredTerms(Terms terms, FuzzySet filter) {
        this.delegateTerms = terms;
        this.filter = filter;
      }

      @Override
      public TermsEnum intersect(CompiledAutomaton compiled,
          final BytesRef startTerm) throws IOException {
        return delegateTerms.intersect(compiled, startTerm);
      }

      @Override
      public TermsEnum iterator(TermsEnum reuse) throws IOException {
        TermsEnum result;
        if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
          // recycle the existing BloomFilteredTermsEnum by asking the delegate
          // to recycle its contained TermsEnum
          BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
          if (bfte.filter == filter) {
            bfte.delegateTermsEnum = delegateTerms
                .iterator(bfte.delegateTermsEnum);
            return bfte;
          }
        }
        // We have been handed something we cannot reuse (either null, wrong
        // class or wrong filter) so allocate a new object
        result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
            filter);
        return result;
      }

      @Override
      public Comparator<BytesRef> getComparator() throws IOException {
        return delegateTerms.getComparator();
      }

      @Override
      public long size() throws IOException {
        return delegateTerms.size();
      }

      @Override
      public long getSumTotalTermFreq() throws IOException {
        return delegateTerms.getSumTotalTermFreq();
      }

      @Override
      public long getSumDocFreq() throws IOException {
        return delegateTerms.getSumDocFreq();
      }

      @Override
      public int getDocCount() throws IOException {
        return delegateTerms.getDocCount();
      }

      @Override
      public boolean hasOffsets() {
        return delegateTerms.hasOffsets();
      }

      @Override
      public boolean hasPositions() {
        return delegateTerms.hasPositions();
      }

      @Override
      public boolean hasPayloads() {
        return delegateTerms.hasPayloads();
      }
    }

    class BloomFilteredTermsEnum extends TermsEnum {

      TermsEnum delegateTermsEnum;
      private FuzzySet filter;

      public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
        this.delegateTermsEnum = iterator;
        this.filter = filter;
      }

      @Override
      public final BytesRef next() throws IOException {
        return delegateTermsEnum.next();
      }

      @Override
      public final Comparator<BytesRef> getComparator() {
        return delegateTermsEnum.getComparator();
      }

      @Override
      public final boolean seekExact(BytesRef text, boolean useCache)
          throws IOException {
        // The magical fail-fast speed up that is the entire point of all of
        // this code - save a disk seek if there is a match on an in-memory
        // structure
        // that may occasionally give a false positive but guaranteed no false
        // negatives
        if (filter.contains(text) == ContainsResult.NO) {
          return false;
        }
        return delegateTermsEnum.seekExact(text, useCache);
      }

      @Override
      public final SeekStatus seekCeil(BytesRef text, boolean useCache)
          throws IOException {
        return delegateTermsEnum.seekCeil(text, useCache);
      }

      @Override
      public final void seekExact(long ord) throws IOException {
        delegateTermsEnum.seekExact(ord);
      }

      @Override
      public final BytesRef term() throws IOException {
        return delegateTermsEnum.term();
      }

      @Override
      public final long ord() throws IOException {
        return delegateTermsEnum.ord();
      }

      @Override
      public final int docFreq() throws IOException {
        return delegateTermsEnum.docFreq();
      }

      @Override
      public final long totalTermFreq() throws IOException {
        return delegateTermsEnum.totalTermFreq();
      }

      @Override
      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
          DocsAndPositionsEnum reuse, int flags) throws IOException {
        return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
      }

      @Override
      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
          throws IOException {
        return delegateTermsEnum.docs(liveDocs, reuse, flags);
      }

    }

  }

  class BloomFilteredFieldsConsumer extends FieldsConsumer {
|
||||
private FieldsConsumer delegateFieldsConsumer;
|
||||
private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
|
||||
private SegmentWriteState state;
|
||||
|
||||
// private PostingsFormat delegatePostingsFormat;
|
||||
|
||||
public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
|
||||
SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
|
||||
this.delegateFieldsConsumer = fieldsConsumer;
|
||||
// this.delegatePostingsFormat=delegatePostingsFormat;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermsConsumer addField(FieldInfo field) throws IOException {
|
||||
FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
|
||||
if (bloomFilter != null) {
|
||||
assert bloomFilters.containsKey(field) == false;
|
||||
bloomFilters.put(field, bloomFilter);
|
||||
return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
|
||||
} else {
|
||||
// No, use the unfiltered fieldsConsumer - we are not interested in
|
||||
// recording any term Bitsets.
|
||||
return delegateFieldsConsumer.addField(field);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
delegateFieldsConsumer.close();
|
||||
// Now we are done accumulating values for these fields
|
||||
List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
|
||||
|
||||
for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
|
||||
FuzzySet bloomFilter = entry.getValue();
|
||||
if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){
|
||||
nonSaturatedBlooms.add(entry);
|
||||
}
|
||||
}
|
||||
String bloomFileName = IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
|
||||
IndexOutput bloomOutput = null;
|
||||
try {
|
||||
bloomOutput = state.directory
|
||||
.createOutput(bloomFileName, state.context);
|
||||
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
|
||||
BLOOM_CODEC_VERSION);
|
||||
// remember the name of the postings format we will delegate to
|
||||
bloomOutput.writeString(delegatePostingsFormat.getName());
|
||||
|
||||
// First field in the output file is the number of fields+blooms saved
|
||||
bloomOutput.writeInt(nonSaturatedBlooms.size());
|
||||
for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
|
||||
FieldInfo fieldInfo = entry.getKey();
|
||||
FuzzySet bloomFilter = entry.getValue();
|
||||
bloomOutput.writeInt(fieldInfo.number);
|
||||
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(bloomOutput);
|
||||
}
|
||||
//We are done with large bitsets so no need to keep them hanging around
|
||||
bloomFilters.clear();
|
||||
}
|
||||
|
||||
private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
|
||||
FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
|
||||
|
||||
FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
|
||||
bloomFilter);
|
||||
if (rightSizedSet == null) {
|
||||
rightSizedSet = bloomFilter;
|
||||
}
|
||||
rightSizedSet.serialize(bloomOutput);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class WrappedTermsConsumer extends TermsConsumer {
|
||||
private TermsConsumer delegateTermsConsumer;
|
||||
private FuzzySet bloomFilter;
|
||||
|
||||
public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
|
||||
this.delegateTermsConsumer = termsConsumer;
|
||||
this.bloomFilter = bloomFilter;
|
||||
}
|
||||
|
||||
public PostingsConsumer startTerm(BytesRef text) throws IOException {
|
||||
return delegateTermsConsumer.startTerm(text);
|
||||
}
|
||||
|
||||
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
|
||||
|
||||
// Record this term in our BloomFilter
|
||||
if (stats.docFreq > 0) {
|
||||
bloomFilter.addValue(text);
|
||||
}
|
||||
delegateTermsConsumer.finishTerm(text, stats);
|
||||
}
|
||||
|
||||
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
|
||||
throws IOException {
|
||||
delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
|
||||
}
|
||||
|
||||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return delegateTermsConsumer.getComparator();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
BloomFilteringPostingsFormat.java

package org.apache.lucene.codecs.bloom;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FuzzySet;
import org.apache.lucene.util.FuzzySet.ContainsResult;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.hash.MurmurHash2;

/**
 * <p>
 * A {@link PostingsFormat} useful for low doc-frequency fields such as primary
 * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
 * for reads in segments known to have no record of the key. A choice of
 * delegate PostingsFormat is used to record all other Postings data.
 * </p>
 * <p>
 * A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
 * settings on a per-field basis. The default configuration is
 * {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
 * values using {@link MurmurHash2}. This should be suitable for most purposes.
 * </p>
 * <p>
 * The format of the blm file is as follows:
 * </p>
 * <ul>
 * <li>BloomFilter (.blm) --> Header, DelegatePostingsFormatName,
 * NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
 * <li>Filter --> FieldNumber, FuzzySet</li>
 * <li>FuzzySet --> See {@link FuzzySet#serialize(DataOutput)}</li>
 * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 * <li>DelegatePostingsFormatName --> {@link DataOutput#writeString(String)
 * String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
 * <li>NumFilteredFields --> {@link DataOutput#writeInt Uint32}</li>
 * <li>FieldNumber --> {@link DataOutput#writeInt Uint32} The number of the
 * field in this segment</li>
 * </ul>
 * @lucene.experimental
 */
public class BloomFilteringPostingsFormat extends PostingsFormat {

  public static final String BLOOM_CODEC_NAME = "BloomFilter";
  public static final int BLOOM_CODEC_VERSION = 1;

  /** Extension of Bloom Filters file */
  static final String BLOOM_EXTENSION = "blm";

  BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
  private PostingsFormat delegatePostingsFormat;

  /**
   * Creates Bloom filters for a selection of fields created in the index. This
   * is recorded as a set of Bitsets held as a segment summary in an additional
   * "blm" file. This PostingsFormat delegates to a choice of delegate
   * PostingsFormat for encoding all other postings data.
   *
   * @param delegatePostingsFormat
   *          The PostingsFormat that records all the non-bloom filter data i.e.
   *          postings info.
   * @param bloomFilterFactory
   *          The {@link BloomFilterFactory} responsible for sizing BloomFilters
   *          appropriately
   */
  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
      BloomFilterFactory bloomFilterFactory) {
    super(BLOOM_CODEC_NAME);
    this.delegatePostingsFormat = delegatePostingsFormat;
    this.bloomFilterFactory = bloomFilterFactory;
  }

  /**
   * Creates Bloom filters for a selection of fields created in the index. This
   * is recorded as a set of Bitsets held as a segment summary in an additional
   * "blm" file. This PostingsFormat delegates to a choice of delegate
   * PostingsFormat for encoding all other postings data. This choice of
   * constructor defaults to the {@link DefaultBloomFilterFactory} for
   * configuring per-field BloomFilters.
   *
   * @param delegatePostingsFormat
   *          The PostingsFormat that records all the non-bloom filter data i.e.
   *          postings info.
   */
  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
    this(delegatePostingsFormat, new DefaultBloomFilterFactory());
  }

  // Used only by core Lucene at read-time via Service Provider instantiation -
  // do not use at Write-time in application code.
  public BloomFilteringPostingsFormat() {
    super(BLOOM_CODEC_NAME);
  }

  public FieldsConsumer fieldsConsumer(SegmentWriteState state)
      throws IOException {
    if (delegatePostingsFormat == null) {
      throw new UnsupportedOperationException("Error - " + getClass().getName()
          + " has been constructed without a choice of PostingsFormat");
    }
    return new BloomFilteredFieldsConsumer(
        delegatePostingsFormat.fieldsConsumer(state), state,
        delegatePostingsFormat);
  }

  public FieldsProducer fieldsProducer(SegmentReadState state)
      throws IOException {
    return new BloomFilteredFieldsProducer(state);
  }

  public class BloomFilteredFieldsProducer extends FieldsProducer {
    private FieldsProducer delegateFieldsProducer;
    HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();

    public BloomFilteredFieldsProducer(SegmentReadState state)
        throws IOException {

      String bloomFileName = IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
      IndexInput bloomIn = null;
      try {
        bloomIn = state.dir.openInput(bloomFileName, state.context);
        CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
            BLOOM_CODEC_VERSION);
        // // Load the hash function used in the BloomFilter
        // hashFunction = HashFunction.forName(bloomIn.readString());
        // Load the delegate postings format
        PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
            .readString());

        this.delegateFieldsProducer = delegatePostingsFormat
            .fieldsProducer(state);
        int numBlooms = bloomIn.readInt();
        for (int i = 0; i < numBlooms; i++) {
          int fieldNum = bloomIn.readInt();
          FuzzySet bloom = FuzzySet.deserialize(bloomIn);
          FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
          bloomsByFieldName.put(fieldInfo.name, bloom);
        }
      } finally {
        IOUtils.close(bloomIn);
      }

    }

    public Iterator<String> iterator() {
      return delegateFieldsProducer.iterator();
    }

    public void close() throws IOException {
      delegateFieldsProducer.close();
    }

    public Terms terms(String field) throws IOException {
      FuzzySet filter = bloomsByFieldName.get(field);
      if (filter == null) {
        return delegateFieldsProducer.terms(field);
      } else {
        Terms result = delegateFieldsProducer.terms(field);
        if (result == null) {
          return null;
        }
        return new BloomFilteredTerms(result, filter);
      }
    }

    public int size() {
      return delegateFieldsProducer.size();
    }

    class BloomFilteredTerms extends Terms {
      private Terms delegateTerms;
      private FuzzySet filter;

      public BloomFilteredTerms(Terms terms, FuzzySet filter) {
        this.delegateTerms = terms;
        this.filter = filter;
      }

      @Override
      public TermsEnum intersect(CompiledAutomaton compiled,
          final BytesRef startTerm) throws IOException {
        return delegateTerms.intersect(compiled, startTerm);
      }

      @Override
      public TermsEnum iterator(TermsEnum reuse) throws IOException {
        TermsEnum result;
        if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
          // recycle the existing BloomFilteredTermsEnum by asking the delegate
          // to recycle its contained TermsEnum
          BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
          if (bfte.filter == filter) {
            bfte.delegateTermsEnum = delegateTerms
                .iterator(bfte.delegateTermsEnum);
            return bfte;
          }
        }
        // We have been handed something we cannot reuse (either null, wrong
        // class or wrong filter) so allocate a new object
        result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
            filter);
        return result;
      }

      @Override
      public Comparator<BytesRef> getComparator() throws IOException {
        return delegateTerms.getComparator();
      }

      @Override
      public long size() throws IOException {
        return delegateTerms.size();
      }

      @Override
      public long getSumTotalTermFreq() throws IOException {
        return delegateTerms.getSumTotalTermFreq();
      }

      @Override
      public long getSumDocFreq() throws IOException {
        return delegateTerms.getSumDocFreq();
      }

      @Override
      public int getDocCount() throws IOException {
        return delegateTerms.getDocCount();
      }

      @Override
      public boolean hasOffsets() {
        return delegateTerms.hasOffsets();
      }

      @Override
      public boolean hasPositions() {
        return delegateTerms.hasPositions();
      }

      @Override
      public boolean hasPayloads() {
        return delegateTerms.hasPayloads();
      }
    }

    class BloomFilteredTermsEnum extends TermsEnum {

      TermsEnum delegateTermsEnum;
      private FuzzySet filter;

      public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
        this.delegateTermsEnum = iterator;
        this.filter = filter;
      }

      @Override
      public final BytesRef next() throws IOException {
        return delegateTermsEnum.next();
      }

      @Override
      public final Comparator<BytesRef> getComparator() {
        return delegateTermsEnum.getComparator();
      }

      @Override
      public final boolean seekExact(BytesRef text, boolean useCache)
          throws IOException {
        // The magical fail-fast speed up that is the entire point of all of
        // this code - save a disk seek if there is a match on an in-memory
        // structure that may occasionally give a false positive but
        // guaranteed no false negatives
        if (filter.contains(text) == ContainsResult.NO) {
          return false;
        }
        return delegateTermsEnum.seekExact(text, useCache);
      }

      @Override
      public final SeekStatus seekCeil(BytesRef text, boolean useCache)
          throws IOException {
        return delegateTermsEnum.seekCeil(text, useCache);
      }

      @Override
      public final void seekExact(long ord) throws IOException {
        delegateTermsEnum.seekExact(ord);
      }

      @Override
      public final BytesRef term() throws IOException {
        return delegateTermsEnum.term();
      }

      @Override
      public final long ord() throws IOException {
        return delegateTermsEnum.ord();
      }

      @Override
      public final int docFreq() throws IOException {
        return delegateTermsEnum.docFreq();
      }

      @Override
      public final long totalTermFreq() throws IOException {
        return delegateTermsEnum.totalTermFreq();
      }

      @Override
      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
          DocsAndPositionsEnum reuse, int flags) throws IOException {
        return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
      }

      @Override
      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
          throws IOException {
        return delegateTermsEnum.docs(liveDocs, reuse, flags);
      }

    }

  }

  class BloomFilteredFieldsConsumer extends FieldsConsumer {
    private FieldsConsumer delegateFieldsConsumer;
    private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
    private SegmentWriteState state;

    // private PostingsFormat delegatePostingsFormat;

    public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
        SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
      this.delegateFieldsConsumer = fieldsConsumer;
      // this.delegatePostingsFormat=delegatePostingsFormat;
      this.state = state;
    }

    @Override
    public TermsConsumer addField(FieldInfo field) throws IOException {
      FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state, field);
      if (bloomFilter != null) {
        assert bloomFilters.containsKey(field) == false;
        bloomFilters.put(field, bloomFilter);
        return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter);
      } else {
        // No, use the unfiltered fieldsConsumer - we are not interested in
        // recording any term Bitsets.
        return delegateFieldsConsumer.addField(field);
      }
    }

    @Override
    public void close() throws IOException {
      delegateFieldsConsumer.close();
      // Now we are done accumulating values for these fields
      List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();

      for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
        FuzzySet bloomFilter = entry.getValue();
        if (!bloomFilterFactory.isSaturated(bloomFilter, entry.getKey())) {
          nonSaturatedBlooms.add(entry);
        }
      }
      String bloomFileName = IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
      IndexOutput bloomOutput = null;
      try {
        bloomOutput = state.directory
            .createOutput(bloomFileName, state.context);
        CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
            BLOOM_CODEC_VERSION);
        // remember the name of the postings format we will delegate to
        bloomOutput.writeString(delegatePostingsFormat.getName());

        // First field in the output file is the number of fields+blooms saved
        bloomOutput.writeInt(nonSaturatedBlooms.size());
        for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
          FieldInfo fieldInfo = entry.getKey();
          FuzzySet bloomFilter = entry.getValue();
          bloomOutput.writeInt(fieldInfo.number);
          saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
        }
      } finally {
        IOUtils.close(bloomOutput);
      }
      // We are done with large bitsets so no need to keep them hanging around
      bloomFilters.clear();
    }

    private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
        FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {

      FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
          bloomFilter);
      if (rightSizedSet == null) {
        rightSizedSet = bloomFilter;
      }
      rightSizedSet.serialize(bloomOutput);
    }

  }

  class WrappedTermsConsumer extends TermsConsumer {
    private TermsConsumer delegateTermsConsumer;
    private FuzzySet bloomFilter;

    public WrappedTermsConsumer(TermsConsumer termsConsumer, FuzzySet bloomFilter) {
      this.delegateTermsConsumer = termsConsumer;
      this.bloomFilter = bloomFilter;
    }

    public PostingsConsumer startTerm(BytesRef text) throws IOException {
      return delegateTermsConsumer.startTerm(text);
    }

    public void finishTerm(BytesRef text, TermStats stats) throws IOException {

      // Record this term in our BloomFilter
      if (stats.docFreq > 0) {
        bloomFilter.addValue(text);
      }
      delegateTermsConsumer.finishTerm(text, stats);
    }

    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
        throws IOException {
      delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
    }

    public Comparator<BytesRef> getComparator() throws IOException {
      return delegateTermsConsumer.getComparator();
    }

  }

}
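A short usage sketch, not part of this commit: the format is normally installed per-field through a custom Codec. The codec class name, the "Lucene40" delegate name and the "id" field below are illustrative assumptions; only the BloomFilteringPostingsFormat constructors come from the source above.

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;

// Hypothetical codec: only the primary-key style "id" field pays the
// cost of the extra .blm file; all other fields keep the default format.
public class BloomOnIdCodec extends Lucene40Codec {
  private final PostingsFormat bloomFormat =
      new BloomFilteringPostingsFormat(PostingsFormat.forName("Lucene40"));

  @Override
  public PostingsFormat getPostingsFormatForField(String field) {
    return "id".equals(field) ? bloomFormat : super.getPostingsFormatForField(field);
  }
}

At read time the wrapper is re-created by name through the service-provider lookup, which is why the no-arg constructor above must remain public.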
package.html (org.apache.lucene.codecs.bloom)

@@ -1,25 +1,25 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at
 
      http://www.apache.org/licenses/LICENSE-2.0
 
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
 </head>
 <body>
 Codec PostingsFormat for fast access to low-frequency terms such as primary key fields.
 </body>
 </html>
ByteDocValuesField.java

@@ -38,12 +38,21 @@ import org.apache.lucene.index.DocValues;
 
 public class ByteDocValuesField extends StoredField {
 
+  /**
+   * Type for 8-bit byte DocValues.
+   */
   public static final FieldType TYPE = new FieldType();
   static {
     TYPE.setDocValueType(DocValues.Type.FIXED_INTS_8);
     TYPE.freeze();
   }
 
+  /**
+   * Creates a new DocValues field with the specified 8-bit byte value
+   * @param name field name
+   * @param value 8-bit byte value
+   * @throws IllegalArgumentException if the field name is null.
+   */
   public ByteDocValuesField(String name, byte value) {
     super(name, TYPE);
     fieldsData = Byte.valueOf(value);
CompressionTools.java

@@ -92,10 +92,14 @@ public class CompressionTools {
     return compress(result.bytes, 0, result.length, compressionLevel);
   }
 
+  /** Decompress the byte array previously returned by
+   *  compress (referenced by the provided BytesRef) */
   public static byte[] decompress(BytesRef bytes) throws DataFormatException {
     return decompress(bytes.bytes, bytes.offset, bytes.length);
   }
 
+  /** Decompress the byte array previously returned by
+   *  compress */
   public static byte[] decompress(byte[] value) throws DataFormatException {
     return decompress(value, 0, value.length);
   }

@@ -130,6 +134,8 @@ public class CompressionTools {
     return decompressString(value, 0, value.length);
   }
 
+  /** Decompress the byte array previously returned by
+   *  compressString back into a String */
   public static String decompressString(byte[] value, int offset, int length) throws DataFormatException {
     final byte[] bytes = decompress(value, offset, length);
     CharsRef result = new CharsRef(bytes.length);

@@ -137,6 +143,8 @@ public class CompressionTools {
     return new String(result.chars, 0, result.length);
   }
 
+  /** Decompress the byte array (referenced by the provided BytesRef)
+   *  previously returned by compressString back into a String */
   public static String decompressString(BytesRef bytes) throws DataFormatException {
     return decompressString(bytes.bytes, bytes.offset, bytes.length);
   }
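As a quick reference, the decompress variants documented above are the inverses of the corresponding compress methods; a minimal round trip looks like this (sketch; handling of DataFormatException omitted):

byte[] compressed = CompressionTools.compressString("some stored text");
String restored = CompressionTools.decompressString(compressed); // "some stored text"

byte[] packed = CompressionTools.compress(new byte[] {1, 2, 3});
byte[] unpacked = CompressionTools.decompress(packed); // {1, 2, 3}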
DateTools.java

@@ -185,7 +185,20 @@ public class DateTools {
   /** Specifies the time granularity. */
   public static enum Resolution {
 
-    YEAR(4), MONTH(6), DAY(8), HOUR(10), MINUTE(12), SECOND(14), MILLISECOND(17);
+    /** Limit a date's resolution to year granularity. */
+    YEAR(4),
+    /** Limit a date's resolution to month granularity. */
+    MONTH(6),
+    /** Limit a date's resolution to day granularity. */
+    DAY(8),
+    /** Limit a date's resolution to hour granularity. */
+    HOUR(10),
+    /** Limit a date's resolution to minute granularity. */
+    MINUTE(12),
+    /** Limit a date's resolution to second granularity. */
+    SECOND(14),
+    /** Limit a date's resolution to millisecond granularity. */
+    MILLISECOND(17);
 
     final int formatLen;
     final SimpleDateFormat format;//should be cloned before use, since it's not threadsafe
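A sketch of the granularity constants in use (java.util.Date assumed imported); at DAY resolution everything below the date is truncated:

// Encodes as "yyyyMMdd" (formatLen 8), e.g. "20120827".
String day = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);
long millis = DateTools.stringToTime(day); // back to epoch millis; may throw ParseException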
DerefBytesDocValuesField.java

@@ -44,23 +44,49 @@ import org.apache.lucene.util.BytesRef;
 public class DerefBytesDocValuesField extends StoredField {
 
   // TODO: ideally indexer figures out var vs fixed on its own!?
+  /**
+   * Type for indirect bytes DocValues: all with the same length
+   */
   public static final FieldType TYPE_FIXED_LEN = new FieldType();
   static {
     TYPE_FIXED_LEN.setDocValueType(DocValues.Type.BYTES_FIXED_DEREF);
     TYPE_FIXED_LEN.freeze();
   }
 
+  /**
+   * Type for indirect bytes DocValues: can have variable lengths
+   */
   public static final FieldType TYPE_VAR_LEN = new FieldType();
   static {
     TYPE_VAR_LEN.setDocValueType(DocValues.Type.BYTES_VAR_DEREF);
     TYPE_VAR_LEN.freeze();
   }
 
+  /**
+   * Create a new variable-length indirect DocValues field.
+   * <p>
+   * This calls
+   * {@link DerefBytesDocValuesField#DerefBytesDocValuesField(String, BytesRef, boolean)
+   * DerefBytesDocValuesField(name, bytes, false}, meaning by default
+   * it allows for values of different lengths. If your values are all
+   * the same length, use that constructor instead.
+   * @param name field name
+   * @param bytes binary content
+   * @throws IllegalArgumentException if the field name is null
+   */
   public DerefBytesDocValuesField(String name, BytesRef bytes) {
     super(name, TYPE_VAR_LEN);
     fieldsData = bytes;
   }
 
+  /**
+   * Create a new fixed or variable length indirect DocValues field.
+   * <p>
+   * @param name field name
+   * @param bytes binary content
+   * @param isFixedLength true if all values have the same length.
+   * @throws IllegalArgumentException if the field name is null
+   */
   public DerefBytesDocValuesField(String name, BytesRef bytes, boolean isFixedLength) {
     super(name, isFixedLength ? TYPE_FIXED_LEN : TYPE_VAR_LEN);
     fieldsData = bytes;
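A sketch of the two constructors just documented; the field names and byte lengths are illustrative:

// Variable-length binary content (the default, TYPE_VAR_LEN):
DerefBytesDocValuesField title = new DerefBytesDocValuesField("title", new BytesRef("lucene"));
// Fixed-length variant (TYPE_FIXED_LEN), when every value has the same byte length:
DerefBytesDocValuesField uuid = new DerefBytesDocValuesField("uuid", new BytesRef(new byte[16]), true);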
DocumentStoredFieldVisitor.java

@@ -97,6 +97,10 @@ public class DocumentStoredFieldVisitor extends StoredFieldVisitor {
     return fieldsToAdd == null || fieldsToAdd.contains(fieldInfo.name) ? Status.YES : Status.NO;
   }
 
+  /**
+   * Retrieve the visited document.
+   * @return StoredDocument populated with stored fields.
+   */
   public StoredDocument getDocument() {
     return doc;
   }
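getDocument() is the companion to visitor-based stored-field loading on IndexReader; a sketch, with reader and docID assumed to be in scope:

// Load only the "id" and "title" stored fields for one hit.
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("id", "title");
reader.document(docID, visitor);
StoredDocument doc = visitor.getDocument();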
DoubleDocValuesField.java

@@ -38,12 +38,21 @@ import org.apache.lucene.index.DocValues;
 
 public class DoubleDocValuesField extends StoredField {
 
+  /**
+   * Type for 64-bit double DocValues.
+   */
   public static final FieldType TYPE = new FieldType();
   static {
     TYPE.setDocValueType(DocValues.Type.FLOAT_64);
     TYPE.freeze();
   }
 
+  /**
+   * Creates a new DocValues field with the specified 64-bit double value
+   * @param name field name
+   * @param value 64-bit double value
+   * @throws IllegalArgumentException if the field name is null
+   */
   public DoubleDocValuesField(String name, double value) {
     super(name, TYPE);
     fieldsData = Double.valueOf(value);
DoubleField.java

@@ -114,6 +114,10 @@ import org.apache.lucene.util.NumericUtils;
 
 public final class DoubleField extends Field {
 
+  /**
+   * Type for a DoubleField that is not stored:
+   * normalization factors, frequencies, and positions are omitted.
+   */
   public static final FieldType TYPE_NOT_STORED = new FieldType();
   static {
     TYPE_NOT_STORED.setIndexed(true);

@@ -124,6 +128,10 @@ public final class DoubleField extends Field {
     TYPE_NOT_STORED.freeze();
   }
 
+  /**
+   * Type for a stored DoubleField:
+   * normalization factors, frequencies, and positions are omitted.
+   */
   public static final FieldType TYPE_STORED = new FieldType();
   static {
     TYPE_STORED.setIndexed(true);

@@ -137,14 +145,26 @@ public final class DoubleField extends Field {
 
   /** Creates a stored or un-stored DoubleField with the provided value
    *  and default <code>precisionStep</code> {@link
-   *  NumericUtils#PRECISION_STEP_DEFAULT} (4). */
+   *  NumericUtils#PRECISION_STEP_DEFAULT} (4).
+   *  @param name field name
+   *  @param value 64-bit double value
+   *  @param stored Store.YES if the content should also be stored
+   *  @throws IllegalArgumentException if the field name is null.
+   */
   public DoubleField(String name, double value, Store stored) {
     super(name, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
     fieldsData = Double.valueOf(value);
   }
 
   /** Expert: allows you to customize the {@link
-   *  FieldType}. */
+   *  FieldType}.
+   *  @param name field name
+   *  @param value 64-bit double value
+   *  @param type customized field type: must have {@link FieldType#numericType()}
+   *         of {@link FieldType.NumericType#DOUBLE}.
+   *  @throws IllegalArgumentException if the field name or type is null, or
+   *          if the field type does not have a DOUBLE numericType()
+   */
   public DoubleField(String name, double value, FieldType type) {
     super(name, type);
     if (type.numericType() != FieldType.NumericType.DOUBLE) {
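A sketch of both constructors documented above; every call below appears in this file or in FieldType:

// Common case: trie-indexed and also stored.
DoubleField price = new DoubleField("price", 9.99, Field.Store.YES);

// Expert case: customize the FieldType, e.g. a coarser precision step.
FieldType ft = new FieldType(DoubleField.TYPE_NOT_STORED);
ft.setNumericPrecisionStep(8);
ft.freeze();
DoubleField fastRange = new DoubleField("price", 9.99, ft);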
Field.java

@@ -61,23 +61,42 @@ import org.apache.lucene.index.FieldInvertState; // javadocs
  */
 public class Field implements IndexableField, StorableField {
 
+  /**
+   * Field's type
+   */
   protected final FieldType type;
+
+  /**
+   * Field's name
+   */
   protected final String name;
 
-  // Field's value:
+  /** Field's value */
   protected Object fieldsData;
 
-  // Pre-analyzed tokenStream for indexed fields; this is
-  // separate from fieldsData because you are allowed to
-  // have both; eg maybe field has a String value but you
-  // customize how it's tokenized:
+  /** Pre-analyzed tokenStream for indexed fields; this is
+   *  separate from fieldsData because you are allowed to
+   *  have both; eg maybe field has a String value but you
+   *  customize how it's tokenized */
   protected TokenStream tokenStream;
 
   private transient TokenStream internalTokenStream;
   private transient ReusableStringReader internalReader;
 
+  /**
+   * Field's boost
+   * @see #boost()
+   */
   protected float boost = 1.0f;
 
+  /**
+   * Expert: creates a field with no initial value.
+   * Intended only for custom Field subclasses.
+   * @param name field name
+   * @param type field type
+   * @throws IllegalArgumentException if either the name or type
+   *         is null.
+   */
   protected Field(String name, FieldType type) {
     if (name == null) {
       throw new IllegalArgumentException("name cannot be null");

@@ -91,6 +110,13 @@ public class Field implements IndexableField, StorableField {
 
   /**
    * Create field with Reader value.
+   * @param name field name
+   * @param reader reader value
+   * @param type field type
+   * @throws IllegalArgumentException if either the name or type
+   *         is null, or if the field's type is stored(), or
+   *         if tokenized() is false.
+   * @throws NullPointerException if the reader is null
    */
   public Field(String name, Reader reader, FieldType type) {
     if (name == null) {

@@ -116,6 +142,13 @@ public class Field implements IndexableField, StorableField {
 
   /**
    * Create field with TokenStream value.
+   * @param name field name
+   * @param tokenStream TokenStream value
+   * @param type field type
+   * @throws IllegalArgumentException if either the name or type
+   *         is null, or if the field's type is stored(), or
+   *         if tokenized() is false, or if indexed() is false.
+   * @throws NullPointerException if the tokenStream is null
    */
   public Field(String name, TokenStream tokenStream, FieldType type) {
     if (name == null) {

@@ -139,6 +172,15 @@ public class Field implements IndexableField, StorableField {
 
   /**
    * Create field with binary value.
+   *
+   * <p>NOTE: the provided byte[] is not copied so be sure
+   * not to change it until you're done with this field.
+   * @param name field name
+   * @param value byte array pointing to binary content (not copied)
+   * @param type field type
+   * @throws IllegalArgumentException if the field name is null,
+   *         or the field's type is indexed()
+   * @throws NullPointerException if the type is null
    */
   public Field(String name, byte[] value, FieldType type) {
     this(name, value, 0, value.length, type);

@@ -146,6 +188,17 @@ public class Field implements IndexableField, StorableField {
 
   /**
    * Create field with binary value.
+   *
+   * <p>NOTE: the provided byte[] is not copied so be sure
+   * not to change it until you're done with this field.
+   * @param name field name
+   * @param value byte array pointing to binary content (not copied)
+   * @param offset starting position of the byte array
+   * @param length valid length of the byte array
+   * @param type field type
+   * @throws IllegalArgumentException if the field name is null,
+   *         or the field's type is indexed()
+   * @throws NullPointerException if the type is null
    */
   public Field(String name, byte[] value, int offset, int length, FieldType type) {
     this(name, new BytesRef(value, offset, length), type);

@@ -156,6 +209,12 @@ public class Field implements IndexableField, StorableField {
    *
    * <p>NOTE: the provided BytesRef is not copied so be sure
    * not to change it until you're done with this field.
+   * @param name field name
+   * @param bytes BytesRef pointing to binary content (not copied)
+   * @param type field type
+   * @throws IllegalArgumentException if the field name is null,
+   *         or the field's type is indexed()
+   * @throws NullPointerException if the type is null
    */
   public Field(String name, BytesRef bytes, FieldType type) {
     if (name == null) {

@@ -173,6 +232,13 @@ public class Field implements IndexableField, StorableField {
 
   /**
    * Create field with String value.
+   * @param name field name
+   * @param value string value
+   * @param type field type
+   * @throws IllegalArgumentException if either the name or value
+   *         is null, or if the field's type is neither indexed() nor stored(),
+   *         or if indexed() is false but storeTermVectors() is true.
+   * @throws NullPointerException if the type is null
    */
   public Field(String name, String value, FieldType type) {
     if (name == null) {

@@ -214,7 +280,7 @@ public class Field implements IndexableField, StorableField {
   }
 
   /**
-   * The TokesStream for this field to be used when indexing, or null. If null,
+   * The TokenStream for this field to be used when indexing, or null. If null,
    * the Reader value or String value is analyzed to produce the indexed tokens.
    */
   public TokenStream tokenStreamValue() {

@@ -280,6 +346,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = value;
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setByteValue(byte value) {
     if (!(fieldsData instanceof Byte)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Byte");

@@ -287,6 +357,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = Byte.valueOf(value);
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setShortValue(short value) {
     if (!(fieldsData instanceof Short)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Short");

@@ -294,6 +368,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = Short.valueOf(value);
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setIntValue(int value) {
     if (!(fieldsData instanceof Integer)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Integer");

@@ -301,6 +379,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = Integer.valueOf(value);
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setLongValue(long value) {
     if (!(fieldsData instanceof Long)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Long");

@@ -308,6 +390,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = Long.valueOf(value);
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setFloatValue(float value) {
     if (!(fieldsData instanceof Float)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Float");

@@ -315,6 +401,10 @@ public class Field implements IndexableField, StorableField {
     fieldsData = Float.valueOf(value);
   }
 
+  /**
+   * Expert: change the value of this field. See
+   * {@link #setStringValue(String)}.
+   */
   public void setDoubleValue(double value) {
     if (!(fieldsData instanceof Double)) {
       throw new IllegalArgumentException("cannot change value type from " + fieldsData.getClass().getSimpleName() + " to Double");

@@ -341,23 +431,21 @@ public class Field implements IndexableField, StorableField {
     return name;
   }
 
+  /**
+   * {@inheritDoc}
+   * <p>
+   * The default value is <code>1.0f</code> (no boost).
+   * @see #setBoost(float)
+   */
   public float boost() {
     return boost;
   }
 
-  /** Sets the boost factor hits on this field. This value will be
-   * multiplied into the score of all hits on this this field of this
-   * document.
-   *
-   * <p>The boost is used to compute the norm factor for the field. By
-   * default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState, Norm)} method,
-   * the boost value is multiplied by the length normalization factor and then
-   * rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
-   * index. One should attempt to ensure that this product does not overflow
-   * the range of that encoding.
-   *
-   * @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState, Norm)
-   * @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)
+  /**
+   * Sets the boost factor on this field.
+   * @throws IllegalArgumentException if this field is not indexed,
+   *         or if it omits norms.
+   * @see #boost()
    */
   public void setBoost(float boost) {
     if (boost != 1.0f) {

@@ -406,9 +494,6 @@ public class Field implements IndexableField, StorableField {
     return type;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   public TokenStream tokenStream(Analyzer analyzer) throws IOException {
     if (!fieldType().indexed()) {
       return null;
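The expert set*Value methods above let a single Field instance be reused across many documents during bulk indexing, as long as the value's class never changes; a sketch:

DoubleField price = new DoubleField("price", 0.0, Field.Store.NO);
for (double v : new double[] {1.5, 2.5, 4.0}) {
  price.setDoubleValue(v); // fine: fieldsData stays a Double
  // price.setIntValue(7); // would throw IllegalArgumentException
  // ... doc.add(price) and writer.addDocument(doc) would go here ...
}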
@ -17,6 +17,7 @@ package org.apache.lucene.document;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer; // javadocs
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexableFieldType;
|
||||
|
@ -31,7 +32,16 @@ public class FieldType implements IndexableFieldType {
|
|||
/** Data type of the numeric value
|
||||
* @since 3.2
|
||||
*/
|
||||
public static enum NumericType {INT, LONG, FLOAT, DOUBLE}
|
||||
public static enum NumericType {
|
||||
/** 32-bit integer numeric type */
|
||||
INT,
|
||||
/** 64-bit long numeric type */
|
||||
LONG,
|
||||
/** 32-bit float numeric type */
|
||||
FLOAT,
|
||||
/** 64-bit double numeric type */
|
||||
DOUBLE
|
||||
}
|
||||
|
||||
private boolean indexed;
|
||||
private boolean stored;
|
||||
|
@ -47,6 +57,9 @@ public class FieldType implements IndexableFieldType {
|
|||
private int numericPrecisionStep = NumericUtils.PRECISION_STEP_DEFAULT;
|
||||
private DocValues.Type docValueType;
|
||||
|
||||
/**
|
||||
* Create a new mutable FieldType with all of the properties from <code>ref</code>
|
||||
*/
|
||||
public FieldType(FieldType ref) {
|
||||
this.indexed = ref.indexed();
|
||||
this.stored = ref.stored();
|
||||
|
@ -62,6 +75,9 @@ public class FieldType implements IndexableFieldType {
|
|||
// Do not copy frozen!
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new FieldType with default properties.
|
||||
*/
|
||||
public FieldType() {
|
||||
}
|
||||
|
||||
|
@ -80,100 +96,241 @@ public class FieldType implements IndexableFieldType {
|
|||
this.frozen = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>false</code>.
|
||||
* @see #setIndexed(boolean)
|
||||
*/
|
||||
public boolean indexed() {
|
||||
return this.indexed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set to <code>true</code> to index (invert) this field.
|
||||
* @param value true if this field should be indexed.
|
||||
* @throws IllegalStateException if this FieldType is frozen against
|
||||
* future modifications.
|
||||
* @see #indexed()
|
||||
*/
|
||||
public void setIndexed(boolean value) {
|
||||
checkIfFrozen();
|
||||
this.indexed = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>false</code>.
|
||||
* @see #setStored(boolean)
|
||||
*/
|
||||
public boolean stored() {
|
||||
return this.stored;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set to <code>true</code> to store this field.
|
||||
* @param value true if this field should be stored.
|
||||
* @throws IllegalStateException if this FieldType is frozen against
|
||||
* future modifications.
|
||||
* @see #stored()
|
||||
*/
|
||||
public void setStored(boolean value) {
|
||||
checkIfFrozen();
|
||||
this.stored = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>true</code>.
|
||||
* @see #setTokenized(boolean)
|
||||
*/
|
||||
public boolean tokenized() {
|
||||
return this.tokenized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set to <code>true</code> to tokenize this field's contents via the
|
||||
* configured {@link Analyzer}.
|
||||
* @param value true if this field should be tokenized.
|
||||
* @throws IllegalStateException if this FieldType is frozen against
|
||||
* future modifications.
|
||||
* @see #tokenized()
|
||||
*/
|
||||
public void setTokenized(boolean value) {
|
||||
checkIfFrozen();
|
||||
this.tokenized = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>false</code>.
|
||||
* @see #setStoreTermVectors(boolean)
|
||||
*/
|
||||
public boolean storeTermVectors() {
|
||||
return this.storeTermVectors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set to <code>true</code> if this field's indexed form should be also stored
|
||||
* into term vectors.
|
||||
* @param value true if this field should store term vectors.
|
||||
* @throws IllegalStateException if this FieldType is frozen against
|
||||
* future modifications.
|
||||
* @see #storeTermVectors()
|
||||
*/
|
||||
public void setStoreTermVectors(boolean value) {
|
||||
checkIfFrozen();
|
||||
this.storeTermVectors = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>false</code>.
|
||||
* @see #setStoreTermVectorOffsets(boolean)
|
||||
*/
|
||||
public boolean storeTermVectorOffsets() {
|
||||
return this.storeTermVectorOffsets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set to <code>true</code> to also store token character offsets into the term
|
||||
* vector for this field.
|
||||
* @param value true if this field should store term vector offsets.
|
||||
* @throws IllegalStateException if this FieldType is frozen against
|
||||
* future modifications.
|
||||
* @see #storeTermVectorOffsets()
|
||||
*/
|
||||
public void setStoreTermVectorOffsets(boolean value) {
|
||||
checkIfFrozen();
|
||||
this.storeTermVectorOffsets = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* <p>
|
||||
* The default is <code>false</code>.
|
||||
   * @see #setStoreTermVectorPositions(boolean)
   */
  public boolean storeTermVectorPositions() {
    return this.storeTermVectorPositions;
  }

  /**
   * Set to <code>true</code> to also store token positions into the term
   * vector for this field.
   * @param value true if this field should store term vector positions.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #storeTermVectorPositions()
   */
  public void setStoreTermVectorPositions(boolean value) {
    checkIfFrozen();
    this.storeTermVectorPositions = value;
  }

  /**
   * {@inheritDoc}
   * <p>
   * The default is <code>false</code>.
   * @see #setStoreTermVectorPayloads(boolean)
   */
  public boolean storeTermVectorPayloads() {
    return this.storeTermVectorPayloads;
  }

  /**
   * Set to <code>true</code> to also store token payloads into the term
   * vector for this field.
   * @param value true if this field should store term vector payloads.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #storeTermVectorPayloads()
   */
  public void setStoreTermVectorPayloads(boolean value) {
    checkIfFrozen();
    this.storeTermVectorPayloads = value;
  }

  /**
   * {@inheritDoc}
   * <p>
   * The default is <code>false</code>.
   * @see #setOmitNorms(boolean)
   */
  public boolean omitNorms() {
    return this.omitNorms;
  }

  /**
   * Set to <code>true</code> to omit normalization values for the field.
   * @param value true if this field should omit norms.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #omitNorms()
   */
  public void setOmitNorms(boolean value) {
    checkIfFrozen();
    this.omitNorms = value;
  }

  /**
   * {@inheritDoc}
   * <p>
   * The default is {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS}.
   * @see #setIndexOptions(FieldInfo.IndexOptions)
   */
  public IndexOptions indexOptions() {
    return this.indexOptions;
  }

  /**
   * Sets the indexing options for the field.
   * @param value indexing options
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #indexOptions()
   */
  public void setIndexOptions(IndexOptions value) {
    checkIfFrozen();
    this.indexOptions = value;
  }

  /**
   * Specifies the field's numeric type.
   * @param type numeric type, or null if the field has no numeric type.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #numericType()
   */
  public void setNumericType(NumericType type) {
    checkIfFrozen();
    numericType = type;
  }

  /** NumericDataType; if
   * non-null then the field's value will be indexed
   * numerically so that {@link NumericRangeQuery} can be
   * used at search time. */
  /**
   * NumericType: if non-null then the field's value will be indexed
   * numerically so that {@link NumericRangeQuery} can be used at
   * search time.
   * <p>
   * The default is <code>null</code> (no numeric type)
   * @see #setNumericType(NumericType)
   */
  public NumericType numericType() {
    return numericType;
  }

  /**
   * Sets the numeric precision step for the field.
   * @param precisionStep numeric precision step for the field
   * @throws IllegalArgumentException if precisionStep is less than 1.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #numericPrecisionStep()
   */
  public void setNumericPrecisionStep(int precisionStep) {
    checkIfFrozen();
    if (precisionStep < 1) {

@@ -182,7 +339,14 @@ public class FieldType implements IndexableFieldType {
    this.numericPrecisionStep = precisionStep;
  }

  /** Precision step for numeric field. */
  /**
   * Precision step for numeric field.
   * <p>
   * This has no effect if {@link #numericType()} returns null.
   * <p>
   * The default is {@link NumericUtils#PRECISION_STEP_DEFAULT}
   * @see #setNumericPrecisionStep(int)
   */
  public int numericPrecisionStep() {
    return numericPrecisionStep;
  }

@@ -239,11 +403,24 @@ public class FieldType implements IndexableFieldType {

  /* from StorableFieldType */

  /**
   * {@inheritDoc}
   * <p>
   * The default is <code>null</code> (no docValues)
   * @see #setDocValueType(DocValues.Type)
   */
  @Override
  public DocValues.Type docValueType() {
    return docValueType;
  }

  /**
   * Sets the field's DocValues.Type
   * @param type DocValues type, or null if no DocValues should be stored.
   * @throws IllegalStateException if this FieldType is frozen against
   *         future modifications.
   * @see #docValueType()
   */
  public void setDocValueType(DocValues.Type type) {
    checkIfFrozen();
    docValueType = type;
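Taken together, these setters make FieldType a per-field builder for index options. A minimal usage sketch, not part of this patch (the field name, value, and surrounding Document handling are assumed):

    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorPayloads(true);             // requires positions to be enabled
    ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
    ft.freeze();                                     // later setter calls throw IllegalStateException
    Field body = new Field("body", "some text", ft); // "body" is an assumed field name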
@@ -37,12 +37,21 @@ import org.apache.lucene.index.DocValues;

public class FloatDocValuesField extends StoredField {

  /**
   * Type for 32-bit float DocValues.
   */
  public static final FieldType TYPE = new FieldType();
  static {
    TYPE.setDocValueType(DocValues.Type.FLOAT_32);
    TYPE.freeze();
  }

  /**
   * Creates a new DocValues field with the specified 32-bit float value
   * @param name field name
   * @param value 32-bit float value
   * @throws IllegalArgumentException if the field name is null
   */
  public FloatDocValuesField(String name, float value) {
    super(name, TYPE);
    fieldsData = Float.valueOf(value);
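The numeric DocValues fields in this patch (FloatDocValuesField here, and the Int/Long/Short/PackedLong variants below) all follow the same pattern: a frozen static TYPE plus a single-value constructor. A minimal sketch, with `writer` an assumed open IndexWriter:

    Document doc = new Document();
    doc.add(new FloatDocValuesField("rating", 4.5f)); // stored as DocValues.Type.FLOAT_32
    writer.addDocument(doc);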
@@ -114,6 +114,10 @@ import org.apache.lucene.util.NumericUtils;

public final class FloatField extends Field {

  /**
   * Type for a FloatField that is not stored:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_NOT_STORED = new FieldType();
  static {
    TYPE_NOT_STORED.setIndexed(true);

@@ -124,6 +128,10 @@ public final class FloatField extends Field {
    TYPE_NOT_STORED.freeze();
  }

  /**
   * Type for a stored FloatField:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_STORED = new FieldType();
  static {
    TYPE_STORED.setIndexed(true);

@@ -137,14 +145,26 @@ public final class FloatField extends Field {

  /** Creates a stored or un-stored FloatField with the provided value
   * and default <code>precisionStep</code> {@link
   * NumericUtils#PRECISION_STEP_DEFAULT} (4). */
   * NumericUtils#PRECISION_STEP_DEFAULT} (4).
   * @param name field name
   * @param value 32-bit float value
   * @param stored Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name is null.
   */
  public FloatField(String name, float value, Store stored) {
    super(name, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
    fieldsData = Float.valueOf(value);
  }

  /** Expert: allows you to customize the {@link
   * FieldType}. */
   * FieldType}.
   * @param name field name
   * @param value 32-bit float value
   * @param type customized field type: must have {@link FieldType#numericType()}
   * of {@link FieldType.NumericType#FLOAT}.
   * @throws IllegalArgumentException if the field name or type is null, or
   * if the field type does not have a FLOAT numericType()
   */
  public FloatField(String name, float value, FieldType type) {
    super(name, type);
    if (type.numericType() != FieldType.NumericType.FLOAT) {
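FloatField (and IntField/LongField below) indexes the value in trie form so range queries work against it. A minimal sketch with an assumed `doc` and field name:

    doc.add(new FloatField("price", 9.99f, Field.Store.YES));
    // later, at search time:
    Query q = NumericRangeQuery.newFloatRange("price", 5.0f, 15.0f, true, true);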
@@ -37,12 +37,21 @@ import org.apache.lucene.index.DocValues;

public class IntDocValuesField extends StoredField {

  /**
   * Type for 32-bit integer DocValues.
   */
  public static final FieldType TYPE = new FieldType();
  static {
    TYPE.setDocValueType(DocValues.Type.FIXED_INTS_32);
    TYPE.freeze();
  }

  /**
   * Creates a new DocValues field with the specified 32-bit integer value
   * @param name field name
   * @param value 32-bit integer value
   * @throws IllegalArgumentException if the field name is null
   */
  public IntDocValuesField(String name, int value) {
    super(name, TYPE);
    fieldsData = Integer.valueOf(value);
@@ -114,6 +114,10 @@ import org.apache.lucene.util.NumericUtils;

public final class IntField extends Field {

  /**
   * Type for an IntField that is not stored:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_NOT_STORED = new FieldType();
  static {
    TYPE_NOT_STORED.setIndexed(true);

@@ -124,6 +128,10 @@ public final class IntField extends Field {
    TYPE_NOT_STORED.freeze();
  }

  /**
   * Type for a stored IntField:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_STORED = new FieldType();
  static {
    TYPE_STORED.setIndexed(true);

@@ -137,14 +145,26 @@ public final class IntField extends Field {

  /** Creates a stored or un-stored IntField with the provided value
   * and default <code>precisionStep</code> {@link
   * NumericUtils#PRECISION_STEP_DEFAULT} (4). */
   * NumericUtils#PRECISION_STEP_DEFAULT} (4).
   * @param name field name
   * @param value 32-bit integer value
   * @param stored Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name is null.
   */
  public IntField(String name, int value, Store stored) {
    super(name, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
    fieldsData = Integer.valueOf(value);
  }

  /** Expert: allows you to customize the {@link
   * FieldType}. */
   * FieldType}.
   * @param name field name
   * @param value 32-bit integer value
   * @param type customized field type: must have {@link FieldType#numericType()}
   * of {@link FieldType.NumericType#INT}.
   * @throws IllegalArgumentException if the field name or type is null, or
   * if the field type does not have an INT numericType()
   */
  public IntField(String name, int value, FieldType type) {
    super(name, type);
    if (type.numericType() != FieldType.NumericType.INT) {
@@ -37,12 +37,21 @@ import org.apache.lucene.index.DocValues;

public class LongDocValuesField extends StoredField {

  /**
   * Type for 64-bit long DocValues.
   */
  public static final FieldType TYPE = new FieldType();
  static {
    TYPE.setDocValueType(DocValues.Type.FIXED_INTS_64);
    TYPE.freeze();
  }

  /**
   * Creates a new DocValues field with the specified 64-bit long value
   * @param name field name
   * @param value 64-bit long value
   * @throws IllegalArgumentException if the field name is null
   */
  public LongDocValuesField(String name, long value) {
    super(name, TYPE);
    fieldsData = Long.valueOf(value);
@@ -124,6 +124,10 @@ import org.apache.lucene.util.NumericUtils;

public final class LongField extends Field {

  /**
   * Type for a LongField that is not stored:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_NOT_STORED = new FieldType();
  static {
    TYPE_NOT_STORED.setIndexed(true);

@@ -134,6 +138,10 @@ public final class LongField extends Field {
    TYPE_NOT_STORED.freeze();
  }

  /**
   * Type for a stored LongField:
   * normalization factors, frequencies, and positions are omitted.
   */
  public static final FieldType TYPE_STORED = new FieldType();
  static {
    TYPE_STORED.setIndexed(true);

@@ -147,14 +155,26 @@ public final class LongField extends Field {

  /** Creates a stored or un-stored LongField with the provided value
   * and default <code>precisionStep</code> {@link
   * NumericUtils#PRECISION_STEP_DEFAULT} (4). */
   * NumericUtils#PRECISION_STEP_DEFAULT} (4).
   * @param name field name
   * @param value 64-bit long value
   * @param stored Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name is null.
   */
  public LongField(String name, long value, Store stored) {
    super(name, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
    fieldsData = Long.valueOf(value);
  }

  /** Expert: allows you to customize the {@link
   * FieldType}. */
   * FieldType}.
   * @param name field name
   * @param value 64-bit long value
   * @param type customized field type: must have {@link FieldType#numericType()}
   * of {@link FieldType.NumericType#LONG}.
   * @throws IllegalArgumentException if the field name or type is null, or
   * if the field type does not have a LONG numericType()
   */
  public LongField(String name, long value, FieldType type) {
    super(name, type);
    if (type.numericType() != FieldType.NumericType.LONG) {
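The expert constructors pair with FieldType.setNumericPrecisionStep() when the default trie granularity is not wanted. A minimal sketch (assumed names; `t0`/`t1` are hypothetical long bounds, and the same precision step must be repeated on the query side):

    FieldType ft = new FieldType(LongField.TYPE_NOT_STORED);
    ft.setNumericPrecisionStep(8);  // default is NumericUtils.PRECISION_STEP_DEFAULT (4)
    ft.freeze();
    doc.add(new LongField("timestamp", System.currentTimeMillis(), ft));
    Query q = NumericRangeQuery.newLongRange("timestamp", 8, t0, t1, true, true);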
@@ -41,6 +41,9 @@ import org.apache.lucene.index.AtomicReader; // javadocs

public class PackedLongDocValuesField extends StoredField {

  /**
   * Type for packed long DocValues.
   */
  public static final FieldType TYPE = new FieldType();
  static {
    TYPE.setDocValueType(DocValues.Type.VAR_INTS);

@@ -48,6 +51,12 @@ public class PackedLongDocValuesField extends StoredField {
    TYPE.freeze();
  }

  /**
   * Creates a new DocValues field with the specified long value
   * @param name field name
   * @param value 64-bit long value
   * @throws IllegalArgumentException if the field name is null
   */
  public PackedLongDocValuesField(String name, long value) {
    super(name, TYPE);
    fieldsData = Long.valueOf(value);
@@ -38,12 +38,21 @@ import org.apache.lucene.index.DocValues;

public class ShortDocValuesField extends StoredField {

  /**
   * Type for 16-bit short DocValues.
   */
  public static final FieldType TYPE = new FieldType();
  static {
    TYPE.setDocValueType(DocValues.Type.FIXED_INTS_16);
    TYPE.freeze();
  }

  /**
   * Creates a new DocValues field with the specified 16-bit short value
   * @param name field name
   * @param value 16-bit short value
   * @throws IllegalArgumentException if the field name is null
   */
  public ShortDocValuesField(String name, short value) {
    super(name, TYPE);
    fieldsData = Short.valueOf(value);
@@ -40,22 +40,47 @@ import org.apache.lucene.util.BytesRef;
public class SortedBytesDocValuesField extends StoredField {

  // TODO: ideally indexer figures out var vs fixed on its own!?
  /**
   * Type for sorted bytes DocValues: all with the same length
   */
  public static final FieldType TYPE_FIXED_LEN = new FieldType();
  static {
    TYPE_FIXED_LEN.setDocValueType(DocValues.Type.BYTES_FIXED_SORTED);
    TYPE_FIXED_LEN.freeze();
  }

  /**
   * Type for sorted bytes DocValues: can have variable lengths
   */
  public static final FieldType TYPE_VAR_LEN = new FieldType();
  static {
    TYPE_VAR_LEN.setDocValueType(DocValues.Type.BYTES_VAR_SORTED);
    TYPE_VAR_LEN.freeze();
  }

  /**
   * Create a new variable-length sorted DocValues field.
   * <p>
   * This calls
   * {@link SortedBytesDocValuesField#SortedBytesDocValuesField(String, BytesRef, boolean)
   * SortedBytesDocValuesField(name, bytes, false)}, meaning by default
   * it allows for values of different lengths. If your values are all
   * the same length, use that constructor instead.
   * @param name field name
   * @param bytes binary content
   * @throws IllegalArgumentException if the field name is null
   */
  public SortedBytesDocValuesField(String name, BytesRef bytes) {
    this(name, bytes, false);
  }

  /**
   * Create a new fixed or variable length sorted DocValues field.
   * @param name field name
   * @param bytes binary content
   * @param isFixedLength true if all values have the same length.
   * @throws IllegalArgumentException if the field name is null
   */
  public SortedBytesDocValuesField(String name, BytesRef bytes, boolean isFixedLength) {
    super(name, isFixedLength ? TYPE_FIXED_LEN : TYPE_VAR_LEN);
    fieldsData = bytes;
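A minimal sketch (assumed field names) of the two variants:

    doc.add(new SortedBytesDocValuesField("country", new BytesRef("Portugal"))); // variable-length
    doc.add(new SortedBytesDocValuesField("code", new BytesRef("PT"), true));    // fixed-length: every value is 2 bytes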
@@ -27,6 +27,9 @@ import org.apache.lucene.util.BytesRef;
 * return the field and its value. */
public class StoredField extends Field {

  /**
   * Type for a stored-only field.
   */
  public final static FieldType TYPE;
  static {
    TYPE = new FieldType();

@@ -34,10 +37,28 @@ public class StoredField extends Field {
    TYPE.freeze();
  }

  /**
   * Expert: create a stored-only field with the given type, without an
   * initial value.
   * @param name field name
   * @param type custom {@link FieldType} for this field
   * @throws IllegalArgumentException if the field name is null.
   */
  protected StoredField(String name, FieldType type) {
    super(name, type);
  }

  /**
   * Expert: allows you to customize the {@link
   * FieldType}.
   * <p>NOTE: the provided BytesRef is not copied so be sure
   * not to change it until you're done with this field.
   * @param name field name
   * @param bytes BytesRef pointing to binary content (not copied)
   * @param type custom {@link FieldType} for this field
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, BytesRef bytes, FieldType type) {
    super(name, bytes, type);
  }

@@ -46,14 +67,38 @@ public class StoredField extends Field {
    super(name, value, TYPE);
  }

  /**
   * Create a stored-only field with the given binary value.
   * <p>NOTE: the provided byte[] is not copied so be sure
   * not to change it until you're done with this field.
   * @param name field name
   * @param value byte array pointing to binary content (not copied)
   * @param offset starting position of the byte array
   * @param length valid length of the byte array
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, byte[] value, int offset, int length) {
    super(name, value, offset, length, TYPE);
  }

  /**
   * Create a stored-only field with the given binary value.
   * <p>NOTE: the provided BytesRef is not copied so be sure
   * not to change it until you're done with this field.
   * @param name field name
   * @param value BytesRef pointing to binary content (not copied)
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, BytesRef value) {
    super(name, value, TYPE);
  }

  /**
   * Create a stored-only field with the given string value.
   * @param name field name
   * @param value string value
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public StoredField(String name, String value) {
    super(name, value, TYPE);
  }

@@ -63,21 +108,45 @@ public class StoredField extends Field {
  }

  // TODO: not great but maybe not a big problem?
  /**
   * Create a stored-only field with the given integer value.
   * @param name field name
   * @param value integer value
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, int value) {
    super(name, TYPE);
    fieldsData = value;
  }

  /**
   * Create a stored-only field with the given float value.
   * @param name field name
   * @param value float value
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, float value) {
    super(name, TYPE);
    fieldsData = value;
  }

  /**
   * Create a stored-only field with the given long value.
   * @param name field name
   * @param value long value
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, long value) {
    super(name, TYPE);
    fieldsData = value;
  }

  /**
   * Create a stored-only field with the given double value.
   * @param name field name
   * @param value double value
   * @throws IllegalArgumentException if the field name is null.
   */
  public StoredField(String name, double value) {
    super(name, TYPE);
    fieldsData = value;
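StoredField accepts string, binary, and numeric values through overloaded constructors; the stored-only TYPE means none of them are searchable. A minimal sketch (assumed names; `imageBytes` is a hypothetical byte[]):

    doc.add(new StoredField("title", "Lucene in Action"));
    doc.add(new StoredField("price", 9.99f));
    doc.add(new StoredField("thumbnail", imageBytes)); // the byte[] is referenced, not copied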
@@ -43,23 +43,49 @@ import org.apache.lucene.util.BytesRef;
public class StraightBytesDocValuesField extends StoredField {

  // TODO: ideally indexer figures out var vs fixed on its own!?
  /**
   * Type for direct bytes DocValues: all with the same length
   */
  public static final FieldType TYPE_FIXED_LEN = new FieldType();
  static {
    TYPE_FIXED_LEN.setDocValueType(DocValues.Type.BYTES_FIXED_STRAIGHT);
    TYPE_FIXED_LEN.freeze();
  }

  /**
   * Type for direct bytes DocValues: can have variable lengths
   */
  public static final FieldType TYPE_VAR_LEN = new FieldType();
  static {
    TYPE_VAR_LEN.setDocValueType(DocValues.Type.BYTES_VAR_STRAIGHT);
    TYPE_VAR_LEN.freeze();
  }

  /**
   * Create a new variable-length direct DocValues field.
   * <p>
   * This calls
   * {@link StraightBytesDocValuesField#StraightBytesDocValuesField(String, BytesRef, boolean)
   * StraightBytesDocValuesField(name, bytes, false)}, meaning by default
   * it allows for values of different lengths. If your values are all
   * the same length, use that constructor instead.
   * @param name field name
   * @param bytes binary content
   * @throws IllegalArgumentException if the field name is null
   */
  public StraightBytesDocValuesField(String name, BytesRef bytes) {
    super(name, TYPE_VAR_LEN);
    fieldsData = bytes;
  }

  /**
   * Create a new fixed or variable length direct DocValues field.
   * @param name field name
   * @param bytes binary content
   * @param isFixedLength true if all values have the same length.
   * @throws IllegalArgumentException if the field name is null
   */
  public StraightBytesDocValuesField(String name, BytesRef bytes, boolean isFixedLength) {
    super(name, isFixedLength ? TYPE_FIXED_LEN : TYPE_VAR_LEN);
    fieldsData = bytes;
@@ -50,7 +50,12 @@ public final class StringField extends Field {
    TYPE_STORED.freeze();
  }

  /** Creates a new StringField. */
  /** Creates a new StringField.
   * @param name field name
   * @param value String value
   * @param stored Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public StringField(String name, String value, Store stored) {
    super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
  }
@@ -27,10 +27,10 @@ import org.apache.lucene.analysis.TokenStream;

public final class TextField extends Field {

  /* Indexed, tokenized, not stored. */
  /** Indexed, tokenized, not stored. */
  public static final FieldType TYPE_NOT_STORED = new FieldType();

  /* Indexed, tokenized, stored. */
  /** Indexed, tokenized, stored. */
  public static final FieldType TYPE_STORED = new FieldType();

  static {

@@ -46,17 +46,32 @@ public final class TextField extends Field {

  // TODO: add sugar for term vectors...?

  /** Creates a new un-stored TextField with Reader value. */
  /** Creates a new un-stored TextField with Reader value.
   * @param name field name
   * @param reader reader value
   * @throws IllegalArgumentException if the field name is null
   * @throws NullPointerException if the reader is null
   */
  public TextField(String name, Reader reader) {
    super(name, reader, TYPE_NOT_STORED);
  }

  /** Creates a new TextField with String value. */
  /** Creates a new TextField with String value.
   * @param name field name
   * @param value string value
   * @param store Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public TextField(String name, String value, Store store) {
    super(name, value, store == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
  }

  /** Creates a new un-stored TextField with TokenStream value. */
  /** Creates a new un-stored TextField with TokenStream value.
   * @param name field name
   * @param stream TokenStream value
   * @throws IllegalArgumentException if the field name is null.
   * @throws NullPointerException if the tokenStream is null
   */
  public TextField(String name, TokenStream stream) {
    super(name, stream, TYPE_NOT_STORED);
  }
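StringField and TextField are the two common sugar types: StringField indexes the whole value as a single token (ids, keywords), while TextField runs the value through the Analyzer. A minimal sketch (assumed names; `file` is a hypothetical java.io.File):

    doc.add(new StringField("id", "doc-42", Field.Store.YES));                  // one token, exact match
    doc.add(new TextField("body", "the quick brown fox", Field.Store.NO));      // analyzed full text
    doc.add(new TextField("contents", new BufferedReader(new FileReader(file)))); // un-stored, streamed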
@@ -51,6 +51,7 @@ import org.apache.lucene.store.Directory;
public abstract class DirectoryReader extends BaseCompositeReader<AtomicReader> {
  public static final int DEFAULT_TERMS_INDEX_DIVISOR = 1;

  /** The index directory. */
  protected final Directory directory;

  /** Returns an IndexReader reading the index in the given
@@ -31,7 +31,9 @@ import org.apache.lucene.index.DocValues.Type;
 **/

public final class FieldInfo {
  /** Field's name */
  public final String name;
  /** Internal field number */
  public final int number;

  private boolean indexed;

@@ -55,14 +57,29 @@ public final class FieldInfo {
    // NOTE: order is important here; FieldInfo uses this
    // order to merge two conflicting IndexOptions (always
    // "downgrades" by picking the lowest).
    /** only documents are indexed: term frequencies and positions are omitted */
    /**
     * Only documents are indexed: term frequencies and positions are omitted.
     * Phrase and other positional queries on the field will throw an exception, and scoring
     * will behave as if any term in the document appears only once.
     */
    // TODO: maybe rename to just DOCS?
    DOCS_ONLY,
    /** only documents and term frequencies are indexed: positions are omitted */
    /**
     * Only documents and term frequencies are indexed: positions are omitted.
     * This enables normal scoring, except Phrase and other positional queries
     * will throw an exception.
     */
    DOCS_AND_FREQS,
    /** documents, frequencies and positions */
    /**
     * Indexes documents, frequencies and positions.
     * This is a typical default for full-text search: full scoring is enabled
     * and positional queries are supported.
     */
    DOCS_AND_FREQS_AND_POSITIONS,
    /** documents, frequencies, positions and offsets */
    /**
     * Indexes documents, frequencies, positions and offsets.
     * Character offsets are encoded alongside the positions.
     */
    DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
  };

@@ -149,27 +166,27 @@ public final class FieldInfo {
    assert checkConsistency();
  }

  /** @return IndexOptions for the field, or null if the field is not indexed */
  /** Returns IndexOptions for the field, or null if the field is not indexed */
  public IndexOptions getIndexOptions() {
    return indexOptions;
  }

  /**
   * @return true if this field has any docValues.
   * Returns true if this field has any docValues.
   */
  public boolean hasDocValues() {
    return docValueType != null;
  }

  /**
   * @return {@link DocValues.Type} of the docValues. this may be null if the field has no docvalues.
   * Returns {@link DocValues.Type} of the docValues. this may be null if the field has no docvalues.
   */
  public DocValues.Type getDocValuesType() {
    return docValueType;
  }

  /**
   * @return {@link DocValues.Type} of the norm. this may be null if the field has no norms.
   * Returns {@link DocValues.Type} of the norm. this may be null if the field has no norms.
   */
  public DocValues.Type getNormType() {
    return normType;

@@ -193,35 +210,35 @@ public final class FieldInfo {
  }

  /**
   * @return true if norms are explicitly omitted for this field
   * Returns true if norms are explicitly omitted for this field
   */
  public boolean omitsNorms() {
    return omitNorms;
  }

  /**
   * @return true if this field actually has any norms.
   * Returns true if this field actually has any norms.
   */
  public boolean hasNorms() {
    return normType != null;
  }

  /**
   * @return true if this field is indexed.
   * Returns true if this field is indexed.
   */
  public boolean isIndexed() {
    return indexed;
  }

  /**
   * @return true if any payloads exist for this field.
   * Returns true if any payloads exist for this field.
   */
  public boolean hasPayloads() {
    return storePayloads;
  }

  /**
   * @return true if any term vectors exist for this field.
   * Returns true if any term vectors exist for this field.
   */
  public boolean hasVectors() {
    return storeTermVector;

@@ -256,7 +273,7 @@ public final class FieldInfo {
  }

  /**
   * @return internal codec attributes map. May be null if no mappings exist.
   * Returns internal codec attributes map. May be null if no mappings exist.
   */
  public Map<String,String> attributes() {
    return attributes;
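The IndexOptions ordering matters because conflicting settings for one field are merged by downgrading to the lowest value. A minimal sketch (assumed names) of opting a field out of positions entirely:

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY); // boolean matching only; a PhraseQuery on this field will throw
    ft.freeze();
    doc.add(new Field("tags", "lucene search", ft));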
@@ -44,6 +44,9 @@ public class FieldInfos implements Iterable<FieldInfo> {
  private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
  private final Collection<FieldInfo> values; // for an unmodifiable iterator

  /**
   * Constructs a new FieldInfos from an array of FieldInfo objects
   */
  public FieldInfos(FieldInfo[] infos) {
    boolean hasVectors = false;
    boolean hasProx = false;

@@ -98,30 +101,22 @@ public class FieldInfos implements Iterable<FieldInfo> {
    return hasOffsets;
  }

  /**
   * @return true if at least one field has any vectors
   */
  /** Returns true if any fields have vectors */
  public boolean hasVectors() {
    return hasVectors;
  }

  /**
   * @return true if at least one field has any norms
   */
  /** Returns true if any fields have norms */
  public boolean hasNorms() {
    return hasNorms;
  }

  /**
   * @return true if at least one field has doc values
   */
  /** Returns true if any fields have DocValues */
  public boolean hasDocValues() {
    return hasDocValues;
  }

  /**
   * @return number of fields
   */
  /** Returns the number of fields */
  public int size() {
    assert byNumber.size() == byName.size();
    return byNumber.size();
@@ -40,8 +40,13 @@ public class FilterAtomicReader extends AtomicReader {
  /** Base class for filtering {@link Fields}
   * implementations. */
  public static class FilterFields extends Fields {
    /** The underlying Fields instance. */
    protected final Fields in;

    /**
     * Creates a new FilterFields.
     * @param in the underlying Fields instance.
     */
    public FilterFields(Fields in) {
      this.in = in;
    }

@@ -65,8 +70,13 @@ public class FilterAtomicReader extends AtomicReader {
  /** Base class for filtering {@link Terms}
   * implementations. */
  public static class FilterTerms extends Terms {
    /** The underlying Terms instance. */
    protected final Terms in;

    /**
     * Creates a new FilterTerms
     * @param in the underlying Terms instance.
     */
    public FilterTerms(Terms in) {
      this.in = in;
    }

@@ -124,8 +134,13 @@ public class FilterAtomicReader extends AtomicReader {

  /** Base class for filtering {@link TermsEnum} implementations. */
  public static class FilterTermsEnum extends TermsEnum {
    /** The underlying TermsEnum instance. */
    protected final TermsEnum in;

    /**
     * Creates a new FilterTermsEnum
     * @param in the underlying TermsEnum instance.
     */
    public FilterTermsEnum(TermsEnum in) { this.in = in; }

    @Override

@@ -201,8 +216,13 @@ public class FilterAtomicReader extends AtomicReader {

  /** Base class for filtering {@link DocsEnum} implementations. */
  public static class FilterDocsEnum extends DocsEnum {
    /** The underlying DocsEnum instance. */
    protected final DocsEnum in;

    /**
     * Create a new FilterDocsEnum
     * @param in the underlying DocsEnum instance.
     */
    public FilterDocsEnum(DocsEnum in) {
      this.in = in;
    }

@@ -235,8 +255,13 @@ public class FilterAtomicReader extends AtomicReader {

  /** Base class for filtering {@link DocsAndPositionsEnum} implementations. */
  public static class FilterDocsAndPositionsEnum extends DocsAndPositionsEnum {
    /** The underlying DocsAndPositionsEnum instance. */
    protected final DocsAndPositionsEnum in;

    /**
     * Create a new FilterDocsAndPositionsEnum
     * @param in the underlying DocsAndPositionsEnum instance.
     */
    public FilterDocsAndPositionsEnum(DocsAndPositionsEnum in) {
      this.in = in;
    }

@@ -287,6 +312,7 @@ public class FilterAtomicReader extends AtomicReader {
    }
  }

  /** The underlying AtomicReader. */
  protected final AtomicReader in;

  /**
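These Filter* bases all follow the same delegate-through-`in` pattern; a subclass overrides only what it wants to change. A minimal sketch (a hypothetical class, assuming the usual FilterAtomicReader contract):

    class CountingReader extends FilterAtomicReader {
      CountingReader(AtomicReader in) {
        super(in);
      }
      @Override
      public Fields fields() throws IOException {
        System.out.println("fields() requested"); // observe, then delegate unchanged
        return super.fields();
      }
    }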
@@ -48,7 +48,20 @@ public abstract class FilteredTermsEnum extends TermsEnum {
   * the enum should call {@link #nextSeekTerm} and step forward.
   * @see #accept(BytesRef)
   */
  protected static enum AcceptStatus {YES, YES_AND_SEEK, NO, NO_AND_SEEK, END};
  protected static enum AcceptStatus {
    /** Accept the term and position the enum at the next term. */
    YES,
    /** Accept the term and advance ({@link FilteredTermsEnum#nextSeekTerm(BytesRef)})
     * to the next term. */
    YES_AND_SEEK,
    /** Reject the term and position the enum at the next term. */
    NO,
    /** Reject the term and advance ({@link FilteredTermsEnum#nextSeekTerm(BytesRef)})
     * to the next term. */
    NO_AND_SEEK,
    /** Reject the term and stop enumerating. */
    END
  };

  /** Return if term is accepted, not accepted or the iteration should end
   * (and possibly seek).
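A minimal sketch of implementing accept(): a hypothetical subclass modeled on the usual prefix-enumeration pattern (imports assumed: org.apache.lucene.util.BytesRef, org.apache.lucene.util.StringHelper):

    class PrefixFilteredTermsEnum extends FilteredTermsEnum {
      private final BytesRef prefix;
      PrefixFilteredTermsEnum(TermsEnum in, BytesRef prefix) {
        super(in);
        this.prefix = prefix;
        setInitialSeekTerm(prefix); // start at the first candidate term
      }
      @Override
      protected AcceptStatus accept(BytesRef term) {
        // terms arrive in sorted order, so the first mismatch ends the iteration
        return StringHelper.startsWith(term, prefix) ? AcceptStatus.YES : AcceptStatus.END;
      }
    }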
@@ -40,6 +40,9 @@ import org.apache.lucene.codecs.Codec;
 */

public final class IndexFileNames {

  /** No instance */
  private IndexFileNames() {}

  /** Name of the index segment file */
  public static final String SEGMENTS = "segments";

@@ -184,6 +187,10 @@ public final class IndexFileNames {
    return filename;
  }

  /**
   * Removes the extension (anything after the first '.'),
   * otherwise returns the original filename.
   */
  public static String stripExtension(String filename) {
    int idx = filename.indexOf('.');
    if (idx != -1) {
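For instance (a sketch; the file names are illustrative):

    String base = IndexFileNames.stripExtension("_0.cfs");   // -> "_0"
    String same = IndexFileNames.stripExtension("segments"); // no '.', returned unchanged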
@@ -243,7 +243,8 @@ public abstract class IndexReader implements Closeable {
  }

  /**
   * @throws AlreadyClosedException if this IndexReader is closed
   * Throws AlreadyClosedException if this IndexReader or any
   * of its child readers is closed, otherwise returns.
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (refCount.get() <= 0) {
@@ -549,6 +549,14 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
    }
  }

  /**
   * Used internally to throw an {@link
   * AlreadyClosedException} if this IndexWriter has been
   * closed.
   * <p>
   * Calls {@link #ensureOpen(boolean) ensureOpen(true)}.
   * @throws AlreadyClosedException if this IndexWriter is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    ensureOpen(true);
  }

@@ -1030,6 +1038,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
    return count;
  }

  /**
   * Returns true if this index has deletions (including buffered deletions).
   */
  public synchronized boolean hasDeletions() {
    ensureOpen();
    if (bufferedDeletesStream.any()) {
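Because public methods such as hasDeletions() call ensureOpen() first, using a writer after close() fails fast. A minimal sketch (the Directory and Analyzer setup is assumed):

    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
    w.close();
    w.hasDeletions(); // throws AlreadyClosedException via ensureOpen()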
@@ -22,6 +22,8 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.similarities.DefaultSimilarity; // javadocs
import org.apache.lucene.search.similarities.Similarity; // javadocs
import org.apache.lucene.util.BytesRef;

// TODO: how to handle versioning here...?

@@ -46,6 +48,25 @@ public interface IndexableField extends GeneralField {
   */
  public TokenStream tokenStream(Analyzer analyzer) throws IOException;

  /** Field boost (you must pre-multiply in any doc boost). */
  /**
   * Returns the field's index-time boost.
   * <p>
   * Only fields can have an index-time boost; if you want to simulate
   * a "document boost", then you must pre-multiply it across all the
   * relevant fields yourself.
   * <p>The boost is used to compute the norm factor for the field. By
   * default, in the {@link Similarity#computeNorm(FieldInvertState, Norm)} method,
   * the boost value is multiplied by the length normalization factor and then
   * rounded by {@link DefaultSimilarity#encodeNormValue(float)} before it is stored in the
   * index. One should attempt to ensure that this product does not overflow
   * the range of that encoding.
   * <p>
   * It is illegal to return a boost other than 1.0f for a field that is not
   * indexed ({@link IndexableFieldType#indexed()} is false) or omits normalization values
   * ({@link IndexableFieldType#omitNorms()} returns true).
   *
   * @see Similarity#computeNorm(FieldInvertState, Norm)
   * @see DefaultSimilarity#encodeNormValue(float)
   */
  public float boost();
}
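On the concrete Field class, boost() is backed by a settable value. A minimal sketch (assumed field name; the field must be indexed with norms for the boost to take effect):

    Field title = new TextField("title", "Apache Lucene", Field.Store.NO);
    title.setBoost(2.0f); // title.boost() now returns 2.0f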
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions;

/**

@@ -31,29 +32,68 @@ public interface IndexableFieldType {
  /** True if the field's value should be stored */
  public boolean stored();

  /** True if this field's value should be analyzed */
  /**
   * True if this field's value should be analyzed by the
   * {@link Analyzer}.
   * <p>
   * This has no effect if {@link #indexed()} returns false.
   */
  public boolean tokenized();

  /** True if term vectors should be indexed */
  /**
   * True if this field's indexed form should also be stored
   * into term vectors.
   * <p>
   * This builds a miniature inverted-index for this field which
   * can be accessed in a document-oriented way from
   * {@link IndexReader#getTermVector(int,String)}.
   * <p>
   * This option is illegal if {@link #indexed()} returns false.
   */
  public boolean storeTermVectors();

  /** True if term vector offsets should be indexed */
  /**
   * True if this field's token character offsets should also
   * be stored into term vectors.
   * <p>
   * This option is illegal if term vectors are not enabled for the field
   * ({@link #storeTermVectors()} is false)
   */
  public boolean storeTermVectorOffsets();

  /** True if term vector positions should be indexed */
  /**
   * True if this field's token positions should also be stored
   * into the term vectors.
   * <p>
   * This option is illegal if term vectors are not enabled for the field
   * ({@link #storeTermVectors()} is false).
   */
  public boolean storeTermVectorPositions();

  /** True if term vector payloads should be indexed */
  /**
   * True if this field's token payloads should also be stored
   * into the term vectors.
   * <p>
   * This option is illegal if term vector positions are not enabled
   * for the field ({@link #storeTermVectorPositions()} is false).
   */
  public boolean storeTermVectorPayloads();

  /** True if norms should not be indexed */
  /**
   * True if normalization values should be omitted for the field.
   * <p>
   * This saves memory, but at the expense of scoring quality (length normalization
   * will be disabled), and if you omit norms, you cannot use index-time boosts.
   */
  public boolean omitNorms();

  /** {@link IndexOptions}, describing what should be
   * recorded into the inverted index */
  public IndexOptions indexOptions();

  /** DocValues type; if non-null then the field's value
   * will be indexed into docValues */
  /**
   * DocValues {@link DocValues.Type}: if non-null then the field's value
   * will be indexed into docValues.
   */
  public DocValues.Type docValueType();
}
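These "illegal" combinations are enforced when a document is indexed, not when the type is configured. A minimal sketch (assumed names) of a combination that is expected to be rejected:

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorOffsets(true); // offsets without term vectors
    ft.freeze();
    // adding a Field with this type is expected to fail when the document is indexed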