mirror of https://github.com/apache/lucene.git
LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards branch is now included in the svn repository using "svn copy" after release.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@924207 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 5023a08ace
commit 675597141b

CHANGES.txt
@@ -238,9 +238,13 @@ Optimizations

Build

* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
  into core, and moved the ICU-based collation support into contrib/icu.
  (Robert Muir)

* LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards branch
  is now included in the svn repository using "svn copy" after release.
  (Uwe Schindler)

Test Cases
@@ -0,0 +1,13 @@
This folder contains the src/ folder of the previous Lucene major version.

The test-backwards ANT task compiles the core classes of the previous version and its tests
against these class files. After that, the compiled test classes are run against the new
lucene-core.jar file.

After branching a new Lucene major version (branch name "lucene_X_Y") do the following:

* svn rm backwards/src/
* svn cp https://svn.apache.org/repos/asf/lucene/java/branches/lucene_X_Y/src/ backwards/src/
* Check that everything is correct: the backwards folder should contain a src/ folder
  that now contains java, test, demo, .... The files should be the ones from the branch.
* Run "ant test-backwards"
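For example, right after branching 3.1 the sequence would be (the branch name lucene_3_1 here is only illustrative):

    svn rm backwards/src/
    svn cp https://svn.apache.org/repos/asf/lucene/java/branches/lucene_3_1/src/ backwards/src/
    ant test-backwards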
@@ -0,0 +1,253 @@
<?xml version="1.0"?>

<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<project name="lucene-demo" default="compile-demo" basedir=".">
  <dirname file="${ant.file.common}" property="common.dir"/>

  <property name="version" value="@PLACEHOLDER_version@"/>
  <property name="javac.source" value="@PLACEHOLDER_javac.source@"/>
  <property name="javac.target" value="@PLACEHOLDER_javac.target@"/>

  <property name="build.dir" location="build"/>

  <property name="core.name" value="lucene-core-${version}"/>
  <property name="demo.name" value="lucene-demos-${version}"/>
  <property name="demo.war.name" value="luceneweb"/>

  <property name="manifest.file" location="${build.dir}/MANIFEST.MF"/>

  <!-- Build classpath -->
  <path id="classpath">
    <pathelement location="${common.dir}/${core.name}.jar"/>
  </path>

  <path id="demo.classpath">
    <path refid="classpath"/>
    <pathelement location="${build.dir}/classes/demo"/>
  </path>

  <available
    property="jar.core.present"
    type="file"
    file="${common.dir}/${core.name}.jar"
  />

  <target name="jar.core-check">
    <fail unless="jar.core.present">
      ##################################################################
      ${common.dir}/${core.name}.jar not found.
      ##################################################################
    </fail>
  </target>

  <!-- ================================================================== -->
  <!-- J A R                                                              -->
  <!-- ================================================================== -->

  <target name="jar-demo" depends="compile-demo"
          description="Build demo jar file">
    <sequential>
      <build-manifest/>
      <jar
        destfile="${demo.name}.jar"
        basedir="${build.dir}/classes/demo"
        excludes="**/*.java"
        manifest="${manifest.file}">
        <metainf dir="${common.dir}">
          <include name="LICENSE.txt"/>
          <include name="NOTICE.txt"/>
        </metainf>
      </jar>
    </sequential>
  </target>

  <target name="war-demo" depends="jar-demo"
          description="Build demo war file">
    <sequential>
      <build-manifest/>
      <war destfile="${demo.war.name}.war"
           webxml="src/jsp/WEB-INF/web.xml"
           manifest="${manifest.file}">
        <fileset dir="src/jsp" excludes="WEB-INF/web.xml"/>
        <lib dir="." includes="${demo.name}.jar"/>
        <lib dir="." includes="${core.name}.jar"/>
        <metainf dir="${common.dir}">
          <include name="LICENSE.txt"/>
          <include name="NOTICE.txt"/>
        </metainf>
      </war>
    </sequential>
  </target>

  <!-- ================================================================== -->
  <!-- B U I L D   D E M O                                                -->
  <!-- ================================================================== -->

  <target name="compile-demo" depends="jar.core-check"
          description="Compile demo classes">
    <mkdir dir="${build.dir}/classes/demo"/>

    <compile
      srcdir="src/demo"
      destdir="${build.dir}/classes/demo">
      <classpath refid="demo.classpath"/>
    </compile>
  </target>

  <target name="clean"
          description="Removes contents of build directory">
    <delete dir="${build.dir}"/>
    <delete dir="${common.dir}/demo-text-dir"/>
    <delete dir="${common.dir}/demo-html-dir"/>
  </target>

  <!-- ================================================================== -->
  <!-- R U N   T E X T   I N D E X I N G   D E M O                        -->
  <!-- ================================================================== -->

  <target name="demo-index-text" depends="jar-demo"
          description="Run text indexing demo (index the sources of the demo).">
    <echo>----- (1) Prepare dir ----- </echo>
    <echo>cd ${common.dir} </echo>
    <echo>rmdir demo-text-dir </echo>
    <delete dir="${common.dir}/demo-text-dir"/>
    <echo>mkdir demo-text-dir </echo>
    <mkdir dir="${common.dir}/demo-text-dir"/>
    <echo>cd demo-text-dir </echo>
    <echo>----- (2) Index the files located under ${common.dir}/src ----- </echo>
    <invoke-java class="IndexFiles" params="${common.dir}/src/demo" paramsDisplay="../src/demo" type="text"/>
  </target>

  <!-- ================================================================== -->
  <!-- R U N   T E X T   S E A R C H   D E M O                            -->
  <!-- ================================================================== -->

  <target name="demo-search-text" depends="jar-demo"
          description="Run interactive search demo.">
    <echo>----- Interactive search ----- </echo>
    <echo>cd demo-text-dir </echo>
    <invoke-java class="SearchFiles" params="-index index" paramsDisplay="-index index" type="text"/>
  </target>

  <!-- ================================================================== -->
  <!-- R U N   H T M L   I N D E X I N G   D E M O                        -->
  <!-- ================================================================== -->

  <target name="demo-index-html" depends="jar-demo"
          description="Run html indexing demo (index the javadocs).">
    <echo>----- (1) Prepare dir ----- </echo>
    <echo>cd ${common.dir} </echo>
    <echo>rmdir demo-html-dir </echo>
    <delete dir="${common.dir}/demo-html-dir"/>
    <echo>mkdir demo-html-dir </echo>
    <mkdir dir="${common.dir}/demo-html-dir"/>
    <echo>cd demo-html-dir </echo>
    <echo>----- (2) Index the files located under ${common.dir}/docs/api ----- </echo>
    <invoke-java class="IndexFiles" params="${common.dir}/docs/api" paramsDisplay="../docs/api" type="html"/>
  </target>

  <!-- ================================================================== -->
  <!-- R U N   H T M L   S E A R C H   D E M O                            -->
  <!-- ================================================================== -->

  <target name="demo-search-html" depends="jar-demo"
          description="Run interactive search demo.">
    <echo>----- Interactive search ----- </echo>
    <echo>cd demo-html-dir </echo>
    <invoke-java class="SearchFiles" params="-index index" paramsDisplay="-index index" type="html"/>
  </target>

  <!--+
      | M A C R O S
      +-->

  <macrodef name="build-manifest" description="Builds a manifest file">
    <sequential>
      <manifest file="${manifest.file}">
        <attribute name="Specification-Title" value="Lucene Search Engine: demos"/>
        <!-- spec version must match "digit+{.digit+}*" -->
        <attribute name="Specification-Version" value="${version}"/>
        <attribute name="Specification-Vendor"
                   value="The Apache Software Foundation"/>
        <attribute name="Implementation-Title" value="org.apache.lucene"/>
        <!-- impl version can be any string -->
        <attribute name="Implementation-Version"
                   value="${version}"/>
        <attribute name="Implementation-Vendor"
                   value="The Apache Software Foundation"/>
        <attribute name="X-Compile-Source-JDK"
                   value="${javac.source}"/>
        <attribute name="X-Compile-Target-JDK"
                   value="${javac.target}"/>
      </manifest>
    </sequential>
  </macrodef>

  <macrodef name="compile">
    <attribute name="srcdir"/>
    <attribute name="destdir"/>
    <element name="nested" implicit="yes" optional="yes"/>

    <sequential>
      <mkdir dir="@{destdir}"/>
      <javac
        srcdir="@{srcdir}"
        destdir="@{destdir}"
        deprecation="off"
        debug="on"
        source="${javac.source}"
        target="${javac.target}">
        <nested/>
      </javac>
    </sequential>
  </macrodef>

  <macrodef name="invoke-java">
    <attribute name="class"/>
    <attribute name="params"/>
    <attribute name="paramsDisplay"/>
    <attribute name="type"/>
    <sequential>
      <echo>java -classpath "../${core.name}.jar;../${demo.name}.jar" org.apache.lucene.demo.@{class} @{paramsDisplay} </echo>
      <java classname="org.apache.lucene.demo.@{class}"
            dir="${common.dir}/demo-@{type}-dir"
            fork="true"
            failonerror="true"
            maxmemory="128m"
      >
        <arg value="@{params}"/>
        <classpath>
          <pathelement location="${common.dir}/${core.name}.jar"/>
          <pathelement location="${common.dir}/${demo.name}.jar"/>
        </classpath>
      </java>
    </sequential>
  </macrodef>

</project>
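The demo targets above chain together, so a typical session from this directory is just two invocations (a sketch; it assumes lucene-core-${version}.jar already sits next to this build file, which is exactly what jar.core-check verifies):

    ant demo-index-text      # builds the demo jar, then indexes ../src/demo into demo-text-dir/index
    ant demo-search-text     # interactive search against that index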
@@ -0,0 +1,66 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/** Deletes documents containing the given term from an index. */
public class DeleteFiles {

  private DeleteFiles() {}                        // no instances

  /** Deletes documents containing the given term from an index. */
  public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.DeleteFiles <unique_term>";
    if (args.length == 0) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }
    try {
      Directory directory = FSDirectory.open(new File("index"));
      IndexReader reader = IndexReader.open(directory, false); // we don't want read-only because we are about to delete

      Term term = new Term("path", args[0]);
      int deleted = reader.deleteDocuments(term);

      System.out.println("deleted " + deleted +
                         " documents containing " + term);

      // one can also delete documents by their internal id:
      /*
      for (int i = 0; i < reader.maxDoc(); i++) {
        System.out.println("Deleting document with id " + i);
        reader.delete(i);
      }
      */

      reader.close();
      directory.close();

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
                         "\n with message: " + e.getMessage());
    }
  }
}
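A usage sketch (jar names are illustrative; DeleteFiles opens ./index, so run it from the directory that holds the index, and note that the demo stores file paths in the path field, so the unique term is typically a path):

    cd demo-text-dir
    java -classpath "../lucene-core.jar;../lucene-demos.jar" org.apache.lucene.demo.DeleteFiles ../src/demo/org/apache/lucene/demo/DeleteFiles.java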
@@ -0,0 +1,71 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileReader;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/** A utility for making Lucene Documents from a File. */

public class FileDocument {
  /** Makes a document for a File.
    <p>
    The document has three fields:
    <ul>
    <li><code>path</code>--containing the pathname of the file, as a stored,
    untokenized field;
    <li><code>modified</code>--containing the last modified date of the file as
    a field created by <a
    href="lucene.document.DateTools.html">DateTools</a>; and
    <li><code>contents</code>--containing the full contents of the file, as a
    Reader field.
    </ul>
   */
  public static Document Document(File f)
       throws java.io.FileNotFoundException {

    // make a new, empty document
    Document doc = new Document();

    // Add the path of the file as a field named "path".  Use a field that is
    // indexed (i.e. searchable), but don't tokenize the field into words.
    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the last modified date of the file as a field named "modified".  Use
    // a field that is indexed (i.e. searchable), but don't tokenize the field
    // into words.
    doc.add(new Field("modified",
        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
        Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the contents of the file to a field named "contents".  Specify a Reader,
    // so that the text of the file is tokenized and indexed, but not stored.
    // Note that FileReader expects the file to be in the system's default encoding.
    // If that's not the case searching for special characters will fail.
    doc.add(new Field("contents", new FileReader(f)));

    // return the document
    return doc;
  }

  private FileDocument() {}
}
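A minimal sketch of how this helper is consumed (the wrapper class name is hypothetical and not part of this commit; it must live in the org.apache.lucene.demo package to call FileDocument directly). It prints the two stored fields; the contents field is Reader-valued and unstored, so it cannot be read back from the Document:

    package org.apache.lucene.demo;

    import java.io.File;
    import org.apache.lucene.document.Document;

    // Hypothetical demo class, for illustration only.
    public class FileDocumentDemo {
      public static void main(String[] args) throws Exception {
        Document doc = FileDocument.Document(new File(args[0]));
        System.out.println("path:     " + doc.get("path"));
        System.out.println("modified: " + doc.get("modified"));
        // doc.get("contents") returns null: that field is indexed from a Reader, never stored
      }
    }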
@@ -0,0 +1,86 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;

import org.apache.lucene.document.*;
import org.apache.lucene.demo.html.HTMLParser;

/** A utility for making Lucene Documents for HTML documents. */

public class HTMLDocument {
  static char dirSep = System.getProperty("file.separator").charAt(0);

  public static String uid(File f) {
    // Append path and date into a string in such a way that lexicographic
    // sorting gives the same results as a walk of the file hierarchy.  Thus
    // null (\u0000) is used both to separate directory components and to
    // separate the path from the date.
    return f.getPath().replace(dirSep, '\u0000') +
      "\u0000" +
      DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
  }

  public static String uid2url(String uid) {
    String url = uid.replace('\u0000', '/');       // replace nulls with slashes
    return url.substring(0, url.lastIndexOf('/')); // remove date from end
  }

  public static Document Document(File f)
       throws IOException, InterruptedException {
    // make a new, empty document
    Document doc = new Document();

    // Add the url as a field named "path".  Use a field that is
    // indexed (i.e. searchable), but don't tokenize the field into words.
    doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES,
        Field.Index.NOT_ANALYZED));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    doc.add(new Field("modified",
        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
        Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));

    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));

    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));

    // Add the title as a field so that it can be searched and is stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));

    // return the document
    return doc;
  }

  private HTMLDocument() {}
}
@@ -0,0 +1,100 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;

/** Index all text files under a directory. */
public class IndexFiles {

  private IndexFiles() {}

  static final File INDEX_DIR = new File("index");

  /** Index all text files under a directory. */
  public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";
    if (args.length == 0) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    if (INDEX_DIR.exists()) {
      System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
      System.exit(1);
    }

    final File docDir = new File(args[0]);
    if (!docDir.exists() || !docDir.canRead()) {
      System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    Date start = new Date();
    try {
      IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
      System.out.println("Indexing to directory '" +INDEX_DIR+ "'...");
      indexDocs(writer, docDir);
      System.out.println("Optimizing...");
      writer.optimize();
      writer.close();

      Date end = new Date();
      System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
                         "\n with message: " + e.getMessage());
    }
  }

  static void indexDocs(IndexWriter writer, File file)
      throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
      if (file.isDirectory()) {
        String[] files = file.list();
        // an IO error could occur
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      } else {
        System.out.println("adding " + file);
        try {
          writer.addDocument(FileDocument.Document(file));
        }
        // at least on windows, some temporary files raise this exception with an "access denied" message
        // checking if the file can be read doesn't help
        catch (FileNotFoundException fnfe) {
          ;
        }
      }
    }
  }

}
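This is what the demo-index-text target above drives; run by hand it looks like this (jar names illustrative; IndexFiles writes to ./index and refuses to run if that directory already exists):

    cd demo-text-dir
    java -classpath "../lucene-core.jar;../lucene-demos.jar" org.apache.lucene.demo.IndexFiles ../src/demo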
@@ -0,0 +1,168 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.util.Date;
import java.util.Arrays;

/** Indexer for HTML files. */
public class IndexHTML {
  private IndexHTML() {}

  private static boolean deleting = false;        // true during deletion pass
  private static IndexReader reader;              // existing index
  private static IndexWriter writer;              // new index being built
  private static TermEnum uidIter;                // document id iterator

  /** Indexer for HTML files. */
  public static void main(String[] argv) {
    try {
      File index = new File("index");
      boolean create = false;
      File root = null;

      String usage = "IndexHTML [-create] [-index <index>] <root_directory>";

      if (argv.length == 0) {
        System.err.println("Usage: " + usage);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {           // parse -index option
          index = new File(argv[++i]);
        } else if (argv[i].equals("-create")) {   // parse -create option
          create = true;
        } else if (i != argv.length-1) {
          System.err.println("Usage: " + usage);
          return;
        } else
          root = new File(argv[i]);
      }

      if (root == null) {
        System.err.println("Specify directory to index");
        System.err.println("Usage: " + usage);
        return;
      }

      Date start = new Date();

      if (!create) {                              // delete stale docs
        deleting = true;
        indexDocs(root, index, create);
      }
      writer = new IndexWriter(FSDirectory.open(index), new StandardAnalyzer(Version.LUCENE_CURRENT), create,
                               new IndexWriter.MaxFieldLength(1000000));
      indexDocs(root, index, create);             // add new docs

      System.out.println("Optimizing index...");
      writer.optimize();
      writer.close();

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /* Walk directory hierarchy in uid order, while keeping uid iterator from
   * existing index in sync.  Mismatches indicate one of: (a) old documents to
   * be deleted; (b) unchanged documents, to be left alone; or (c) new
   * documents, to be indexed.
   */
  private static void indexDocs(File file, File index, boolean create)
       throws Exception {
    if (!create) {                                // incrementally update

      reader = IndexReader.open(FSDirectory.open(index), false); // open existing index
      uidIter = reader.terms(new Term("uid", "")); // init uid iterator

      indexDocs(file);

      if (deleting) {                             // delete rest of stale docs
        while (uidIter.term() != null && uidIter.term().field() == "uid") {
          System.out.println("deleting " +
                             HTMLDocument.uid2url(uidIter.term().text()));
          reader.deleteDocuments(uidIter.term());
          uidIter.next();
        }
        deleting = false;
      }

      uidIter.close();                            // close uid iterator
      reader.close();                             // close existing index

    } else                                        // don't have an existing index
      indexDocs(file);
  }

  private static void indexDocs(File file) throws Exception {
    if (file.isDirectory()) {                     // if a directory
      String[] files = file.list();               // list its files
      Arrays.sort(files);                         // sort the files
      for (int i = 0; i < files.length; i++)      // recursively index them
        indexDocs(new File(file, files[i]));

    } else if (file.getPath().endsWith(".html") || // index .html files
               file.getPath().endsWith(".htm") ||  // index .htm files
               file.getPath().endsWith(".txt")) {  // index .txt files

      if (uidIter != null) {
        String uid = HTMLDocument.uid(file);      // construct uid for doc

        while (uidIter.term() != null && uidIter.term().field() == "uid" &&
               uidIter.term().text().compareTo(uid) < 0) {
          if (deleting) {                         // delete stale docs
            System.out.println("deleting " +
                               HTMLDocument.uid2url(uidIter.term().text()));
            reader.deleteDocuments(uidIter.term());
          }
          uidIter.next();
        }
        if (uidIter.term() != null && uidIter.term().field() == "uid" &&
            uidIter.term().text().compareTo(uid) == 0) {
          uidIter.next();                         // keep matching docs
        } else if (!deleting) {                   // add new docs
          Document doc = HTMLDocument.Document(file);
          System.out.println("adding " + doc.get("path"));
          writer.addDocument(doc);
        }
      } else {                                    // creating a new index
        Document doc = HTMLDocument.Document(file);
        System.out.println("adding " + doc.get("path"));
        writer.addDocument(doc);                  // add docs unconditionally
      }
    }
  }
}
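An invocation sketch matching the usage string above (paths and jar names illustrative). With -create it builds the index from scratch; without it, the uid-based walk deletes stale documents and adds only new ones:

    java -classpath "lucene-core.jar;lucene-demos.jar" org.apache.lucene.demo.IndexHTML -create -index index docs/api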
@@ -0,0 +1,313 @@
package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/** Simple command-line based search demo. */
public class SearchFiles {

  /** Use the norms from one field for all fields.  Norms are read into memory,
   * using a byte of memory per document per searched field.  This can cause
   * search of large collections with a large number of fields to run out of
   * memory.  If all of the fields contain only a single token, then the norms
   * are all identical, and a single norm vector may be shared. */
  private static class OneNormsReader extends FilterIndexReader {
    private String field;

    public OneNormsReader(IndexReader in, String field) {
      super(in);
      this.field = field;
    }

    @Override
    public byte[] norms(String field) throws IOException {
      return in.norms(this.field);
    }
  }

  private SearchFiles() {}

  /** Simple command-line based search demo. */
  public static void main(String[] args) throws Exception {
    String usage =
      "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]";
    usage += "\n\tSpecify 'false' for hitsPerPage to use streaming instead of paging search.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
      System.out.println(usage);
      System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String normsField = null;
    boolean paging = true;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        index = args[i+1];
        i++;
      } else if ("-field".equals(args[i])) {
        field = args[i+1];
        i++;
      } else if ("-queries".equals(args[i])) {
        queries = args[i+1];
        i++;
      } else if ("-repeat".equals(args[i])) {
        repeat = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-raw".equals(args[i])) {
        raw = true;
      } else if ("-norms".equals(args[i])) {
        normsField = args[i+1];
        i++;
      } else if ("-paging".equals(args[i])) {
        if (args[i+1].equals("false")) {
          paging = false;
        } else {
          hitsPerPage = Integer.parseInt(args[i+1]);
          if (hitsPerPage == 0) {
            paging = false;
          }
        }
        i++;
      }
    }

    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); // only searching, so read-only=true

    if (normsField != null)
      reader = new OneNormsReader(reader, normsField);

    Searcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

    BufferedReader in = null;
    if (queries != null) {
      in = new BufferedReader(new FileReader(queries));
    } else {
      in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, analyzer);
    while (true) {
      if (queries == null)                        // prompt the user
        System.out.println("Enter query: ");

      String line = in.readLine();

      if (line == null)
        break;

      line = line.trim();
      if (line.length() == 0)
        break;

      Query query = parser.parse(line);
      System.out.println("Searching for: " + query.toString(field));

      if (repeat > 0) {                           // repeat & time as benchmark
        Date start = new Date();
        for (int i = 0; i < repeat; i++) {
          searcher.search(query, null, 100);
        }
        Date end = new Date();
        System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
      }

      if (paging) {
        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null);
      } else {
        doStreamingSearch(searcher, query);
      }
    }
    reader.close();
  }

  /**
   * This method uses a custom Collector implementation which simply prints out
   * the docId and score of every matching document.
   *
   * This simulates the streaming search use case, where all hits are supposed to
   * be processed, regardless of their relevance.
   */
  public static void doStreamingSearch(final Searcher searcher, Query query) throws IOException {
    Collector streamingHitCollector = new Collector() {
      private Scorer scorer;
      private int docBase;

      // simply print the docId (mapped to the top-level reader) and score of every matching document
      @Override
      public void collect(int doc) throws IOException {
        System.out.println("doc=" + (doc + docBase) + " score=" + scorer.score());
      }

      @Override
      public boolean acceptsDocsOutOfOrder() {
        return true;
      }

      @Override
      public void setNextReader(IndexReader reader, int docBase)
          throws IOException {
        this.docBase = docBase;
      }

      @Override
      public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
      }

    };

    searcher.search(query, streamingHitCollector);
  }

  /**
   * This demonstrates a typical paging search scenario, where the search engine presents
   * pages of size n to the user. The user can then go to the next page if interested in
   * the next hits.
   *
   * When the query is executed for the first time, only enough results are collected
   * to fill 5 result pages. If the user wants to page beyond this limit, the query
   * is executed another time and all hits are collected.
   */
  public static void doPagingSearch(BufferedReader in, Searcher searcher, Query query,
                                    int hitsPerPage, boolean raw, boolean interactive) throws IOException {

    // Collect enough docs to show 5 pages
    TopScoreDocCollector collector = TopScoreDocCollector.create(
        5 * hitsPerPage, false);
    searcher.search(query, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    int numTotalHits = collector.getTotalHits();
    System.out.println(numTotalHits + " total matching documents");

    int start = 0;
    int end = Math.min(numTotalHits, hitsPerPage);

    while (true) {
      if (end > hits.length) {
        System.out.println("Only results 1 - " + hits.length + " of " + numTotalHits + " total matching documents collected.");
        System.out.println("Collect more (y/n) ?");
        String line = in.readLine();
        if (line.length() == 0 || line.charAt(0) == 'n') {
          break;
        }

        collector = TopScoreDocCollector.create(numTotalHits, false);
        searcher.search(query, collector);
        hits = collector.topDocs().scoreDocs;
      }

      end = Math.min(hits.length, start + hitsPerPage);

      for (int i = start; i < end; i++) {
        if (raw) {                                // output raw format
          System.out.println("doc="+hits[i].doc+" score="+hits[i].score);
          continue;
        }

        Document doc = searcher.doc(hits[i].doc);
        String path = doc.get("path");
        if (path != null) {
          System.out.println((i+1) + ". " + path);
          String title = doc.get("title");
          if (title != null) {
            System.out.println("   Title: " + title);
          }
        } else {
          System.out.println((i+1) + ". " + "No path for this document");
        }

      }

      if (!interactive) {
        break;
      }

      if (numTotalHits >= end) {
        boolean quit = false;
        while (true) {
          System.out.print("Press ");
          if (start - hitsPerPage >= 0) {
            System.out.print("(p)revious page, ");
          }
          if (start + hitsPerPage < numTotalHits) {
            System.out.print("(n)ext page, ");
          }
          System.out.println("(q)uit or enter number to jump to a page.");

          String line = in.readLine();
          if (line.length() == 0 || line.charAt(0)=='q') {
            quit = true;
            break;
          }
          if (line.charAt(0) == 'p') {
            start = Math.max(0, start - hitsPerPage);
            break;
          } else if (line.charAt(0) == 'n') {
            if (start + hitsPerPage < numTotalHits) {
              start+=hitsPerPage;
            }
            break;
          } else {
            int page = Integer.parseInt(line);
            if ((page - 1) * hitsPerPage < numTotalHits) {
              start = (page - 1) * hitsPerPage;
              break;
            } else {
              System.out.println("No such page");
            }
          }
        }
        if (quit) break;
        end = Math.min(numTotalHits, start + hitsPerPage);
      }

    }

  }
}
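An invocation sketch per the usage string above (jar names illustrative; run it where the index directory lives):

    java -classpath "lucene-core.jar;lucene-demos.jar" org.apache.lucene.demo.SearchFiles -index index -paging 5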
@@ -0,0 +1,329 @@
package org.apache.lucene.demo.html;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.Map;

public class Entities {
  static final Map<String,String> decoder = new HashMap<String,String>(300);
  static final String[] encoder = new String[0x100];

  static final String decode(String entity) {
    if (entity.charAt(entity.length()-1) == ';')  // remove trailing semicolon
      entity = entity.substring(0, entity.length()-1);
    if (entity.charAt(1) == '#') {
      int start = 2;
      int radix = 10;
      if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
        start++;
        radix = 16;
      }
      return Character.toString((char) Integer.parseInt(entity.substring(start), radix));
    } else {
      String s = decoder.get(entity);
      if (s != null)
        return s;
      else return "";
    }
  }

  public static final String encode(String s) {
    int length = s.length();
    StringBuffer buffer = new StringBuffer(length * 2);
    for (int i = 0; i < length; i++) {
      char c = s.charAt(i);
      int j = (int)c;
      if (j < 0x100 && encoder[j] != null) {
        buffer.append(encoder[j]);                // have a named encoding
        buffer.append(';');
      } else if (j < 0x80) {
        buffer.append(c);                         // use ASCII value
      } else {
        buffer.append("&#");                      // use numeric encoding
        buffer.append((int)c);
        buffer.append(';');
      }
    }
    return buffer.toString();
  }

  static final void add(String entity, int value) {
    decoder.put(entity, Character.toString((char) value));
    if (value < 0x100)
      encoder[value] = entity;
  }

  static {
    add("&nbsp",   160);
    add("&iexcl",  161);
    add("&cent",   162);
    add("&pound",  163);
    add("&curren", 164);
    add("&yen",    165);
    add("&brvbar", 166);
    add("&sect",   167);
    add("&uml",    168);
    add("&copy",   169);
    add("&ordf",   170);
    add("&laquo",  171);
    add("&not",    172);
    add("&shy",    173);
    add("&reg",    174);
    add("&macr",   175);
    add("&deg",    176);
    add("&plusmn", 177);
    add("&sup2",   178);
    add("&sup3",   179);
    add("&acute",  180);
    add("&micro",  181);
    add("&para",   182);
    add("&middot", 183);
    add("&cedil",  184);
    add("&sup1",   185);
    add("&ordm",   186);
    add("&raquo",  187);
    add("&frac14", 188);
    add("&frac12", 189);
    add("&frac34", 190);
    add("&iquest", 191);
    add("&Agrave", 192);
    add("&Aacute", 193);
    add("&Acirc",  194);
    add("&Atilde", 195);
    add("&Auml",   196);
    add("&Aring",  197);
    add("&AElig",  198);
    add("&Ccedil", 199);
    add("&Egrave", 200);
    add("&Eacute", 201);
    add("&Ecirc",  202);
    add("&Euml",   203);
    add("&Igrave", 204);
    add("&Iacute", 205);
    add("&Icirc",  206);
    add("&Iuml",   207);
    add("&ETH",    208);
    add("&Ntilde", 209);
    add("&Ograve", 210);
    add("&Oacute", 211);
    add("&Ocirc",  212);
    add("&Otilde", 213);
    add("&Ouml",   214);
    add("&times",  215);
    add("&Oslash", 216);
    add("&Ugrave", 217);
    add("&Uacute", 218);
    add("&Ucirc",  219);
    add("&Uuml",   220);
    add("&Yacute", 221);
    add("&THORN",  222);
    add("&szlig",  223);
    add("&agrave", 224);
    add("&aacute", 225);
    add("&acirc",  226);
    add("&atilde", 227);
    add("&auml",   228);
    add("&aring",  229);
    add("&aelig",  230);
    add("&ccedil", 231);
    add("&egrave", 232);
    add("&eacute", 233);
    add("&ecirc",  234);
    add("&euml",   235);
    add("&igrave", 236);
    add("&iacute", 237);
    add("&icirc",  238);
    add("&iuml",   239);
    add("&eth",    240);
    add("&ntilde", 241);
    add("&ograve", 242);
    add("&oacute", 243);
    add("&ocirc",  244);
    add("&otilde", 245);
    add("&ouml",   246);
    add("&divide", 247);
    add("&oslash", 248);
    add("&ugrave", 249);
    add("&uacute", 250);
    add("&ucirc",  251);
    add("&uuml",   252);
    add("&yacute", 253);
    add("&thorn",  254);
    add("&yuml",   255);
    add("&fnof",   402);
    add("&Alpha",  913);
    add("&Beta",   914);
    add("&Gamma",  915);
    add("&Delta",  916);
    add("&Epsilon",917);
    add("&Zeta",   918);
    add("&Eta",    919);
    add("&Theta",  920);
    add("&Iota",   921);
    add("&Kappa",  922);
    add("&Lambda", 923);
    add("&Mu",     924);
    add("&Nu",     925);
    add("&Xi",     926);
    add("&Omicron",927);
    add("&Pi",     928);
    add("&Rho",    929);
    add("&Sigma",  931);
    add("&Tau",    932);
    add("&Upsilon",933);
    add("&Phi",    934);
    add("&Chi",    935);
    add("&Psi",    936);
    add("&Omega",  937);
    add("&alpha",  945);
    add("&beta",   946);
    add("&gamma",  947);
    add("&delta",  948);
    add("&epsilon",949);
    add("&zeta",   950);
    add("&eta",    951);
    add("&theta",  952);
    add("&iota",   953);
    add("&kappa",  954);
    add("&lambda", 955);
    add("&mu",     956);
    add("&nu",     957);
    add("&xi",     958);
    add("&omicron",959);
    add("&pi",     960);
    add("&rho",    961);
    add("&sigmaf", 962);
    add("&sigma",  963);
    add("&tau",    964);
    add("&upsilon",965);
    add("&phi",    966);
    add("&chi",    967);
    add("&psi",    968);
    add("&omega",  969);
    add("&thetasym",977);
    add("&upsih",  978);
    add("&piv",    982);
    add("&bull",   8226);
    add("&hellip", 8230);
    add("&prime",  8242);
    add("&Prime",  8243);
    add("&oline",  8254);
    add("&frasl",  8260);
    add("&weierp", 8472);
    add("&image",  8465);
    add("&real",   8476);
    add("&trade",  8482);
    add("&alefsym",8501);
    add("&larr",   8592);
    add("&uarr",   8593);
    add("&rarr",   8594);
    add("&darr",   8595);
    add("&harr",   8596);
    add("&crarr",  8629);
    add("&lArr",   8656);
    add("&uArr",   8657);
    add("&rArr",   8658);
    add("&dArr",   8659);
    add("&hArr",   8660);
    add("&forall", 8704);
    add("&part",   8706);
    add("&exist",  8707);
    add("&empty",  8709);
    add("&nabla",  8711);
    add("&isin",   8712);
    add("&notin",  8713);
    add("&ni",     8715);
    add("&prod",   8719);
    add("&sum",    8721);
    add("&minus",  8722);
    add("&lowast", 8727);
    add("&radic",  8730);
    add("&prop",   8733);
    add("&infin",  8734);
    add("&ang",    8736);
    add("&and",    8743);
    add("&or",     8744);
    add("&cap",    8745);
    add("&cup",    8746);
    add("&int",    8747);
    add("&there4", 8756);
    add("&sim",    8764);
    add("&cong",   8773);
    add("&asymp",  8776);
    add("&ne",     8800);
    add("&equiv",  8801);
    add("&le",     8804);
    add("&ge",     8805);
    add("&sub",    8834);
    add("&sup",    8835);
    add("&nsub",   8836);
    add("&sube",   8838);
    add("&supe",   8839);
    add("&oplus",  8853);
    add("&otimes", 8855);
    add("&perp",   8869);
    add("&sdot",   8901);
    add("&lceil",  8968);
    add("&rceil",  8969);
    add("&lfloor", 8970);
    add("&rfloor", 8971);
    add("&lang",   9001);
    add("&rang",   9002);
    add("&loz",    9674);
    add("&spades", 9824);
    add("&clubs",  9827);
    add("&hearts", 9829);
    add("&diams",  9830);
    add("&quot",   34);
    add("&amp",    38);
    add("&lt",     60);
    add("&gt",     62);
    add("&OElig",  338);
    add("&oelig",  339);
    add("&Scaron", 352);
    add("&scaron", 353);
    add("&Yuml",   376);
    add("&circ",   710);
    add("&tilde",  732);
    add("&ensp",   8194);
    add("&emsp",   8195);
    add("&thinsp", 8201);
    add("&zwnj",   8204);
    add("&zwj",    8205);
    add("&lrm",    8206);
    add("&rlm",    8207);
    add("&ndash",  8211);
    add("&mdash",  8212);
    add("&lsquo",  8216);
    add("&rsquo",  8217);
    add("&sbquo",  8218);
    add("&ldquo",  8220);
    add("&rdquo",  8221);
    add("&bdquo",  8222);
    add("&dagger", 8224);
    add("&Dagger", 8225);
    add("&permil", 8240);
    add("&lsaquo", 8249);
    add("&rsaquo", 8250);
    add("&euro",   8364);

  }
}
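A small round-trip sketch of the two entry points above (the demo class name is hypothetical and not part of this commit; decode has package access, so the class must live in org.apache.lucene.demo.html). The expected outputs follow directly from the table entries for amp and lt and from the numeric branch of decode:

    package org.apache.lucene.demo.html;

    // Hypothetical demo class, for illustration only.
    public class EntitiesDemo {
      public static void main(String[] args) {
        System.out.println(Entities.decode("&amp;"));  // prints "&"   (table: add("&amp", 38))
        System.out.println(Entities.decode("&#65;"));  // prints "A"   (numeric branch, radix 10)
        System.out.println(Entities.encode("a<b"));    // prints "a&lt;b"
      }
    }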
@@ -0,0 +1,754 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.demo.html;

import java.io.*;
import java.util.Properties;

public class HTMLParser implements HTMLParserConstants {
  public static int SUMMARY_LENGTH = 200;

  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags = new Properties();
  String currentMetaTag = null;
  String currentMetaContent = null;
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inStyle = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  Reader pipeIn = null;
  Writer pipeOut;
  private MyPipedInputStream pipeInStream = null;
  private PipedOutputStream pipeOutStream = null;

  private class MyPipedInputStream extends PipedInputStream {

    public MyPipedInputStream() {
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws IOException {
      super(src);
    }

    public boolean full() throws IOException {
      return this.available() >= PipedInputStream.PIPE_SIZE;
    }
  }

  /**
   * @deprecated Use HTMLParser(FileInputStream) instead
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return title.toString().trim();
  }

  public Properties getMetaTags() throws IOException,
      InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return metaTags;
  }

  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
          break;
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit) || sum.equals(""))
      return tit;
    else
      return sum;
  }

  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeInStream = new MyPipedInputStream();
      pipeOutStream = new PipedOutputStream(pipeInStream);
      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }

    return pipeIn;
  }

  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  void addText(String text) throws IOException {
    if (inStyle)
      return;
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      if (!titleComplete && !(title.length() == 0)) {  // finished title
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  void addMetaTag() {
    metaTags.setProperty(currentMetaTag, currentMetaContent);
    currentMetaTag = null;
    currentMetaContent = null;
    return;
  }

  void addSpace() throws IOException {
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

  final public void HTMLDocument() throws ParseException, IOException {
    Token t;
    label_1:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ScriptStart:
      case TagName:
      case DeclName:
      case Comment1:
      case Comment2:
      case Word:
      case Entity:
      case Space:
      case Punct:
        ;
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case TagName:
        Tag();
        afterTag = true;
        break;
      case DeclName:
        t = Decl();
        afterTag = true;
        break;
      case Comment1:
      case Comment2:
        CommentTag();
        afterTag = true;
        break;
      case ScriptStart:
        ScriptTag();
        afterTag = true;
        break;
      case Word:
        t = jj_consume_token(Word);
        addText(t.image); afterTag = false;
        break;
      case Entity:
        t = jj_consume_token(Entity);
        addText(Entities.decode(t.image)); afterTag = false;
        break;
      case Punct:
        t = jj_consume_token(Punct);
        addText(t.image); afterTag = false;
        break;
      case Space:
        jj_consume_token(Space);
        addSpace(); afterTag = false;
        break;
      default:
        jj_la1[1] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(0);
  }

  final public void Tag() throws ParseException, IOException {
    Token t1, t2;
    boolean inImg = false;
    t1 = jj_consume_token(TagName);
    String tagName = t1.image.toLowerCase();
    if (Tags.WS_ELEMS.contains(tagName)) {
      addSpace();
    }
    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
    inImg = tagName.equalsIgnoreCase("<img");     // keep track if in <IMG>

    label_2:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
        ;
        break;
      default:
        jj_la1[2] = jj_gen;
        break label_2;
      }
      t1 = jj_consume_token(ArgName);
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgEquals:
        jj_consume_token(ArgEquals);
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgValue:
        case ArgQuote1:
        case ArgQuote2:
          t2 = ArgValue();
          if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
            addText("[" + t2.image + "]");

          if (inMetaTag &&
              ( t1.image.equalsIgnoreCase("name") ||
                t1.image.equalsIgnoreCase("HTTP-EQUIV")
              )
              && t2 != null)
          {
            currentMetaTag = t2.image.toLowerCase();
            if (currentMetaTag != null && currentMetaContent != null) {
              addMetaTag();
            }
          }
          if (inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
              null)
          {
            currentMetaContent = t2.image.toLowerCase();
            if (currentMetaTag != null && currentMetaContent != null) {
              addMetaTag();
            }
          }
          break;
        default:
          jj_la1[3] = jj_gen;
          ;
        }
        break;
      default:
        jj_la1[4] = jj_gen;
        ;
      }
    }
    jj_consume_token(TagEnd);
  }

  final public Token ArgValue() throws ParseException {
    Token t = null;
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case ArgValue:
      t = jj_consume_token(ArgValue);
      {if (true) return t;}
      break;
    default:
      jj_la1[5] = jj_gen;
      if (jj_2_1(2)) {
        jj_consume_token(ArgQuote1);
        jj_consume_token(CloseQuote1);
        {if (true) return t;}
      } else {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgQuote1:
          jj_consume_token(ArgQuote1);
          t = jj_consume_token(Quote1Text);
          jj_consume_token(CloseQuote1);
          {if (true) return t;}
          break;
        default:
|
||||
jj_la1[6] = jj_gen;
|
||||
if (jj_2_2(2)) {
|
||||
jj_consume_token(ArgQuote2);
|
||||
jj_consume_token(CloseQuote2);
|
||||
{if (true) return t;}
|
||||
} else {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgQuote2:
|
||||
jj_consume_token(ArgQuote2);
|
||||
t = jj_consume_token(Quote2Text);
|
||||
jj_consume_token(CloseQuote2);
|
||||
{if (true) return t;}
|
||||
break;
|
||||
default:
|
||||
jj_la1[7] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new Error("Missing return statement in function");
|
||||
}
|
||||
|
||||
final public Token Decl() throws ParseException {
|
||||
Token t;
|
||||
t = jj_consume_token(DeclName);
|
||||
label_3:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgName:
|
||||
case ArgEquals:
|
||||
case ArgValue:
|
||||
case ArgQuote1:
|
||||
case ArgQuote2:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[8] = jj_gen;
|
||||
break label_3;
|
||||
}
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgName:
|
||||
jj_consume_token(ArgName);
|
||||
break;
|
||||
case ArgValue:
|
||||
case ArgQuote1:
|
||||
case ArgQuote2:
|
||||
ArgValue();
|
||||
break;
|
||||
case ArgEquals:
|
||||
jj_consume_token(ArgEquals);
|
||||
break;
|
||||
default:
|
||||
jj_la1[9] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
jj_consume_token(TagEnd);
|
||||
{if (true) return t;}
|
||||
throw new Error("Missing return statement in function");
|
||||
}
|
||||
|
||||
final public void CommentTag() throws ParseException {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case Comment1:
|
||||
jj_consume_token(Comment1);
|
||||
label_4:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case CommentText1:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[10] = jj_gen;
|
||||
break label_4;
|
||||
}
|
||||
jj_consume_token(CommentText1);
|
||||
}
|
||||
jj_consume_token(CommentEnd1);
|
||||
break;
|
||||
case Comment2:
|
||||
jj_consume_token(Comment2);
|
||||
label_5:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case CommentText2:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[11] = jj_gen;
|
||||
break label_5;
|
||||
}
|
||||
jj_consume_token(CommentText2);
|
||||
}
|
||||
jj_consume_token(CommentEnd2);
|
||||
break;
|
||||
default:
|
||||
jj_la1[12] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
|
||||
final public void ScriptTag() throws ParseException {
|
||||
jj_consume_token(ScriptStart);
|
||||
label_6:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ScriptText:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[13] = jj_gen;
|
||||
break label_6;
|
||||
}
|
||||
jj_consume_token(ScriptText);
|
||||
}
|
||||
jj_consume_token(ScriptEnd);
|
||||
}
|
||||
|
||||
private boolean jj_2_1(int xla) {
|
||||
jj_la = xla; jj_lastpos = jj_scanpos = token;
|
||||
try { return !jj_3_1(); }
|
||||
catch(LookaheadSuccess ls) { return true; }
|
||||
finally { jj_save(0, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_2_2(int xla) {
|
||||
jj_la = xla; jj_lastpos = jj_scanpos = token;
|
||||
try { return !jj_3_2(); }
|
||||
catch(LookaheadSuccess ls) { return true; }
|
||||
finally { jj_save(1, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_3_1() {
|
||||
if (jj_scan_token(ArgQuote1)) return true;
|
||||
if (jj_scan_token(CloseQuote1)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3_2() {
|
||||
if (jj_scan_token(ArgQuote2)) return true;
|
||||
if (jj_scan_token(CloseQuote2)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Generated Token Manager. */
|
||||
public HTMLParserTokenManager token_source;
|
||||
SimpleCharStream jj_input_stream;
|
||||
/** Current token. */
|
||||
public Token token;
|
||||
/** Next token. */
|
||||
public Token jj_nt;
|
||||
private int jj_ntk;
|
||||
private Token jj_scanpos, jj_lastpos;
|
||||
private int jj_la;
|
||||
private int jj_gen;
|
||||
final private int[] jj_la1 = new int[14];
|
||||
static private int[] jj_la1_0;
|
||||
static {
|
||||
jj_la1_init_0();
|
||||
}
|
||||
private static void jj_la1_init_0() {
|
||||
jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
|
||||
}
|
||||
final private JJCalls[] jj_2_rtns = new JJCalls[2];
|
||||
private boolean jj_rescan = false;
|
||||
private int jj_gc = 0;
|
||||
|
||||
/** Constructor with InputStream. */
|
||||
public HTMLParser(java.io.InputStream stream) {
|
||||
this(stream, null);
|
||||
}
|
||||
/** Constructor with InputStream and supplied encoding */
|
||||
public HTMLParser(java.io.InputStream stream, String encoding) {
|
||||
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
||||
token_source = new HTMLParserTokenManager(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream stream) {
|
||||
ReInit(stream, null);
|
||||
}
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream stream, String encoding) {
|
||||
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
||||
token_source.ReInit(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public HTMLParser(java.io.Reader stream) {
|
||||
jj_input_stream = new SimpleCharStream(stream, 1, 1);
|
||||
token_source = new HTMLParserTokenManager(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.Reader stream) {
|
||||
jj_input_stream.ReInit(stream, 1, 1);
|
||||
token_source.ReInit(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Constructor with generated Token Manager. */
|
||||
public HTMLParser(HTMLParserTokenManager tm) {
|
||||
token_source = tm;
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(HTMLParserTokenManager tm) {
|
||||
token_source = tm;
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
private Token jj_consume_token(int kind) throws ParseException {
|
||||
Token oldToken;
|
||||
if ((oldToken = token).next != null) token = token.next;
|
||||
else token = token.next = token_source.getNextToken();
|
||||
jj_ntk = -1;
|
||||
if (token.kind == kind) {
|
||||
jj_gen++;
|
||||
if (++jj_gc > 100) {
|
||||
jj_gc = 0;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) {
|
||||
JJCalls c = jj_2_rtns[i];
|
||||
while (c != null) {
|
||||
if (c.gen < jj_gen) c.first = null;
|
||||
c = c.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
return token;
|
||||
}
|
||||
token = oldToken;
|
||||
jj_kind = kind;
|
||||
throw generateParseException();
|
||||
}
|
||||
|
||||
static private final class LookaheadSuccess extends java.lang.Error { }
|
||||
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
|
||||
private boolean jj_scan_token(int kind) {
|
||||
if (jj_scanpos == jj_lastpos) {
|
||||
jj_la--;
|
||||
if (jj_scanpos.next == null) {
|
||||
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
|
||||
} else {
|
||||
jj_lastpos = jj_scanpos = jj_scanpos.next;
|
||||
}
|
||||
} else {
|
||||
jj_scanpos = jj_scanpos.next;
|
||||
}
|
||||
if (jj_rescan) {
|
||||
int i = 0; Token tok = token;
|
||||
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
|
||||
if (tok != null) jj_add_error_token(kind, i);
|
||||
}
|
||||
if (jj_scanpos.kind != kind) return true;
|
||||
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
|
||||
return false;
|
||||
}
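
jj_scan_token() and the jj_2_/jj_3_ methods above implement speculative lookahead: the scan throws the single preallocated jj_ls instance to bail out as soon as success is certain, which is far cheaper than constructing a fresh exception per probe. The same idiom in isolation (a standalone sketch; it additionally overrides fillInStackTrace, a common refinement the generated code skips since the one instance is reused anyway):

public class CheapExitSketch {
  // One shared instance; no stack trace is ever filled in.
  static final class ScanSuccess extends Error {
    @Override public synchronized Throwable fillInStackTrace() { return this; }
  }
  static final ScanSuccess SUCCESS = new ScanSuccess();

  static int probeDepth;

  static void scanDeep(int depth) {
    if (depth == 0) throw SUCCESS;   // non-local exit straight out of the recursion
    probeDepth++;
    scanDeep(depth - 1);
  }

  public static void main(String[] args) {
    try {
      scanDeep(5);
    } catch (ScanSuccess ok) {
      System.out.println("matched after " + probeDepth + " steps");  // 5
    }
  }
}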
|
||||
|
||||
|
||||
/** Get the next Token. */
|
||||
final public Token getNextToken() {
|
||||
if (token.next != null) token = token.next;
|
||||
else token = token.next = token_source.getNextToken();
|
||||
jj_ntk = -1;
|
||||
jj_gen++;
|
||||
return token;
|
||||
}
|
||||
|
||||
/** Get the specific Token. */
|
||||
final public Token getToken(int index) {
|
||||
Token t = token;
|
||||
for (int i = 0; i < index; i++) {
|
||||
if (t.next != null) t = t.next;
|
||||
else t = t.next = token_source.getNextToken();
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
private int jj_ntk() {
|
||||
if ((jj_nt=token.next) == null)
|
||||
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
|
||||
else
|
||||
return (jj_ntk = jj_nt.kind);
|
||||
}
|
||||
|
||||
private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
|
||||
private int[] jj_expentry;
|
||||
private int jj_kind = -1;
|
||||
private int[] jj_lasttokens = new int[100];
|
||||
private int jj_endpos;
|
||||
|
||||
private void jj_add_error_token(int kind, int pos) {
|
||||
if (pos >= 100) return;
|
||||
if (pos == jj_endpos + 1) {
|
||||
jj_lasttokens[jj_endpos++] = kind;
|
||||
} else if (jj_endpos != 0) {
|
||||
jj_expentry = new int[jj_endpos];
|
||||
for (int i = 0; i < jj_endpos; i++) {
|
||||
jj_expentry[i] = jj_lasttokens[i];
|
||||
}
|
||||
jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
|
||||
int[] oldentry = (int[])(it.next());
|
||||
if (oldentry.length == jj_expentry.length) {
|
||||
for (int i = 0; i < jj_expentry.length; i++) {
|
||||
if (oldentry[i] != jj_expentry[i]) {
|
||||
continue jj_entries_loop;
|
||||
}
|
||||
}
|
||||
jj_expentries.add(jj_expentry);
|
||||
break jj_entries_loop;
|
||||
}
|
||||
}
|
||||
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate ParseException. */
|
||||
public ParseException generateParseException() {
|
||||
jj_expentries.clear();
|
||||
boolean[] la1tokens = new boolean[31];
|
||||
if (jj_kind >= 0) {
|
||||
la1tokens[jj_kind] = true;
|
||||
jj_kind = -1;
|
||||
}
|
||||
for (int i = 0; i < 14; i++) {
|
||||
if (jj_la1[i] == jj_gen) {
|
||||
for (int j = 0; j < 32; j++) {
|
||||
if ((jj_la1_0[i] & (1<<j)) != 0) {
|
||||
la1tokens[j] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 31; i++) {
|
||||
if (la1tokens[i]) {
|
||||
jj_expentry = new int[1];
|
||||
jj_expentry[0] = i;
|
||||
jj_expentries.add(jj_expentry);
|
||||
}
|
||||
}
|
||||
jj_endpos = 0;
|
||||
jj_rescan_token();
|
||||
jj_add_error_token(0, 0);
|
||||
int[][] exptokseq = new int[jj_expentries.size()][];
|
||||
for (int i = 0; i < jj_expentries.size(); i++) {
|
||||
exptokseq[i] = jj_expentries.get(i);
|
||||
}
|
||||
return new ParseException(token, exptokseq, tokenImage);
|
||||
}
|
||||
|
||||
/** Enable tracing. */
|
||||
final public void enable_tracing() {
|
||||
}
|
||||
|
||||
/** Disable tracing. */
|
||||
final public void disable_tracing() {
|
||||
}
|
||||
|
||||
private void jj_rescan_token() {
|
||||
jj_rescan = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
try {
|
||||
JJCalls p = jj_2_rtns[i];
|
||||
do {
|
||||
if (p.gen > jj_gen) {
|
||||
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
|
||||
switch (i) {
|
||||
case 0: jj_3_1(); break;
|
||||
case 1: jj_3_2(); break;
|
||||
}
|
||||
}
|
||||
p = p.next;
|
||||
} while (p != null);
|
||||
} catch(LookaheadSuccess ls) { }
|
||||
}
|
||||
jj_rescan = false;
|
||||
}
|
||||
|
||||
private void jj_save(int index, int xla) {
|
||||
JJCalls p = jj_2_rtns[index];
|
||||
while (p.gen > jj_gen) {
|
||||
if (p.next == null) { p = p.next = new JJCalls(); break; }
|
||||
p = p.next;
|
||||
}
|
||||
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
|
||||
}
|
||||
|
||||
static final class JJCalls {
|
||||
int gen;
|
||||
Token first;
|
||||
int arg;
|
||||
JJCalls next;
|
||||
}
|
||||
|
||||
// void handleException(Exception e) {
|
||||
// System.out.println(e.toString()); // print the error message
|
||||
// System.out.println("Skipping...");
|
||||
// Token t;
|
||||
// do {
|
||||
// t = getNextToken();
|
||||
// } while (t.kind != TagEnd);
|
||||
// }
|
||||
}
@@ -0,0 +1,392 @@
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// HTMLParser.jj
|
||||
|
||||
options {
|
||||
STATIC = false;
|
||||
OPTIMIZE_TOKEN_MANAGER = true;
|
||||
//DEBUG_LOOKAHEAD = true;
|
||||
//DEBUG_TOKEN_MANAGER = true;
|
||||
}
|
||||
|
||||
PARSER_BEGIN(HTMLParser)
|
||||
|
||||
package org.apache.lucene.demo.html;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Properties;
|
||||
|
||||
public class HTMLParser {
|
||||
public static int SUMMARY_LENGTH = 200;
|
||||
|
||||
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
|
||||
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
|
||||
Properties metaTags=new Properties();
|
||||
String currentMetaTag=null;
|
||||
String currentMetaContent=null;
|
||||
int length = 0;
|
||||
boolean titleComplete = false;
|
||||
boolean inTitle = false;
|
||||
boolean inMetaTag = false;
|
||||
boolean inStyle = false;
|
||||
boolean afterTag = false;
|
||||
boolean afterSpace = false;
|
||||
String eol = System.getProperty("line.separator");
|
||||
Reader pipeIn = null;
|
||||
Writer pipeOut;
|
||||
private MyPipedInputStream pipeInStream = null;
|
||||
private PipedOutputStream pipeOutStream = null;
|
||||
|
||||
private class MyPipedInputStream extends PipedInputStream{
|
||||
|
||||
public MyPipedInputStream(){
|
||||
super();
|
||||
}
|
||||
|
||||
public MyPipedInputStream(PipedOutputStream src) throws IOException{
|
||||
super(src);
|
||||
}
|
||||
|
||||
public boolean full() throws IOException{
|
||||
return this.available() >= PipedInputStream.PIPE_SIZE;
|
||||
}
|
||||
}
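
full() is the deadlock guard for the whole pipe scheme: once the pipe's fixed buffer is saturated, the parsing thread blocks in write() and can make no further progress on the title or summary, so the getters treat a full pipe as a signal to stop waiting and return what they have. A small demonstration of the saturation test, assuming the default buffer size of PipedInputStream.PIPE_SIZE (1024 bytes):

import java.io.*;

public class PipeFullDemo {
  public static void main(String[] args) throws IOException {
    PipedInputStream in = new PipedInputStream();
    PipedOutputStream out = new PipedOutputStream(in);
    out.write(new byte[1024]);                    // exactly fills the internal buffer
    System.out.println(in.available() >= 1024);   // true: one more write would block
  }
}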
|
||||
|
||||
/**
|
||||
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||
*/
|
||||
public HTMLParser(File file) throws FileNotFoundException {
|
||||
this(new FileInputStream(file));
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
public Properties getMetaTags() throws IOException,
|
||||
InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return metaTags;
|
||||
}
|
||||
|
||||
|
||||
public String getSummary() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
if (summary.length() > SUMMARY_LENGTH)
|
||||
summary.setLength(SUMMARY_LENGTH);
|
||||
|
||||
String sum = summary.toString().trim();
|
||||
String tit = getTitle();
|
||||
if (sum.startsWith(tit) || sum.equals(""))
|
||||
return tit;
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
|
||||
public Reader getReader() throws IOException {
|
||||
if (pipeIn == null) {
|
||||
pipeInStream = new MyPipedInputStream();
|
||||
pipeOutStream = new PipedOutputStream(pipeInStream);
|
||||
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
|
||||
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
|
||||
|
||||
Thread thread = new ParserThread(this);
|
||||
thread.start(); // start parsing
|
||||
}
|
||||
|
||||
return pipeIn;
|
||||
}
|
||||
|
||||
void addToSummary(String text) {
|
||||
if (summary.length() < SUMMARY_LENGTH) {
|
||||
summary.append(text);
|
||||
if (summary.length() >= SUMMARY_LENGTH) {
|
||||
synchronized(this) {
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void addText(String text) throws IOException {
|
||||
if (inStyle)
|
||||
return;
|
||||
if (inTitle)
|
||||
title.append(text);
|
||||
else {
|
||||
addToSummary(text);
|
||||
if (!titleComplete && title.length() != 0) { // finished title
|
||||
synchronized(this) {
|
||||
titleComplete = true; // tell waiting threads
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
length += text.length();
|
||||
pipeOut.write(text);
|
||||
|
||||
afterSpace = false;
|
||||
}
|
||||
|
||||
void addMetaTag() {
|
||||
metaTags.setProperty(currentMetaTag, currentMetaContent);
|
||||
currentMetaTag = null;
|
||||
currentMetaContent = null;
|
||||
return;
|
||||
}
|
||||
|
||||
void addSpace() throws IOException {
|
||||
if (!afterSpace) {
|
||||
if (inTitle)
|
||||
title.append(" ");
|
||||
else
|
||||
addToSummary(" ");
|
||||
|
||||
String space = afterTag ? eol : " ";
|
||||
length += space.length();
|
||||
pipeOut.write(space);
|
||||
afterSpace = true;
|
||||
}
|
||||
}
|
||||
|
||||
// void handleException(Exception e) {
|
||||
// System.out.println(e.toString()); // print the error message
|
||||
// System.out.println("Skipping...");
|
||||
// Token t;
|
||||
// do {
|
||||
// t = getNextToken();
|
||||
// } while (t.kind != TagEnd);
|
||||
// }
|
||||
}
|
||||
|
||||
PARSER_END(HTMLParser)
|
||||
|
||||
|
||||
void HTMLDocument() throws IOException :
|
||||
{
|
||||
Token t;
|
||||
}
|
||||
{
|
||||
// try {
|
||||
( Tag() { afterTag = true; }
|
||||
| t=Decl() { afterTag = true; }
|
||||
| CommentTag() { afterTag = true; }
|
||||
| ScriptTag() { afterTag = true; }
|
||||
| t=<Word> { addText(t.image); afterTag = false; }
|
||||
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
|
||||
| t=<Punct> { addText(t.image); afterTag = false; }
|
||||
| <Space> { addSpace(); afterTag = false; }
|
||||
)* <EOF>
|
||||
// } catch (ParseException e) {
|
||||
// handleException(e);
|
||||
// }
|
||||
}
|
||||
|
||||
void Tag() throws IOException :
|
||||
{
|
||||
Token t1, t2;
|
||||
boolean inImg = false;
|
||||
}
|
||||
{
|
||||
t1=<TagName> {
|
||||
String tagName = t1.image.toLowerCase();
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
}
|
||||
(t1=<ArgName>
|
||||
(<ArgEquals>
|
||||
(t2=ArgValue() // save ALT text in IMG tag
|
||||
{
|
||||
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
|
||||
addText("[" + t2.image + "]");
|
||||
|
||||
if(inMetaTag &&
|
||||
( t1.image.equalsIgnoreCase("name") ||
|
||||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
|
||||
)
|
||||
&& t2 != null)
|
||||
{
|
||||
currentMetaTag=t2.image.toLowerCase();
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
}
|
||||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||
null)
|
||||
{
|
||||
currentMetaContent=t2.image.toLowerCase();
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
}
|
||||
}
|
||||
)?
|
||||
)?
|
||||
)*
|
||||
<TagEnd>
|
||||
}
|
||||
|
||||
Token ArgValue() :
|
||||
{
|
||||
Token t = null;
|
||||
}
|
||||
{
|
||||
t=<ArgValue> { return t; }
|
||||
| LOOKAHEAD(2)
|
||||
<ArgQuote1> <CloseQuote1> { return t; }
|
||||
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
|
||||
| LOOKAHEAD(2)
|
||||
<ArgQuote2> <CloseQuote2> { return t; }
|
||||
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
|
||||
}
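
The two LOOKAHEAD(2) annotations are what make empty quoted values ('' and "") parseable: the empty and the non-empty alternatives begin with the same quote token, so JavaCC's default one-token lookahead cannot tell them apart, but peeking one token further can. The decision in miniature (token names mirror the grammar; the two-element peek itself is invented for illustration):

import java.util.Arrays;
import java.util.List;

public class Lookahead2Sketch {
  static String classify(List<String> tokens) {
    if (!"ArgQuote1".equals(tokens.get(0)))
      return "unquoted value";
    // The second token settles the ambiguity that one-token lookahead cannot.
    return "CloseQuote1".equals(tokens.get(1)) ? "empty quoted value"
                                               : "non-empty quoted value";
  }

  public static void main(String[] args) {
    System.out.println(classify(Arrays.asList("ArgQuote1", "CloseQuote1")));
    System.out.println(classify(Arrays.asList("ArgQuote1", "Quote1Text", "CloseQuote1")));
  }
}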
|
||||
|
||||
|
||||
Token Decl() :
|
||||
{
|
||||
Token t;
|
||||
}
|
||||
{
|
||||
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
|
||||
{ return t; }
|
||||
}
|
||||
|
||||
|
||||
void CommentTag() :
|
||||
{}
|
||||
{
|
||||
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
||||
|
|
||||
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
|
||||
}
|
||||
|
||||
void ScriptTag() :
|
||||
{}
|
||||
{
|
||||
<ScriptStart> ( <ScriptText> )* <ScriptEnd>
|
||||
}
|
||||
|
||||
|
||||
TOKEN :
|
||||
{
|
||||
< ScriptStart: "<script" > : WithinScript
|
||||
| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
|
||||
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
|
||||
|
||||
| < Comment1: "<!--" > : WithinComment1
|
||||
| < Comment2: "<!" > : WithinComment2
|
||||
|
||||
| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
|
||||
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
|
||||
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
|
||||
| < #NUM: ["0"-"9"] >
|
||||
| < #HEX: ["0"-"9","A"-"F","a"-"f"] >
|
||||
|
||||
| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
|
||||
|
||||
| < Space: (<SP>)+ >
|
||||
| < #SP: [" ","\t","\r","\n"] >
|
||||
|
||||
| < Punct: ~[] > // Keep this last. It is a catch-all.
|
||||
}
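
The Entity token above admits three reference forms, each with an optional trailing semicolon: named (&amp;), decimal (&#38;), and hexadecimal (&#x26;). Entities.decode(), which the HTMLDocument production calls, is not part of this diff; a minimal decoder covering just those three shapes might look like the sketch below (the table contents and the pass-through behavior for unknown names are assumptions):

import java.util.HashMap;
import java.util.Map;

public class EntityDecodeSketch {
  private static final Map<String, Character> NAMED = new HashMap<String, Character>();
  static {
    NAMED.put("amp", '&'); NAMED.put("lt", '<'); NAMED.put("gt", '>'); NAMED.put("quot", '"');
  }

  static String decode(String entity) {
    String body = entity.startsWith("&") ? entity.substring(1) : entity;
    if (body.endsWith(";")) body = body.substring(0, body.length() - 1);
    if (body.startsWith("#x") || body.startsWith("#X"))   // hex reference, e.g. &#x26;
      return String.valueOf((char) Integer.parseInt(body.substring(2), 16));
    if (body.startsWith("#"))                             // decimal reference, e.g. &#38;
      return String.valueOf((char) Integer.parseInt(body.substring(1)));
    Character c = NAMED.get(body);                        // named reference, e.g. &amp;
    return c != null ? String.valueOf(c) : entity;        // unknown: pass through as-is
  }

  public static void main(String[] args) {
    System.out.println(decode("&amp;") + decode("&#38;") + decode("&#x26;"));  // &&&
  }
}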
|
||||
|
||||
<WithinScript> TOKEN:
|
||||
{
|
||||
< ScriptText: (~["<",">"])+ | "<" | ">" >
|
||||
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
|
||||
}
|
||||
|
||||
<WithinTag> TOKEN:
|
||||
{
|
||||
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
|
||||
(~[" ","\t","\r","\n","=",">"])* >
|
||||
| < ArgEquals: "=" > : AfterEquals
|
||||
| < TagEnd: ">" | "=>" > : DEFAULT
|
||||
}
|
||||
|
||||
<AfterEquals> TOKEN:
|
||||
{
|
||||
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
|
||||
(~[" ","\t","\r","\n",">"])* > : WithinTag
|
||||
}
|
||||
|
||||
<WithinTag, AfterEquals> TOKEN:
|
||||
{
|
||||
< ArgQuote1: "'" > : WithinQuote1
|
||||
| < ArgQuote2: "\"" > : WithinQuote2
|
||||
}
|
||||
|
||||
<WithinTag, AfterEquals> SKIP:
|
||||
{
|
||||
< <Space> >
|
||||
}
|
||||
|
||||
<WithinQuote1> TOKEN:
|
||||
{
|
||||
< Quote1Text: (~["'"])+ >
|
||||
| < CloseQuote1: <ArgQuote1> > : WithinTag
|
||||
}
|
||||
|
||||
<WithinQuote2> TOKEN:
|
||||
{
|
||||
< Quote2Text: (~["\""])+ >
|
||||
| < CloseQuote2: <ArgQuote2> > : WithinTag
|
||||
}
|
||||
|
||||
|
||||
<WithinComment1> TOKEN :
|
||||
{
|
||||
< CommentText1: (~["-"])+ | "-" >
|
||||
| < CommentEnd1: "-->" > : DEFAULT
|
||||
}
|
||||
|
||||
<WithinComment2> TOKEN :
|
||||
{
|
||||
< CommentText2: (~[">"])+ >
|
||||
| < CommentEnd2: ">" > : DEFAULT
|
||||
}
@@ -0,0 +1,124 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */
package org.apache.lucene.demo.html;


/**
 * Token literal values and constants.
 * Generated by org.javacc.parser.OtherFilesGen#start()
 */
public interface HTMLParserConstants {

  /** End of File. */
  int EOF = 0;
  /** RegularExpression Id. */
  int ScriptStart = 1;
  /** RegularExpression Id. */
  int TagName = 2;
  /** RegularExpression Id. */
  int DeclName = 3;
  /** RegularExpression Id. */
  int Comment1 = 4;
  /** RegularExpression Id. */
  int Comment2 = 5;
  /** RegularExpression Id. */
  int Word = 6;
  /** RegularExpression Id. */
  int LET = 7;
  /** RegularExpression Id. */
  int NUM = 8;
  /** RegularExpression Id. */
  int HEX = 9;
  /** RegularExpression Id. */
  int Entity = 10;
  /** RegularExpression Id. */
  int Space = 11;
  /** RegularExpression Id. */
  int SP = 12;
  /** RegularExpression Id. */
  int Punct = 13;
  /** RegularExpression Id. */
  int ScriptText = 14;
  /** RegularExpression Id. */
  int ScriptEnd = 15;
  /** RegularExpression Id. */
  int ArgName = 16;
  /** RegularExpression Id. */
  int ArgEquals = 17;
  /** RegularExpression Id. */
  int TagEnd = 18;
  /** RegularExpression Id. */
  int ArgValue = 19;
  /** RegularExpression Id. */
  int ArgQuote1 = 20;
  /** RegularExpression Id. */
  int ArgQuote2 = 21;
  /** RegularExpression Id. */
  int Quote1Text = 23;
  /** RegularExpression Id. */
  int CloseQuote1 = 24;
  /** RegularExpression Id. */
  int Quote2Text = 25;
  /** RegularExpression Id. */
  int CloseQuote2 = 26;
  /** RegularExpression Id. */
  int CommentText1 = 27;
  /** RegularExpression Id. */
  int CommentEnd1 = 28;
  /** RegularExpression Id. */
  int CommentText2 = 29;
  /** RegularExpression Id. */
  int CommentEnd2 = 30;

  /** Lexical state. */
  int DEFAULT = 0;
  /** Lexical state. */
  int WithinScript = 1;
  /** Lexical state. */
  int WithinTag = 2;
  /** Lexical state. */
  int AfterEquals = 3;
  /** Lexical state. */
  int WithinQuote1 = 4;
  /** Lexical state. */
  int WithinQuote2 = 5;
  /** Lexical state. */
  int WithinComment1 = 6;
  /** Lexical state. */
  int WithinComment2 = 7;

  /** Literal token values. */
  String[] tokenImage = {
    "<EOF>",
    "\"<script\"",
    "<TagName>",
    "<DeclName>",
    "\"<!--\"",
    "\"<!\"",
    "<Word>",
    "<LET>",
    "<NUM>",
    "<HEX>",
    "<Entity>",
    "<Space>",
    "<SP>",
    "<Punct>",
    "<ScriptText>",
    "<ScriptEnd>",
    "<ArgName>",
    "\"=\"",
    "<TagEnd>",
    "<ArgValue>",
    "\"\\\'\"",
    "\"\\\"\"",
    "<token of kind 22>",
    "<Quote1Text>",
    "<CloseQuote1>",
    "<Quote2Text>",
    "<CloseQuote2>",
    "<CommentText1>",
    "\"-->\"",
    "<CommentText2>",
    "\">\"",
  };

}
File diff suppressed because it is too large
@@ -0,0 +1,198 @@
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
|
||||
/* JavaCCOptions:KEEP_LINE_COL=null */
|
||||
package org.apache.lucene.demo.html;
|
||||
|
||||
/**
|
||||
* This exception is thrown when parse errors are encountered.
|
||||
* You can explicitly create objects of this exception type by
|
||||
* calling the method generateParseException in the generated
|
||||
* parser.
|
||||
*
|
||||
* You can modify this class to customize your error reporting
|
||||
* mechanisms so long as you retain the public fields.
|
||||
*/
|
||||
public class ParseException extends Exception {
|
||||
|
||||
/**
|
||||
* This constructor is used by the method "generateParseException"
|
||||
* in the generated parser. Calling this constructor generates
|
||||
* a new object of this type with the fields "currentToken",
|
||||
* "expectedTokenSequences", and "tokenImage" set. The boolean
|
||||
* flag "specialConstructor" is also set to true to indicate that
|
||||
* this constructor was used to create this object.
|
||||
* This constructor calls its super class with the empty string
|
||||
* to force the "toString" method of parent class "Throwable" to
|
||||
* print the error message in the form:
|
||||
* ParseException: <result of getMessage>
|
||||
*/
|
||||
public ParseException(Token currentTokenVal,
|
||||
int[][] expectedTokenSequencesVal,
|
||||
String[] tokenImageVal
|
||||
)
|
||||
{
|
||||
super("");
|
||||
specialConstructor = true;
|
||||
currentToken = currentTokenVal;
|
||||
expectedTokenSequences = expectedTokenSequencesVal;
|
||||
tokenImage = tokenImageVal;
|
||||
}
|
||||
|
||||
/**
|
||||
* The following constructors are for use by you for whatever
|
||||
* purpose you can think of. Constructing the exception in this
|
||||
* manner makes the exception behave in the normal way - i.e., as
|
||||
* documented in the class "Throwable". The fields "errorToken",
|
||||
* "expectedTokenSequences", and "tokenImage" do not contain
|
||||
* relevant information. The JavaCC generated code does not use
|
||||
* these constructors.
|
||||
*/
|
||||
|
||||
public ParseException() {
|
||||
super();
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
/** Constructor with message. */
|
||||
public ParseException(String message) {
|
||||
super(message);
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* This variable determines which constructor was used to create
|
||||
* this object and thereby affects the semantics of the
|
||||
* "getMessage" method (see below).
|
||||
*/
|
||||
protected boolean specialConstructor;
|
||||
|
||||
/**
|
||||
* This is the last token that has been consumed successfully. If
|
||||
* this object has been created due to a parse error, the token
|
||||
* following this token will (therefore) be the first error token.
|
||||
*/
|
||||
public Token currentToken;
|
||||
|
||||
/**
|
||||
* Each entry in this array is an array of integers. Each array
|
||||
* of integers represents a sequence of tokens (by their ordinal
|
||||
* values) that is expected at this point of the parse.
|
||||
*/
|
||||
public int[][] expectedTokenSequences;
|
||||
|
||||
/**
|
||||
* This is a reference to the "tokenImage" array of the generated
|
||||
* parser within which the parse error occurred. This array is
|
||||
* defined in the generated ...Constants interface.
|
||||
*/
|
||||
public String[] tokenImage;
|
||||
|
||||
/**
|
||||
* This method has the standard behavior when this object has been
|
||||
* created using the standard constructors. Otherwise, it uses
|
||||
* "currentToken" and "expectedTokenSequences" to generate a parse
|
||||
* error message and returns it. If this object has been created
|
||||
* due to a parse error, and you do not catch it (it gets thrown
|
||||
* from the parser), then this method is called during the printing
|
||||
* of the final stack trace, and hence the correct error message
|
||||
* gets displayed.
|
||||
*/
|
||||
public String getMessage() {
|
||||
if (!specialConstructor) {
|
||||
return super.getMessage();
|
||||
}
|
||||
StringBuffer expected = new StringBuffer();
|
||||
int maxSize = 0;
|
||||
for (int i = 0; i < expectedTokenSequences.length; i++) {
|
||||
if (maxSize < expectedTokenSequences[i].length) {
|
||||
maxSize = expectedTokenSequences[i].length;
|
||||
}
|
||||
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
|
||||
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
|
||||
}
|
||||
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
|
||||
expected.append("...");
|
||||
}
|
||||
expected.append(eol).append(" ");
|
||||
}
|
||||
String retval = "Encountered \"";
|
||||
Token tok = currentToken.next;
|
||||
for (int i = 0; i < maxSize; i++) {
|
||||
if (i != 0) retval += " ";
|
||||
if (tok.kind == 0) {
|
||||
retval += tokenImage[0];
|
||||
break;
|
||||
}
|
||||
retval += " " + tokenImage[tok.kind];
|
||||
retval += " \"";
|
||||
retval += add_escapes(tok.image);
|
||||
retval += " \"";
|
||||
tok = tok.next;
|
||||
}
|
||||
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
|
||||
retval += "." + eol;
|
||||
if (expectedTokenSequences.length == 1) {
|
||||
retval += "Was expecting:" + eol + " ";
|
||||
} else {
|
||||
retval += "Was expecting one of:" + eol + " ";
|
||||
}
|
||||
retval += expected.toString();
|
||||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
* The end of line string for this machine.
|
||||
*/
|
||||
protected String eol = System.getProperty("line.separator", "\n");
|
||||
|
||||
/**
|
||||
* Used to convert raw characters to their escaped version
|
||||
* when these raw versions cannot be used as part of an ASCII
|
||||
* string literal.
|
||||
*/
|
||||
protected String add_escapes(String str) {
|
||||
StringBuffer retval = new StringBuffer();
|
||||
char ch;
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
switch (str.charAt(i))
|
||||
{
|
||||
case 0 :
|
||||
continue;
|
||||
case '\b':
|
||||
retval.append("\\b");
|
||||
continue;
|
||||
case '\t':
|
||||
retval.append("\\t");
|
||||
continue;
|
||||
case '\n':
|
||||
retval.append("\\n");
|
||||
continue;
|
||||
case '\f':
|
||||
retval.append("\\f");
|
||||
continue;
|
||||
case '\r':
|
||||
retval.append("\\r");
|
||||
continue;
|
||||
case '\"':
|
||||
retval.append("\\\"");
|
||||
continue;
|
||||
case '\'':
|
||||
retval.append("\\\'");
|
||||
continue;
|
||||
case '\\':
|
||||
retval.append("\\\\");
|
||||
continue;
|
||||
default:
|
||||
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
|
||||
String s = "0000" + Integer.toString(ch, 16);
|
||||
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
|
||||
} else {
|
||||
retval.append(ch);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return retval.toString();
|
||||
}
|
||||
|
||||
}
|
||||
/* JavaCC - OriginalChecksum=63b2008c66e199b79536447c26bee2ab (do not edit this line) */
@@ -0,0 +1,50 @@
package org.apache.lucene.demo.html;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;

class ParserThread extends Thread {
  HTMLParser parser;

  ParserThread(HTMLParser p) {
    parser = p;
  }

  @Override
  public void run() { // convert pipeOut to pipeIn
    try {
      try { // parse document to pipeOut
        parser.HTMLDocument();
      } catch (ParseException e) {
        System.out.println("Parse Aborted: " + e.getMessage());
      } catch (TokenMgrError e) {
        System.out.println("Parse Aborted: " + e.getMessage());
      } finally {
        parser.pipeOut.close();
        synchronized (parser) {
          parser.summary.setLength(HTMLParser.SUMMARY_LENGTH);
          parser.titleComplete = true;
          parser.notifyAll();
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
@@ -0,0 +1,472 @@
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.1 */
|
||||
/* JavaCCOptions:STATIC=false */
|
||||
package org.apache.lucene.demo.html;
|
||||
|
||||
/**
|
||||
* An implementation of interface CharStream, where the stream is assumed to
|
||||
* contain only ASCII characters (without unicode processing).
|
||||
*/
|
||||
|
||||
public class SimpleCharStream
|
||||
{
|
||||
/** Whether parser is static. */
|
||||
public static final boolean staticFlag = false;
|
||||
int bufsize;
|
||||
int available;
|
||||
int tokenBegin;
|
||||
/** Position in buffer. */
|
||||
public int bufpos = -1;
|
||||
protected int bufline[];
|
||||
protected int bufcolumn[];
|
||||
|
||||
protected int column = 0;
|
||||
protected int line = 1;
|
||||
|
||||
protected boolean prevCharIsCR = false;
|
||||
protected boolean prevCharIsLF = false;
|
||||
|
||||
protected java.io.Reader inputStream;
|
||||
|
||||
protected char[] buffer;
|
||||
protected int maxNextCharInd = 0;
|
||||
protected int inBuf = 0;
|
||||
protected int tabSize = 8;
|
||||
|
||||
protected void setTabSize(int i) { tabSize = i; }
|
||||
protected int getTabSize(int i) { return tabSize; }
|
||||
|
||||
|
||||
protected void ExpandBuff(boolean wrapAround)
|
||||
{
|
||||
char[] newbuffer = new char[bufsize + 2048];
|
||||
int newbufline[] = new int[bufsize + 2048];
|
||||
int newbufcolumn[] = new int[bufsize + 2048];
|
||||
|
||||
try
|
||||
{
|
||||
if (wrapAround)
|
||||
{
|
||||
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
|
||||
System.arraycopy(buffer, 0, newbuffer,
|
||||
bufsize - tokenBegin, bufpos);
|
||||
buffer = newbuffer;
|
||||
|
||||
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
|
||||
System.arraycopy(bufline, 0, newbufline, bufsize - tokenBegin, bufpos);
|
||||
bufline = newbufline;
|
||||
|
||||
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
|
||||
System.arraycopy(bufcolumn, 0, newbufcolumn, bufsize - tokenBegin, bufpos);
|
||||
bufcolumn = newbufcolumn;
|
||||
|
||||
maxNextCharInd = (bufpos += (bufsize - tokenBegin));
|
||||
}
|
||||
else
|
||||
{
|
||||
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
|
||||
buffer = newbuffer;
|
||||
|
||||
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
|
||||
bufline = newbufline;
|
||||
|
||||
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
|
||||
bufcolumn = newbufcolumn;
|
||||
|
||||
maxNextCharInd = (bufpos -= tokenBegin);
|
||||
}
|
||||
}
|
||||
catch (Throwable t)
|
||||
{
|
||||
throw new Error(t.getMessage());
|
||||
}
|
||||
|
||||
|
||||
bufsize += 2048;
|
||||
available = bufsize;
|
||||
tokenBegin = 0;
|
||||
}
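
ExpandBuff() grows the circular buffer by 2048 and, in the wrapAround case, straightens out a token that currently spans the seam: the segment from tokenBegin to the end of the array and the wrapped segment at the front are copied back-to-back into the new array, so the token starts at index 0 again. The unwrap step in isolation (a standalone sketch; the parameter names shadow the fields above but share nothing with them):

public class UnwrapSketch {
  // Copies a region that wraps the end of a ring buffer into the front of a larger one.
  static char[] growUnwrapped(char[] buf, int tokenBegin, int bufpos) {
    char[] bigger = new char[buf.length + 2048];
    int tail = buf.length - tokenBegin;                 // tokenBegin .. end of array
    System.arraycopy(buf, tokenBegin, bigger, 0, tail);
    System.arraycopy(buf, 0, bigger, tail, bufpos);     // wrapped part: 0 .. bufpos
    return bigger;                                      // token now starts at index 0
  }

  public static void main(String[] args) {
    char[] ring = {'l', 'o', '?', '?', 'h', 'e', 'l'};  // "hello" wrapped at index 4
    System.out.println(new String(growUnwrapped(ring, 4, 2), 0, 5));  // hello
  }
}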
|
||||
|
||||
protected void FillBuff() throws java.io.IOException
|
||||
{
|
||||
if (maxNextCharInd == available)
|
||||
{
|
||||
if (available == bufsize)
|
||||
{
|
||||
if (tokenBegin > 2048)
|
||||
{
|
||||
bufpos = maxNextCharInd = 0;
|
||||
available = tokenBegin;
|
||||
}
|
||||
else if (tokenBegin < 0)
|
||||
bufpos = maxNextCharInd = 0;
|
||||
else
|
||||
ExpandBuff(false);
|
||||
}
|
||||
else if (available > tokenBegin)
|
||||
available = bufsize;
|
||||
else if ((tokenBegin - available) < 2048)
|
||||
ExpandBuff(true);
|
||||
else
|
||||
available = tokenBegin;
|
||||
}
|
||||
|
||||
int i;
|
||||
try {
|
||||
if ((i = inputStream.read(buffer, maxNextCharInd,
|
||||
available - maxNextCharInd)) == -1)
|
||||
{
|
||||
inputStream.close();
|
||||
throw new java.io.IOException();
|
||||
}
|
||||
else
|
||||
maxNextCharInd += i;
|
||||
return;
|
||||
}
|
||||
catch(java.io.IOException e) {
|
||||
--bufpos;
|
||||
backup(0);
|
||||
if (tokenBegin == -1)
|
||||
tokenBegin = bufpos;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/** Start. */
|
||||
public char BeginToken() throws java.io.IOException
|
||||
{
|
||||
tokenBegin = -1;
|
||||
char c = readChar();
|
||||
tokenBegin = bufpos;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
protected void UpdateLineColumn(char c)
|
||||
{
|
||||
column++;
|
||||
|
||||
if (prevCharIsLF)
|
||||
{
|
||||
prevCharIsLF = false;
|
||||
line += (column = 1);
|
||||
}
|
||||
else if (prevCharIsCR)
|
||||
{
|
||||
prevCharIsCR = false;
|
||||
if (c == '\n')
|
||||
{
|
||||
prevCharIsLF = true;
|
||||
}
|
||||
else
|
||||
line += (column = 1);
|
||||
}
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case '\r' :
|
||||
prevCharIsCR = true;
|
||||
break;
|
||||
case '\n' :
|
||||
prevCharIsLF = true;
|
||||
break;
|
||||
case '\t' :
|
||||
column--;
|
||||
column += (tabSize - (column % tabSize));
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
}
|
||||
|
||||
bufline[bufpos] = line;
|
||||
bufcolumn[bufpos] = column;
|
||||
}
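
The tab case in UpdateLineColumn() first undoes the unconditional column++ and then rounds the column up to the next multiple of tabSize, which is what "column += (tabSize - (column % tabSize))" amounts to. Worked out for tabSize = 8:

public class TabStopDemo {
  public static void main(String[] args) {
    int tabSize = 8;
    for (int column : new int[] {1, 3, 8, 9}) {
      int after = column + (tabSize - (column % tabSize));
      System.out.println("tab at column " + column + " -> next char at column " + after);
    }
    // 1 -> 8, 3 -> 8, 8 -> 16, 9 -> 16: always the next tab stop
  }
}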
|
||||
|
||||
/** Read a character. */
|
||||
public char readChar() throws java.io.IOException
|
||||
{
|
||||
if (inBuf > 0)
|
||||
{
|
||||
--inBuf;
|
||||
|
||||
if (++bufpos == bufsize)
|
||||
bufpos = 0;
|
||||
|
||||
return buffer[bufpos];
|
||||
}
|
||||
|
||||
if (++bufpos >= maxNextCharInd)
|
||||
FillBuff();
|
||||
|
||||
char c = buffer[bufpos];
|
||||
|
||||
UpdateLineColumn(c);
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated
|
||||
* @see #getEndColumn
|
||||
*/
|
||||
|
||||
public int getColumn() {
|
||||
return bufcolumn[bufpos];
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated
|
||||
* @see #getEndLine
|
||||
*/
|
||||
|
||||
public int getLine() {
|
||||
return bufline[bufpos];
|
||||
}
|
||||
|
||||
/** Get token end column number. */
|
||||
public int getEndColumn() {
|
||||
return bufcolumn[bufpos];
|
||||
}
|
||||
|
||||
/** Get token end line number. */
|
||||
public int getEndLine() {
|
||||
return bufline[bufpos];
|
||||
}
|
||||
|
||||
/** Get token beginning column number. */
|
||||
public int getBeginColumn() {
|
||||
return bufcolumn[tokenBegin];
|
||||
}
|
||||
|
||||
/** Get token beginning line number. */
|
||||
public int getBeginLine() {
|
||||
return bufline[tokenBegin];
|
||||
}
|
||||
|
||||
/** Backup a number of characters. */
|
||||
public void backup(int amount) {
|
||||
|
||||
inBuf += amount;
|
||||
if ((bufpos -= amount) < 0)
|
||||
bufpos += bufsize;
|
||||
}
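
backup() rewinds through the ring: inBuf counts how many already-seen characters readChar() must replay, and a negative bufpos wraps to the tail by adding bufsize. The wrap arithmetic as a pure function:

public class RingRewindSketch {
  static int rewind(int bufpos, int amount, int bufsize) {
    bufpos -= amount;
    if (bufpos < 0)
      bufpos += bufsize;   // wrap past the start of the ring buffer
    return bufpos;
  }

  public static void main(String[] args) {
    System.out.println(rewind(10, 3, 4096));  // 7: plain rewind
    System.out.println(rewind(2, 5, 4096));   // 4093: wraps to the buffer tail
  }
}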
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.Reader dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
inputStream = dstream;
|
||||
line = startline;
|
||||
column = startcolumn - 1;
|
||||
|
||||
available = bufsize = buffersize;
|
||||
buffer = new char[buffersize];
|
||||
bufline = new int[buffersize];
|
||||
bufcolumn = new int[buffersize];
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.Reader dstream, int startline,
|
||||
int startcolumn)
|
||||
{
|
||||
this(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.Reader dstream)
|
||||
{
|
||||
this(dstream, 1, 1, 4096);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.Reader dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
inputStream = dstream;
|
||||
line = startline;
|
||||
column = startcolumn - 1;
|
||||
|
||||
if (buffer == null || buffersize != buffer.length)
|
||||
{
|
||||
available = bufsize = buffersize;
|
||||
buffer = new char[buffersize];
|
||||
bufline = new int[buffersize];
|
||||
bufcolumn = new int[buffersize];
|
||||
}
|
||||
prevCharIsLF = prevCharIsCR = false;
|
||||
tokenBegin = inBuf = maxNextCharInd = 0;
|
||||
bufpos = -1;
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.Reader dstream, int startline,
|
||||
int startcolumn)
|
||||
{
|
||||
ReInit(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.Reader dstream)
|
||||
{
|
||||
ReInit(dstream, 1, 1, 4096);
|
||||
}
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(dstream, encoding, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream, int startline,
|
||||
int startcolumn)
|
||||
{
|
||||
this(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(dstream, encoding, 1, 1, 4096);
|
||||
}
|
||||
|
||||
/** Constructor. */
|
||||
public SimpleCharStream(java.io.InputStream dstream)
|
||||
{
|
||||
this(dstream, 1, 1, 4096);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(dstream, encoding, 1, 1, 4096);
|
||||
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream)
|
||||
{
|
||||
ReInit(dstream, 1, 1, 4096);
|
||||
}
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(dstream, encoding, startline, startcolumn, 4096);
|
||||
}
|
||||
/** Reinitialise. */
|
||||
public void ReInit(java.io.InputStream dstream, int startline,
|
||||
int startcolumn)
|
||||
{
|
||||
ReInit(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
/** Get token literal value. */
|
||||
public String GetImage()
|
||||
{
|
||||
if (bufpos >= tokenBegin)
|
||||
return new String(buffer, tokenBegin, bufpos - tokenBegin + 1);
|
||||
else
|
||||
return new String(buffer, tokenBegin, bufsize - tokenBegin) +
|
||||
new String(buffer, 0, bufpos + 1);
|
||||
}
|
||||
|
||||
/** Get the suffix. */
|
||||
public char[] GetSuffix(int len)
|
||||
{
|
||||
char[] ret = new char[len];
|
||||
|
||||
if ((bufpos + 1) >= len)
|
||||
System.arraycopy(buffer, bufpos - len + 1, ret, 0, len);
|
||||
else
|
||||
{
|
||||
System.arraycopy(buffer, bufsize - (len - bufpos - 1), ret, 0,
|
||||
len - bufpos - 1);
|
||||
System.arraycopy(buffer, 0, ret, len - bufpos - 1, bufpos + 1);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Reset buffer when finished. */
|
||||
public void Done()
|
||||
{
|
||||
buffer = null;
|
||||
bufline = null;
|
||||
bufcolumn = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to adjust line and column numbers for the start of a token.
|
||||
*/
|
||||
public void adjustBeginLineColumn(int newLine, int newCol)
|
||||
{
|
||||
int start = tokenBegin;
|
||||
int len;
|
||||
|
||||
if (bufpos >= tokenBegin)
|
||||
{
|
||||
len = bufpos - tokenBegin + inBuf + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
len = bufsize - tokenBegin + bufpos + 1 + inBuf;
|
||||
}
|
||||
|
||||
int i = 0, j = 0, k = 0;
|
||||
int nextColDiff = 0, columnDiff = 0;
|
||||
|
||||
while (i < len &&
|
||||
bufline[j = start % bufsize] == bufline[k = ++start % bufsize])
|
||||
{
|
||||
bufline[j] = newLine;
|
||||
nextColDiff = columnDiff + bufcolumn[k] - bufcolumn[j];
|
||||
bufcolumn[j] = newCol + columnDiff;
|
||||
columnDiff = nextColDiff;
|
||||
i++;
|
||||
}
|
||||
|
||||
if (i < len)
|
||||
{
|
||||
bufline[j] = newLine++;
|
||||
bufcolumn[j] = newCol + columnDiff;
|
||||
|
||||
while (i++ < len)
|
||||
{
|
||||
if (bufline[j = start % bufsize] != bufline[++start % bufsize])
|
||||
bufline[j] = newLine++;
|
||||
else
|
||||
bufline[j] = newLine;
|
||||
}
|
||||
}
|
||||
|
||||
line = bufline[j];
|
||||
column = bufcolumn[j];
|
||||
}
|
||||
|
||||
}
|
||||
/* JavaCC - OriginalChecksum=7393ed4ac2709e2de22d164f9db78b65 (do not edit this line) */
@@ -0,0 +1,64 @@
package org.apache.lucene.demo.html;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;


public final class Tags {

  /**
   * Contains all tags for which whitespace has to be inserted for proper tokenization.
   */
  public static final Set<String> WS_ELEMS = Collections.synchronizedSet(new HashSet<String>());

  static {
    WS_ELEMS.add("<hr");
    WS_ELEMS.add("<hr/");  // note that "<hr />" does not need to be listed explicitly
    WS_ELEMS.add("<br");
    WS_ELEMS.add("<br/");
    WS_ELEMS.add("<p");
    WS_ELEMS.add("</p");
    WS_ELEMS.add("<div");
    WS_ELEMS.add("</div");
    WS_ELEMS.add("<td");
    WS_ELEMS.add("</td");
    WS_ELEMS.add("<li");
    WS_ELEMS.add("</li");
    WS_ELEMS.add("<q");
    WS_ELEMS.add("</q");
    WS_ELEMS.add("<blockquote");
    WS_ELEMS.add("</blockquote");
    WS_ELEMS.add("<dt");
    WS_ELEMS.add("</dt");
    WS_ELEMS.add("<h1");
    WS_ELEMS.add("</h1");
    WS_ELEMS.add("<h2");
    WS_ELEMS.add("</h2");
    WS_ELEMS.add("<h3");
    WS_ELEMS.add("</h3");
    WS_ELEMS.add("<h4");
    WS_ELEMS.add("</h4");
    WS_ELEMS.add("<h5");
    WS_ELEMS.add("</h5");
    WS_ELEMS.add("<h6");
    WS_ELEMS.add("</h6");
  }
}
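
Tags themselves never reach the output, so without this table the contents of adjacent block elements would run together: "a</td><td>b" would come out as "ab". For every tag in WS_ELEMS the Tag() production calls addSpace() first, which inserts a space or line separator. A sketch of the effect, assuming the demo classes in this package are compiled together (exact output spacing may vary with the eol setting):

package org.apache.lucene.demo.html;

import java.io.*;

public class WsElemsDemo {
  public static void main(String[] args) throws IOException {
    HTMLParser parser = new HTMLParser(
        new StringReader("<table><tr><td>a</td><td>b</td></tr></table>"));
    BufferedReader reader = new BufferedReader(parser.getReader());
    for (String line = reader.readLine(); line != null; line = reader.readLine())
      System.out.println(line);   // "a" and "b" arrive separated, not glued into "ab"
  }
}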
@@ -0,0 +1,51 @@
package org.apache.lucene.demo.html;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;

class Test {
  public static void main(String[] argv) throws IOException, InterruptedException {
    if ("-dir".equals(argv[0])) {
      String[] files = new File(argv[1]).list();
      java.util.Arrays.sort(files);
      for (int i = 0; i < files.length; i++) {
        System.err.println(files[i]);
        File file = new File(argv[1], files[i]);
        parse(file);
      }
    } else
      parse(new File(argv[0]));
  }

  public static void parse(File file) throws IOException, InterruptedException {
    FileInputStream fis = null;
    try {
      fis = new FileInputStream(file);
      HTMLParser parser = new HTMLParser(fis);
      System.out.println("Title: " + Entities.encode(parser.getTitle()));
      System.out.println("Summary: " + Entities.encode(parser.getSummary()));
      System.out.println("Content:");
      LineNumberReader reader = new LineNumberReader(parser.getReader());
      for (String l = reader.readLine(); l != null; l = reader.readLine())
        System.out.println(l);
    } finally {
      if (fis != null) fis.close();
    }
  }
}
|
|
@ -0,0 +1,124 @@
/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
/* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null */
package org.apache.lucene.demo.html;

/**
 * Describes the input token stream.
 */

public class Token {

  /**
   * An integer that describes the kind of this token.  This numbering
   * system is determined by JavaCCParser, and a table of these numbers is
   * stored in the file ...Constants.java.
   */
  public int kind;

  /** The line number of the first character of this Token. */
  public int beginLine;
  /** The column number of the first character of this Token. */
  public int beginColumn;
  /** The line number of the last character of this Token. */
  public int endLine;
  /** The column number of the last character of this Token. */
  public int endColumn;

  /**
   * The string image of the token.
   */
  public String image;

  /**
   * A reference to the next regular (non-special) token from the input
   * stream.  If this is the last token from the input stream, or if the
   * token manager has not read tokens beyond this one, this field is
   * set to null.  This is true only if this token is also a regular
   * token.  Otherwise, see below for a description of the contents of
   * this field.
   */
  public Token next;

  /**
   * This field is used to access special tokens that occur prior to this
   * token, but after the immediately preceding regular (non-special) token.
   * If there are no such special tokens, this field is set to null.
   * When there are more than one such special token, this field refers
   * to the last of these special tokens, which in turn refers to the next
   * previous special token through its specialToken field, and so on
   * until the first special token (whose specialToken field is null).
   * The next fields of special tokens refer to other special tokens that
   * immediately follow it (without an intervening regular token).  If there
   * is no such token, this field is null.
   */
  public Token specialToken;

  /**
   * An optional attribute value of the Token.
   * Tokens which are not used as syntactic sugar will often contain
   * meaningful values that will be used later on by the compiler or
   * interpreter. This attribute value is often different from the image.
   * Any subclass of Token that actually wants to return a non-null value can
   * override this method as appropriate.
   */
  public Object getValue() {
    return null;
  }

  /**
   * No-argument constructor
   */
  public Token() {}

  /**
   * Constructs a new token for the specified Kind.
   */
  public Token(int kind)
  {
    this(kind, null);
  }

  /**
   * Constructs a new token for the specified Image and Kind.
   */
  public Token(int kind, String image)
  {
    this.kind = kind;
    this.image = image;
  }

  /**
   * Returns the image.
   */
  public String toString()
  {
    return image;
  }

  /**
   * Returns a new Token object, by default. However, if you want, you
   * can create and return subclass objects based on the value of ofKind.
   * Simply add the cases to the switch for all those special cases.
   * For example, if you have a subclass of Token called IDToken that
   * you want to create if ofKind is ID, simply add something like :
   *
   *    case MyParserConstants.ID : return new IDToken(ofKind, image);
   *
   * to the following switch statement. Then you can cast the matchedToken
   * variable to the appropriate type and use it in your lexical actions.
   */
  public static Token newToken(int ofKind, String image)
  {
    switch(ofKind)
    {
      default : return new Token(ofKind, image);
    }
  }

  public static Token newToken(int ofKind)
  {
    return newToken(ofKind, null);
  }

}
/* JavaCC - OriginalChecksum=7bf8bdbb1c45bccd8162cdd48316d5e0 (do not edit this line) */
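The newToken Javadoc above describes how subclasses plug in; a sketch following that recipe (the IDToken class and the parser constant are assumptions for illustration, not part of this commit):

// Hypothetical Token subclass that carries a non-null attribute value.
class IDToken extends Token {
  IDToken(int kind, String image) {
    super(kind, image);
  }

  @Override
  public Object getValue() {
    return image;      // illustrative: expose the token image as the value
  }
}

// The corresponding (assumed) edit inside newToken(int ofKind, String image):
//   switch (ofKind) {
//     case SOME_PARSER_CONSTANT : return new IDToken(ofKind, image);
//     default : return new Token(ofKind, image);
//   }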
@ -0,0 +1,141 @@
/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 4.1 */
/* JavaCCOptions: */
package org.apache.lucene.demo.html;

/** Token Manager Error. */
@SuppressWarnings("serial")
public class TokenMgrError extends Error
{

  /*
   * Ordinals for various reasons why an Error of this type can be thrown.
   */

  /**
   * Lexical error occurred.
   */
  static final int LEXICAL_ERROR = 0;

  /**
   * An attempt was made to create a second instance of a static token manager.
   */
  static final int STATIC_LEXER_ERROR = 1;

  /**
   * Tried to change to an invalid lexical state.
   */
  static final int INVALID_LEXICAL_STATE = 2;

  /**
   * Detected (and bailed out of) an infinite loop in the token manager.
   */
  static final int LOOP_DETECTED = 3;

  /**
   * Indicates the reason why the exception is thrown. It will have
   * one of the above 4 values.
   */
  int errorCode;

  /**
   * Replaces unprintable characters by their escaped (or unicode escaped)
   * equivalents in the given string
   */
  protected static final String addEscapes(String str) {
    StringBuffer retval = new StringBuffer();
    char ch;
    for (int i = 0; i < str.length(); i++) {
      switch (str.charAt(i))
      {
        case 0 :
          continue;
        case '\b':
          retval.append("\\b");
          continue;
        case '\t':
          retval.append("\\t");
          continue;
        case '\n':
          retval.append("\\n");
          continue;
        case '\f':
          retval.append("\\f");
          continue;
        case '\r':
          retval.append("\\r");
          continue;
        case '\"':
          retval.append("\\\"");
          continue;
        case '\'':
          retval.append("\\\'");
          continue;
        case '\\':
          retval.append("\\\\");
          continue;
        default:
          if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
            String s = "0000" + Integer.toString(ch, 16);
            retval.append("\\u" + s.substring(s.length() - 4, s.length()));
          } else {
            retval.append(ch);
          }
          continue;
      }
    }
    return retval.toString();
  }

  /**
   * Returns a detailed message for the Error when it is thrown by the
   * token manager to indicate a lexical error.
   * Parameters :
   *    EOFSeen     : indicates if EOF caused the lexical error
   *    curLexState : lexical state in which this error occurred
   *    errorLine   : line number when the error occurred
   *    errorColumn : column number when the error occurred
   *    errorAfter  : prefix that was seen before this error occurred
   *    curchar     : the offending character
   * Note: You can customize the lexical error message by modifying this method.
   */
  protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
    return("Lexical error at line " +
          errorLine + ", column " +
          errorColumn + ".  Encountered: " +
          (EOFSeen ? "<EOF> " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") +
          "after : \"" + addEscapes(errorAfter) + "\"");
  }

  /**
   * You can also modify the body of this method to customize your error messages.
   * For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
   * of end-users concern, so you can return something like :
   *
   *     "Internal Error : Please file a bug report .... "
   *
   * from this method for such cases in the release version of your parser.
   */
  public String getMessage() {
    return super.getMessage();
  }

  /*
   * Constructors of various flavors follow.
   */

  /** No arg constructor. */
  public TokenMgrError() {
  }

  /** Constructor with message and reason. */
  public TokenMgrError(String message, int reason) {
    super(message);
    errorCode = reason;
  }

  /** Full Constructor. */
  public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
    this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
  }
}
/* JavaCC - OriginalChecksum=5ffb7e46d5ae93d8d59e6f4ae7eb36d1 (do not edit this line) */
@ -0,0 +1,29 @@
package org.apache.lucene;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Lucene's package information, including version. **/
public final class LucenePackage {

  private LucenePackage() {}                      // can't construct

  /** Return Lucene's package, including version information. */
  public static Package get() {
    return LucenePackage.class.getPackage();
  }
}
File diff suppressed because it is too large
@ -0,0 +1,144 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Method;

import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.store.AlreadyClosedException;

import org.apache.lucene.document.Fieldable;

/** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
 *  policy for extracting index terms from text.
 *  <p>
 *  Typical implementations first build a Tokenizer, which breaks the stream of
 *  characters from the Reader into raw Tokens.  One or more TokenFilters may
 *  then be applied to the output of the Tokenizer.
 */
public abstract class Analyzer implements Closeable {
  /** Creates a TokenStream which tokenizes all the text in the provided
   * Reader.  Must be able to handle null field name for
   * backward compatibility.
   */
  public abstract TokenStream tokenStream(String fieldName, Reader reader);

  /** Creates a TokenStream that is allowed to be re-used
   *  from the previous time that the same thread called
   *  this method.  Callers that do not need to use more
   *  than one TokenStream at the same time from this
   *  analyzer should use this method for better
   *  performance.
   */
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    return tokenStream(fieldName, reader);
  }

  private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();

  /** Used by Analyzers that implement reusableTokenStream
   *  to retrieve previously saved TokenStreams for re-use
   *  by the same thread. */
  protected Object getPreviousTokenStream() {
    try {
      return tokenStreams.get();
    } catch (NullPointerException npe) {
      if (tokenStreams == null) {
        throw new AlreadyClosedException("this Analyzer is closed");
      } else {
        throw npe;
      }
    }
  }

  /** Used by Analyzers that implement reusableTokenStream
   *  to save a TokenStream for later re-use by the same
   *  thread. */
  protected void setPreviousTokenStream(Object obj) {
    try {
      tokenStreams.set(obj);
    } catch (NullPointerException npe) {
      if (tokenStreams == null) {
        throw new AlreadyClosedException("this Analyzer is closed");
      } else {
        throw npe;
      }
    }
  }

  /** @deprecated */
  protected boolean overridesTokenStreamMethod = false;

  /** @deprecated This is only present to preserve
   *  back-compat of classes that subclass a core analyzer
   *  and override tokenStream but not reusableTokenStream */
  protected void setOverridesTokenStreamMethod(Class<? extends Analyzer> baseClass) {
    try {
      Method m = this.getClass().getMethod("tokenStream", String.class, Reader.class);
      overridesTokenStreamMethod = m.getDeclaringClass() != baseClass;
    } catch (NoSuchMethodException nsme) {
      // cannot happen, as baseClass is a subclass of Analyzer through generics
      overridesTokenStreamMethod = false;
    }
  }


  /**
   * Invoked before indexing a Fieldable instance if
   * terms have already been added to that field.  This allows custom
   * analyzers to place an automatic position increment gap between
   * Fieldable instances using the same field name.  The default
   * position increment gap is 0.  With a 0 position increment gap and
   * the typical default token position increment of 1, all terms in a field,
   * including across Fieldable instances, are in successive positions, allowing
   * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
   *
   * @param fieldName Fieldable name being indexed.
   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   */
  public int getPositionIncrementGap(String fieldName) {
    return 0;
  }

  /**
   * Just like {@link #getPositionIncrementGap}, except for
   * Token offsets instead.  By default this returns 1 for
   * tokenized fields, as if the fields were joined with an
   * extra space character, and 0 for un-tokenized fields.
   * This method is only called if the field
   * produced at least one token for indexing.
   *
   * @param field the field just indexed
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   */
  public int getOffsetGap(Fieldable field) {
    if (field.isTokenized())
      return 1;
    else
      return 0;
  }

  /** Frees persistent resources used by this Analyzer */
  public void close() {
    tokenStreams.close();
    tokenStreams = null;
  }
}
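A minimal sketch (not part of this commit) of the reuse protocol described by reusableTokenStream, getPreviousTokenStream and setPreviousTokenStream; the WhitespaceTokenizer used here is the core tokenizer of that name:

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical analyzer that caches one Tokenizer per thread.
class ReusableSketchAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new WhitespaceTokenizer(reader);
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
      tokenizer = new WhitespaceTokenizer(reader);
      setPreviousTokenStream(tokenizer);   // saved in the per-thread slot
    } else {
      tokenizer.reset(reader);             // reuse the instance saved by this thread
    }
    return tokenizer;
  }
}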
@ -0,0 +1,93 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.util.ArrayList;
import java.util.List;

/**
 * Base utility class for implementing a {@link CharFilter}.
 * You subclass this, and then record mappings by calling
 * {@link #addOffCorrectMap}, and then invoke the correct
 * method to correct an offset.
 *
 * <p><b>NOTE</b>: This class is not particularly efficient.
 * For example, a new class instance is created for every
 * call to {@link #addOffCorrectMap}, which is then appended
 * to a private list.
 */
public abstract class BaseCharFilter extends CharFilter {

  private List<OffCorrectMap> pcmList;

  public BaseCharFilter(CharStream in) {
    super(in);
  }

  /** Retrieve the corrected offset.  Note that this method
   *  is slow if you correct positions far before the most
   *  recently added position, as it's a simple linear
   *  search backwards through all offset corrections added
   *  by {@link #addOffCorrectMap}. */
  @Override
  protected int correct(int currentOff) {
    if (pcmList == null || pcmList.isEmpty()) {
      return currentOff;
    }
    for (int i = pcmList.size() - 1; i >= 0; i--) {
      if (currentOff >= pcmList.get(i).off) {
        return currentOff + pcmList.get(i).cumulativeDiff;
      }
    }
    return currentOff;
  }

  protected int getLastCumulativeDiff() {
    return pcmList == null || pcmList.isEmpty() ?
      0 : pcmList.get(pcmList.size() - 1).cumulativeDiff;
  }

  protected void addOffCorrectMap(int off, int cumulativeDiff) {
    if (pcmList == null) {
      pcmList = new ArrayList<OffCorrectMap>();
    }
    pcmList.add(new OffCorrectMap(off, cumulativeDiff));
  }

  static class OffCorrectMap {

    int off;
    int cumulativeDiff;

    OffCorrectMap(int off, int cumulativeDiff) {
      this.off = off;
      this.cumulativeDiff = cumulativeDiff;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append('(');
      sb.append(off);
      sb.append(',');
      sb.append(cumulativeDiff);
      sb.append(')');
      return sb.toString();
    }
  }
}
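A sketch (not part of this commit) of a BaseCharFilter subclass; the carriage-return stripping it performs is an illustrative assumption, chosen only to show addOffCorrectMap and getLastCumulativeDiff in use:

import java.io.IOException;

import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;

// Hypothetical filter that removes '\r' and records one offset correction
// per removed character, so downstream offsets map back to the raw input.
class CarriageReturnStripFilter extends BaseCharFilter {
  private int outOff = 0;             // offset in the corrected (output) stream

  CarriageReturnStripFilter(CharStream in) {
    super(in);
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    char[] tmp = new char[len];
    int written = 0;
    while (written == 0) {            // never return 0 from a Reader read()
      int n = input.read(tmp, 0, len);
      if (n == -1)
        return -1;                    // nothing buffered here, safe to signal EOF
      for (int i = 0; i < n; i++) {
        if (tmp[i] == '\r') {
          // one input char dropped: from this output offset on, offsets map
          // one character further into the original input
          addOffCorrectMap(outOff + written, getLastCumulativeDiff() + 1);
        } else {
          cbuf[off + written++] = tmp[i];
        }
      }
    }
    outOff += written;
    return written;
  }
}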
@ -0,0 +1,86 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.util.AttributeSource;

/**
 * This class can be used if the token attributes of a TokenStream
 * are intended to be consumed more than once. It caches
 * all token attribute states locally in a List.
 *
 * <P>CachingTokenFilter implements the optional method
 * {@link TokenStream#reset()}, which repositions the
 * stream to the first Token.
 */
public final class CachingTokenFilter extends TokenFilter {
  private List<AttributeSource.State> cache = null;
  private Iterator<AttributeSource.State> iterator = null;
  private AttributeSource.State finalState;

  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (cache == null) {
      // fill cache lazily
      cache = new LinkedList<AttributeSource.State>();
      fillCache();
      iterator = cache.iterator();
    }

    if (!iterator.hasNext()) {
      // the cache is exhausted, return false
      return false;
    }
    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
    restoreState(iterator.next());
    return true;
  }

  @Override
  public final void end() throws IOException {
    if (finalState != null) {
      restoreState(finalState);
    }
  }

  @Override
  public void reset() throws IOException {
    if (cache != null) {
      iterator = cache.iterator();
    }
  }

  private void fillCache() throws IOException {
    while (input.incrementToken()) {
      cache.add(captureState());
    }
    // capture final state
    input.end();
    finalState = captureState();
  }

}
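A usage sketch (not part of this commit): the first pass fills the cache, reset() rewinds, and a second pass replays the captured states; the input text is illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical double consumption of one token stream.
class CachingSketch {
  public static void main(String[] args) throws Exception {
    TokenStream source = new WhitespaceTokenizer(new StringReader("one two three"));
    CachingTokenFilter cached = new CachingTokenFilter(source);

    while (cached.incrementToken()) { /* first consumer, fills the cache */ }
    cached.reset();                   // repositions to the first cached state
    while (cached.incrementToken()) { /* second consumer, served from the cache */ }
  }
}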
@ -0,0 +1,390 @@
package org.apache.lucene.analysis;

import java.util.AbstractSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/**
 * A simple class that stores Strings as char[]'s in a
 * hash table.  Note that this is not a general purpose
 * class.  For example, it cannot remove items from the
 * set, nor does it resize its hash table to be smaller,
 * etc.  It is designed to be quick to test if a char[]
 * is in the set without the necessity of converting it
 * to a String first.
 * <P>
 * <em>Please note:</em> This class implements {@link java.util.Set Set} but
 * does not behave like it should in all cases. The generic type is
 * {@code Set<Object>}, because you can add any object that has a
 * string representation. The add methods will use
 * {@link Object#toString} and store the result using a {@code char[]}
 * buffer. The {@code contains()} methods behave the same way.
 * The {@link #iterator()} returns an {@code Iterator<String>}.
 * For type safety, {@link #stringIterator()} is also provided.
 */

public class CharArraySet extends AbstractSet<Object> {
  private final static int INIT_SIZE = 8;
  private char[][] entries;
  private int count;
  private final boolean ignoreCase;
  public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));

  /** Create set with enough capacity to hold startSize
   *  terms */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    while (startSize + (startSize>>2) > size)
      size <<= 1;
    entries = new char[size][];
  }

  /** Create set from a Collection of char[] or String */
  public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
    this(c.size(), ignoreCase);
    addAll(c);
  }

  /** Create set from entries */
  private CharArraySet(char[][] entries, boolean ignoreCase, int count) {
    this.entries = entries;
    this.ignoreCase = ignoreCase;
    this.count = count;
  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   * are in the set */
  public boolean contains(char[] text, int off, int len) {
    return entries[getSlot(text, off, len)] != null;
  }

  /** true if the <code>CharSequence</code> is in the set */
  public boolean contains(CharSequence cs) {
    return entries[getSlot(cs)] != null;
  }

  private int getSlot(char[] text, int off, int len) {
    int code = getHashCode(text, off, len);
    int pos = code & (entries.length-1);
    char[] text2 = entries[pos];
    if (text2 != null && !equals(text, off, len, text2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (entries.length-1);
        text2 = entries[pos];
      } while (text2 != null && !equals(text, off, len, text2));
    }
    return pos;
  }

  /** Returns the hash slot for the given CharSequence */
  private int getSlot(CharSequence text) {
    int code = getHashCode(text);
    int pos = code & (entries.length-1);
    char[] text2 = entries[pos];
    if (text2 != null && !equals(text, text2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (entries.length-1);
        text2 = entries[pos];
      } while (text2 != null && !equals(text, text2));
    }
    return pos;
  }

  /** Add this CharSequence into the set */
  public boolean add(CharSequence text) {
    return add(text.toString()); // could be more efficient
  }

  /** Add this String into the set */
  public boolean add(String text) {
    return add(text.toCharArray());
  }

  /** Add this char[] directly to the set.
   * If ignoreCase is true for this Set, the text array will be directly modified.
   * The user should never modify this text array after calling this method.
   */
  public boolean add(char[] text) {
    if (ignoreCase)
      for (int i=0;i<text.length;i++)
        text[i] = Character.toLowerCase(text[i]);
    int slot = getSlot(text, 0, text.length);
    if (entries[slot] != null) return false;
    entries[slot] = text;
    count++;

    if (count + (count>>2) > entries.length) {
      rehash();
    }

    return true;
  }

  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for (int i=0;i<len;i++) {
        if (Character.toLowerCase(text1[off+i]) != text2[i])
          return false;
      }
    } else {
      for (int i=0;i<len;i++) {
        if (text1[off+i] != text2[i])
          return false;
      }
    }
    return true;
  }

  private boolean equals(CharSequence text1, char[] text2) {
    int len = text1.length();
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for (int i=0;i<len;i++) {
        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
          return false;
      }
    } else {
      for (int i=0;i<len;i++) {
        if (text1.charAt(i) != text2[i])
          return false;
      }
    }
    return true;
  }

  private void rehash() {
    final int newSize = 2*entries.length;
    char[][] oldEntries = entries;
    entries = new char[newSize][];

    for (int i=0;i<oldEntries.length;i++) {
      char[] text = oldEntries[i];
      if (text != null) {
        // todo: could be faster... no need to compare strings on collision
        entries[getSlot(text,0,text.length)] = text;
      }
    }
  }

  private int getHashCode(char[] text, int offset, int len) {
    int code = 0;
    final int stop = offset + len;
    if (ignoreCase) {
      for (int i=offset; i<stop; i++) {
        code = code*31 + Character.toLowerCase(text[i]);
      }
    } else {
      for (int i=offset; i<stop; i++) {
        code = code*31 + text[i];
      }
    }
    return code;
  }

  private int getHashCode(CharSequence text) {
    int code = 0;
    int len = text.length();
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text.charAt(i));
      }
    } else {
      for (int i=0; i<len; i++) {
        code = code*31 + text.charAt(i);
      }
    }
    return code;
  }


  @Override
  public int size() {
    return count;
  }

  @Override
  public boolean isEmpty() {
    return count==0;
  }

  @Override
  public boolean contains(Object o) {
    if (o instanceof char[]) {
      final char[] text = (char[])o;
      return contains(text, 0, text.length);
    }
    return contains(o.toString());
  }

  @Override
  public boolean add(Object o) {
    if (o instanceof char[]) {
      return add((char[])o);
    }
    return add(o.toString());
  }

  /**
   * Returns an unmodifiable {@link CharArraySet}. This allows providing
   * unmodifiable views of internal sets for "read-only" use.
   *
   * @param set
   *          a set for which the unmodifiable set is returned.
   * @return a new unmodifiable {@link CharArraySet}.
   * @throws NullPointerException
   *           if the given set is <code>null</code>.
   */
  public static CharArraySet unmodifiableSet(CharArraySet set) {
    if (set == null)
      throw new NullPointerException("Given set is null");
    if (set == EMPTY_SET)
      return EMPTY_SET;
    if (set instanceof UnmodifiableCharArraySet)
      return set;

    /*
     * Instead of delegating calls to the given set copy the low-level values to
     * the unmodifiable Subclass
     */
    return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
  }

  /**
   * Returns a copy of the given set as a {@link CharArraySet}. If the given set
   * is a {@link CharArraySet} the ignoreCase property will be preserved.
   *
   * @param set
   *          a set to copy
   * @return a copy of the given set as a {@link CharArraySet}. If the given set
   *         is a {@link CharArraySet} the ignoreCase property will be
   *         preserved.
   */
  public static CharArraySet copy(Set<?> set) {
    if (set == null)
      throw new NullPointerException("Given set is null");
    if (set == EMPTY_SET)
      return EMPTY_SET;
    final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
        : false;
    return new CharArraySet(set, ignoreCase);
  }


  /** The Iterator<String> for this set.  Strings are constructed on the fly, so
   * use <code>nextCharArray</code> for more efficient access. */
  public class CharArraySetIterator implements Iterator<String> {
    int pos=-1;
    char[] next;
    CharArraySetIterator() {
      goNext();
    }

    private void goNext() {
      next = null;
      pos++;
      while (pos < entries.length && (next=entries[pos]) == null) pos++;
    }

    public boolean hasNext() {
      return next != null;
    }

    /** do not modify the returned char[] */
    public char[] nextCharArray() {
      char[] ret = next;
      goNext();
      return ret;
    }

    /** Returns the next String, as a Set<String> would...
     * use nextCharArray() for better efficiency. */
    public String next() {
      return new String(nextCharArray());
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /** returns an iterator of newly allocated Strings */
  public Iterator<String> stringIterator() {
    return new CharArraySetIterator();
  }

  /** returns an iterator of newly allocated Strings; this method violates the Set interface */
  @Override
  @SuppressWarnings("unchecked")
  public Iterator<Object> iterator() {
    return (Iterator) stringIterator();
  }

  /**
   * Efficient unmodifiable {@link CharArraySet}. This implementation does not
   * delegate calls to a given {@link CharArraySet} like
   * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead it passes
   * the internal representation of a {@link CharArraySet} to a super
   * constructor and overrides all mutators.
   */
  private static final class UnmodifiableCharArraySet extends CharArraySet {

    private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
        int count) {
      super(entries, ignoreCase, count);
    }

    @Override
    public boolean add(Object o) {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean addAll(Collection<? extends Object> coll) {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean add(char[] text) {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean add(CharSequence text) {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean add(String text) {
      throw new UnsupportedOperationException();
    }
  }

}
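A usage sketch (not part of this commit) of the allocation-free membership test the class is designed for; the stop words are illustrative:

import org.apache.lucene.analysis.CharArraySet;

// Hypothetical case-insensitive check on a char[] slice, with no new String.
class CharArraySetSketch {
  public static void main(String[] args) {
    CharArraySet stopWords = new CharArraySet(16, true);   // ignoreCase = true
    stopWords.add("the");
    stopWords.add("and");

    char[] buffer = "The".toCharArray();
    boolean isStop = stopWords.contains(buffer, 0, buffer.length);
    System.out.println(isStop);                            // true
  }
}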
@ -0,0 +1,82 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;

/**
 * Subclasses of CharFilter can be chained to filter a CharStream.
 * They can be used as a {@link java.io.Reader} with additional offset
 * correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
 * if a CharFilter/CharStream subclass is used.
 *
 * @version $Id$
 *
 */
public abstract class CharFilter extends CharStream {

  protected CharStream input;

  protected CharFilter(CharStream in) {
    input = in;
  }

  /**
   * Subclasses may want to override this to correct the current offset.
   *
   * @param currentOff current offset
   * @return corrected offset
   */
  protected int correct(int currentOff) {
    return currentOff;
  }

  /**
   * Chains the corrected offset through the input
   * CharFilter.
   */
  @Override
  public final int correctOffset(int currentOff) {
    return input.correctOffset(correct(currentOff));
  }

  @Override
  public void close() throws IOException {
    input.close();
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return input.read(cbuf, off, len);
  }

  @Override
  public boolean markSupported() {
    return input.markSupported();
  }

  @Override
  public void mark(int readAheadLimit) throws IOException {
    input.mark(readAheadLimit);
  }

  @Override
  public void reset() throws IOException {
    input.reset();
  }
}
@ -0,0 +1,71 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Reader;

/**
 * CharReader is a Reader wrapper. It reads chars from a
 * Reader and outputs a {@link CharStream}, defining an
 * identity function {@link #correctOffset} method that
 * simply returns the provided offset.
 */
public final class CharReader extends CharStream {

  protected Reader input;

  public static CharStream get(Reader input) {
    return input instanceof CharStream ?
      (CharStream)input : new CharReader(input);
  }

  private CharReader(Reader in) {
    input = in;
  }

  @Override
  public int correctOffset(int currentOff) {
    return currentOff;
  }

  @Override
  public void close() throws IOException {
    input.close();
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return input.read(cbuf, off, len);
  }

  @Override
  public boolean markSupported() {
    return input.markSupported();
  }

  @Override
  public void mark(int readAheadLimit) throws IOException {
    input.mark(readAheadLimit);
  }

  @Override
  public void reset() throws IOException {
    input.reset();
  }
}
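A chaining sketch (not part of this commit): a plain Reader is wrapped via CharReader.get, filtered, then tokenized, so reported offsets refer to the original input. CarriageReturnStripFilter is the hypothetical filter sketched after BaseCharFilter above.

import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical chain: Reader -> CharReader -> CharFilter -> Tokenizer.
class CharReaderSketch {
  public static void main(String[] args) {
    CharStream stream = CharReader.get(new StringReader("a\r\nb"));
    stream = new CarriageReturnStripFilter(stream);       // sketch from above
    Tokenizer tokenizer = new WhitespaceTokenizer(stream); // CharStream extends Reader
    // Offsets reported by the tokenizer pass through correctOffset() and
    // therefore refer to positions in the original, unfiltered input.
  }
}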
@ -0,0 +1,41 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.Reader;

/**
 * CharStream adds {@link #correctOffset}
 * functionality over {@link Reader}.  All Tokenizers accept a
 * CharStream instead of {@link Reader} as input, which enables
 * arbitrary character based filtering before tokenization.
 * The {@link #correctOffset} method fixes offsets to account for
 * removal or insertion of characters, so that the offsets
 * reported in the tokens match the character offsets of the
 * original Reader.
 */
public abstract class CharStream extends Reader {

  /**
   * Called by CharFilter(s) and Tokenizer to correct token offset.
   *
   * @param currentOff offset as seen in the output
   * @return corrected offset based on the input
   */
  public abstract int correctOffset(int currentOff);
}
@ -0,0 +1,126 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/** An abstract base class for simple, character-oriented tokenizers. */
public abstract class CharTokenizer extends Tokenizer {
  public CharTokenizer(Reader input) {
    super(input);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(TermAttribute.class);
  }

  public CharTokenizer(AttributeSource source, Reader input) {
    super(source, input);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(TermAttribute.class);
  }

  public CharTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(TermAttribute.class);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;

  /** Returns true iff a character should be included in a token.  This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate.  Characters for which this is false are used to
   * define token boundaries and are not included in tokens. */
  protected abstract boolean isTokenChar(char c);

  /** Called on each token character to normalize it before it is added to the
   * token.  The default implementation does nothing.  Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = termAtt.termBuffer();
    while (true) {

      if (bufferIndex >= dataLen) {
        offset += dataLen;
        dataLen = input.read(ioBuffer);
        if (dataLen == -1) {
          dataLen = 0;                            // so next offset += dataLen won't decrement offset
          if (length > 0)
            break;
          else
            return false;
        }
        bufferIndex = 0;
      }

      final char c = ioBuffer[bufferIndex++];

      if (isTokenChar(c)) {               // if it's a token char

        if (length == 0)                  // start of token
          start = offset + bufferIndex - 1;
        else if (length == buffer.length)
          buffer = termAtt.resizeTermBuffer(1+length);

        buffer[length++] = normalize(c);  // buffer it, normalized

        if (length == MAX_WORD_LEN)       // buffer overflow!
          break;

      } else if (length > 0)              // at non-Letter w/ chars
        break;                            // return 'em
    }

    termAtt.setTermLength(length);
    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
    return true;
  }

  @Override
  public final void end() {
    // set final offset
    int finalOffset = correctOffset(offset);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
  }
}
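A subclass sketch (not part of this commit) showing the two extension points, isTokenChar and normalize, in the spirit of the core LowerCaseTokenizer:

import java.io.Reader;

import org.apache.lucene.analysis.CharTokenizer;

// Tokens are maximal runs of letters, lowercased one character at a time.
class LetterLowerCaseSketchTokenizer extends CharTokenizer {
  LetterLowerCaseSketchTokenizer(Reader in) {
    super(in);
  }

  @Override
  protected boolean isTokenChar(char c) {
    return Character.isLetter(c);       // non-letters end the current token
  }

  @Override
  protected char normalize(char c) {
    return Character.toLowerCase(c);    // applied before the char is buffered
  }
}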
@ -0,0 +1,260 @@
package org.apache.lucene.analysis;

import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A filter that replaces accented characters in the ISO Latin 1 character set
 * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
 * <p>
 * For instance, 'à' will be replaced by 'a'.
 * <p>
 *
 * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
 *  which covers a superset of Latin 1.
 *  This class is included for use with existing
 *  indexes and will be removed in a future release (possibly Lucene 4.0).
 */
public final class ISOLatin1AccentFilter extends TokenFilter {
  public ISOLatin1AccentFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
  }

  private char[] output = new char[256];
  private int outputPos;
  private TermAttribute termAtt;

  @Override
  public final boolean incrementToken() throws java.io.IOException {
    if (input.incrementToken()) {
      final char[] buffer = termAtt.termBuffer();
      final int length = termAtt.termLength();
      // If no characters actually require rewriting then we
      // just return token as-is:
      for (int i = 0; i < length; i++) {
        final char c = buffer[i];
        if (c >= '\u00c0' && c <= '\uFB06') {
          removeAccents(buffer, length);
          termAtt.setTermBuffer(output, 0, outputPos);
          break;
        }
      }
      return true;
    } else
      return false;
  }

  /**
   * Replaces accented characters in a String by their unaccented equivalents.
   */
  public final void removeAccents(char[] input, int length) {

    // Worst-case length required:
    final int maxSizeNeeded = 2*length;

    int size = output.length;
    while (size < maxSizeNeeded)
      size *= 2;

    if (size != output.length)
      output = new char[size];

    outputPos = 0;

    int pos = 0;

    for (int i = 0; i < length; i++, pos++) {
      final char c = input[pos];

      // Quick test: if it's not in range then just keep
      // current character
      if (c < '\u00c0' || c > '\uFB06')
        output[outputPos++] = c;
      else {
        switch (c) {
          case '\u00C0' : // À
          case '\u00C1' : // Á
          case '\u00C2' : // Â
          case '\u00C3' : // Ã
          case '\u00C4' : // Ä
          case '\u00C5' : // Å
            output[outputPos++] = 'A';
            break;
          case '\u00C6' : // Æ
            output[outputPos++] = 'A';
            output[outputPos++] = 'E';
            break;
          case '\u00C7' : // Ç
            output[outputPos++] = 'C';
            break;
          case '\u00C8' : // È
          case '\u00C9' : // É
          case '\u00CA' : // Ê
          case '\u00CB' : // Ë
            output[outputPos++] = 'E';
            break;
          case '\u00CC' : // Ì
          case '\u00CD' : // Í
          case '\u00CE' : // Î
          case '\u00CF' : // Ï
            output[outputPos++] = 'I';
            break;
          case '\u0132' : // IJ
            output[outputPos++] = 'I';
            output[outputPos++] = 'J';
            break;
          case '\u00D0' : // Ð
            output[outputPos++] = 'D';
            break;
          case '\u00D1' : // Ñ
            output[outputPos++] = 'N';
            break;
          case '\u00D2' : // Ò
          case '\u00D3' : // Ó
          case '\u00D4' : // Ô
          case '\u00D5' : // Õ
          case '\u00D6' : // Ö
          case '\u00D8' : // Ø
            output[outputPos++] = 'O';
            break;
          case '\u0152' : // Œ
            output[outputPos++] = 'O';
            output[outputPos++] = 'E';
            break;
          case '\u00DE' : // Þ
            output[outputPos++] = 'T';
            output[outputPos++] = 'H';
            break;
          case '\u00D9' : // Ù
          case '\u00DA' : // Ú
          case '\u00DB' : // Û
          case '\u00DC' : // Ü
            output[outputPos++] = 'U';
            break;
          case '\u00DD' : // Ý
          case '\u0178' : // Ÿ
            output[outputPos++] = 'Y';
            break;
          case '\u00E0' : // à
          case '\u00E1' : // á
          case '\u00E2' : // â
          case '\u00E3' : // ã
          case '\u00E4' : // ä
          case '\u00E5' : // å
            output[outputPos++] = 'a';
            break;
          case '\u00E6' : // æ
            output[outputPos++] = 'a';
            output[outputPos++] = 'e';
            break;
          case '\u00E7' : // ç
            output[outputPos++] = 'c';
            break;
          case '\u00E8' : // è
          case '\u00E9' : // é
          case '\u00EA' : // ê
          case '\u00EB' : // ë
            output[outputPos++] = 'e';
            break;
          case '\u00EC' : // ì
          case '\u00ED' : // í
          case '\u00EE' : // î
          case '\u00EF' : // ï
            output[outputPos++] = 'i';
            break;
          case '\u0133' : // ij
            output[outputPos++] = 'i';
            output[outputPos++] = 'j';
            break;
          case '\u00F0' : // ð
            output[outputPos++] = 'd';
            break;
          case '\u00F1' : // ñ
            output[outputPos++] = 'n';
            break;
          case '\u00F2' : // ò
          case '\u00F3' : // ó
          case '\u00F4' : // ô
          case '\u00F5' : // õ
          case '\u00F6' : // ö
          case '\u00F8' : // ø
            output[outputPos++] = 'o';
            break;
          case '\u0153' : // œ
            output[outputPos++] = 'o';
            output[outputPos++] = 'e';
            break;
          case '\u00DF' : // ß
            output[outputPos++] = 's';
            output[outputPos++] = 's';
            break;
          case '\u00FE' : // þ
            output[outputPos++] = 't';
            output[outputPos++] = 'h';
            break;
          case '\u00F9' : // ù
          case '\u00FA' : // ú
          case '\u00FB' : // û
          case '\u00FC' : // ü
            output[outputPos++] = 'u';
            break;
          case '\u00FD' : // ý
          case '\u00FF' : // ÿ
            output[outputPos++] = 'y';
            break;
          case '\uFB00': // ff
            output[outputPos++] = 'f';
            output[outputPos++] = 'f';
            break;
          case '\uFB01': // fi
            output[outputPos++] = 'f';
            output[outputPos++] = 'i';
            break;
          case '\uFB02': // fl
            output[outputPos++] = 'f';
            output[outputPos++] = 'l';
            break;
          // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
          // case '\uFB03': // ffi
          //   output[outputPos++] = 'f';
          //   output[outputPos++] = 'f';
          //   output[outputPos++] = 'i';
          //   break;
          // case '\uFB04': // ffl
          //   output[outputPos++] = 'f';
          //   output[outputPos++] = 'f';
          //   output[outputPos++] = 'l';
          //   break;
          case '\uFB05': // ſt
            output[outputPos++] = 'f';
            output[outputPos++] = 't';
            break;
          case '\uFB06': // st
            output[outputPos++] = 's';
            output[outputPos++] = 't';
            break;
          default :
            output[outputPos++] = c;
            break;
        }
      }
    }
  }
}
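A usage sketch (not part of this commit); per the deprecation note above, new indexes should prefer ASCIIFoldingFilter, and the input text is illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical chain that folds Latin-1 accents after tokenization.
class AccentFilterSketch {
  public static void main(String[] args) {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("déjà vu"));
    ts = new ISOLatin1AccentFilter(ts);   // "déjà" becomes "deja"
  }
}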
@@ -0,0 +1,53 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

/**
 * "Tokenizes" the entire stream as a single token. This is useful
 * for data like zip codes, ids, and some product names.
 */
public class KeywordAnalyzer extends Analyzer {
  public KeywordAnalyzer() {
    setOverridesTokenStreamMethod(KeywordAnalyzer.class);
  }

  @Override
  public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new KeywordTokenizer(reader);
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, final Reader reader) throws IOException {
    if (overridesTokenStreamMethod) {
      // LUCENE-1678: force fallback to tokenStream() if we
      // have been subclassed and that subclass overrides
      // tokenStream but not reusableTokenStream
      return tokenStream(fieldName, reader);
    }
    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
      tokenizer = new KeywordTokenizer(reader);
      setPreviousTokenStream(tokenizer);
    } else
      tokenizer.reset(reader);
    return tokenizer;
  }
}
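A hedged usage sketch for the KeywordAnalyzer above (the demo class and the "sku" field name are invented for illustration): the entire input surfaces as exactly one token, which is what makes it suitable for ids and product codes.

import java.io.StringReader;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class KeywordAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // The analyzer emits the whole input, untokenized and unmodified.
    TokenStream ts = new KeywordAnalyzer()
        .tokenStream("sku", new StringReader("AB-1234 rev 2"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println("[" + term.term() + "]");   // prints: [AB-1234 rev 2]
    }
  }
}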
@@ -0,0 +1,98 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Emits the entire input as a single token.
 */
public final class KeywordTokenizer extends Tokenizer {

  private static final int DEFAULT_BUFFER_SIZE = 256;

  private boolean done;
  private int finalOffset;
  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;

  public KeywordTokenizer(Reader input) {
    this(input, DEFAULT_BUFFER_SIZE);
  }

  public KeywordTokenizer(Reader input, int bufferSize) {
    super(input);
    init(bufferSize);
  }

  public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
    super(source, input);
    init(bufferSize);
  }

  public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
    super(factory, input);
    init(bufferSize);
  }

  private void init(int bufferSize) {
    this.done = false;
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt.resizeTermBuffer(bufferSize);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (!done) {
      clearAttributes();
      done = true;
      int upto = 0;
      char[] buffer = termAtt.termBuffer();
      while (true) {
        final int length = input.read(buffer, upto, buffer.length-upto);
        if (length == -1) break;
        upto += length;
        if (upto == buffer.length)
          buffer = termAtt.resizeTermBuffer(1+buffer.length);
      }
      termAtt.setTermLength(upto);
      finalOffset = correctOffset(upto);
      offsetAtt.setOffset(correctOffset(0), finalOffset);
      return true;
    }
    return false;
  }

  @Override
  public final void end() {
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    this.done = false;
  }
}
@@ -0,0 +1,62 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Removes words that are too long or too short from the stream.
 */
public final class LengthFilter extends TokenFilter {

  final int min;
  final int max;

  private TermAttribute termAtt;

  /**
   * Build a filter that removes words that are too long or too
   * short from the text.
   */
  public LengthFilter(TokenStream in, int min, int max)
  {
    super(in);
    this.min = min;
    this.max = max;
    termAtt = addAttribute(TermAttribute.class);
  }

  /**
   * Returns the next input Token whose term() has the right length.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // return the first token whose length is within bounds
    while (input.incrementToken()) {
      int len = termAtt.termLength();
      if (len >= min && len <= max) {
        return true;
      }
      // note: else we ignore it but should we index each part of it?
    }
    // reached EOS -- return false
    return false;
  }
}
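To see LengthFilter in a chain, a small sketch (the class name, bounds, and sample text are illustrative); it simply drops any term whose length falls outside [min, max]:

import java.io.StringReader;
import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class LengthFilterDemo {
  public static void main(String[] args) throws Exception {
    // Keep only terms between 3 and 7 characters long.
    TokenStream ts = new LengthFilter(
        new LowerCaseTokenizer(new StringReader("A quick extraordinarily brown fox")), 3, 7);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // quick, brown, fox ("a" too short, "extraordinarily" too long)
    }
  }
}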
@@ -0,0 +1,53 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;

/** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
    to say, it defines tokens as maximal strings of adjacent letters, as defined
    by java.lang.Character.isLetter() predicate.

    Note: this does a decent job for most European languages, but does a terrible
    job for some Asian languages, where words are not separated by spaces. */

public class LetterTokenizer extends CharTokenizer {
  /** Construct a new LetterTokenizer. */
  public LetterTokenizer(Reader in) {
    super(in);
  }

  /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
  public LetterTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
  public LetterTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Collects only characters which satisfy
   * {@link Character#isLetter(char)}.*/
  @Override
  protected boolean isTokenChar(char c) {
    return Character.isLetter(c);
  }
}
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Normalizes token text to lower case.
 */
public final class LowerCaseFilter extends TokenFilter {
  private TermAttribute termAtt;

  public LowerCaseFilter(TokenStream in) {
    super(in);
    termAtt = addAttribute(TermAttribute.class);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final char[] buffer = termAtt.termBuffer();
      final int length = termAtt.termLength();
      for(int i=0;i<length;i++)
        buffer[i] = Character.toLowerCase(buffer[i]);
      return true;
    } else
      return false;
  }
}
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;

/**
 * LowerCaseTokenizer performs the function of LetterTokenizer
 * and LowerCaseFilter together.  It divides text at non-letters and converts
 * them to lower case.  While it is functionally equivalent to the combination
 * of LetterTokenizer and LowerCaseFilter, there is a performance advantage
 * to doing the two tasks at once, hence this (redundant) implementation.
 * <P>
 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces.
 */
public final class LowerCaseTokenizer extends LetterTokenizer {
  /** Construct a new LowerCaseTokenizer. */
  public LowerCaseTokenizer(Reader in) {
    super(in);
  }

  /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
  public LowerCaseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
  public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Converts char to lower case
   * {@link Character#toLowerCase(char)}.*/
  @Override
  protected char normalize(char c) {
    return Character.toLowerCase(c);
  }
}
@@ -0,0 +1,137 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;

/**
 * Simplistic {@link CharFilter} that applies the mappings
 * contained in a {@link NormalizeCharMap} to the character
 * stream, and corrects the resulting changes to the
 * offsets.
 */
public class MappingCharFilter extends BaseCharFilter {

  private final NormalizeCharMap normMap;
  private LinkedList<Character> buffer;
  private String replacement;
  private int charPointer;
  private int nextCharCounter;

  /** Default constructor that takes a {@link CharStream}. */
  public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
    super(in);
    this.normMap = normMap;
  }

  /** Easy-use constructor that takes a {@link Reader}. */
  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
    super(CharReader.get(in));
    this.normMap = normMap;
  }

  @Override
  public int read() throws IOException {
    while(true) {
      if (replacement != null && charPointer < replacement.length()) {
        return replacement.charAt(charPointer++);
      }

      int firstChar = nextChar();
      if (firstChar == -1) return -1;
      NormalizeCharMap nm = normMap.submap != null ?
        normMap.submap.get(Character.valueOf((char) firstChar)) : null;
      if (nm == null) return firstChar;
      NormalizeCharMap result = match(nm);
      if (result == null) return firstChar;
      replacement = result.normStr;
      charPointer = 0;
      if (result.diff != 0) {
        int prevCumulativeDiff = getLastCumulativeDiff();
        if (result.diff < 0) {
          for(int i = 0; i < -result.diff ; i++)
            addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
        } else {
          addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
        }
      }
    }
  }

  private int nextChar() throws IOException {
    nextCharCounter++;
    if (buffer != null && !buffer.isEmpty()) {
      return buffer.removeFirst().charValue();
    }
    return input.read();
  }

  private void pushChar(int c) {
    nextCharCounter--;
    if(buffer == null)
      buffer = new LinkedList<Character>();
    buffer.addFirst(Character.valueOf((char) c));
  }

  private void pushLastChar(int c) {
    if (buffer == null) {
      buffer = new LinkedList<Character>();
    }
    buffer.addLast(Character.valueOf((char) c));
  }

  private NormalizeCharMap match(NormalizeCharMap map) throws IOException {
    NormalizeCharMap result = null;
    if (map.submap != null) {
      int chr = nextChar();
      if (chr != -1) {
        NormalizeCharMap subMap = map.submap.get(Character.valueOf((char) chr));
        if (subMap != null) {
          result = match(subMap);
        }
        if (result == null) {
          pushChar(chr);
        }
      }
    }
    if (result == null && map.normStr != null) {
      result = map;
    }
    return result;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    char[] tmp = new char[len];
    int l = input.read(tmp, 0, len);
    if (l != -1) {
      for(int i = 0; i < l; i++)
        pushLastChar(tmp[i]);
    }
    l = 0;
    for(int i = off; i < off + len; i++) {
      int c = read();
      if (c == -1) break;
      cbuf[i] = (char) c;
      l++;
    }
    return l == 0 ? -1 : l;
  }
}
@@ -0,0 +1,61 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.util.HashMap;
import java.util.Map;

/**
 * Holds a map of String input to String output, to be used
 * with {@link MappingCharFilter}.
 */
public class NormalizeCharMap {

  Map<Character, NormalizeCharMap> submap;
  String normStr;
  int diff;

  /** Records a replacement to be applied to the input
   *  stream.  Whenever <code>singleMatch</code> occurs in
   *  the input, it will be replaced with
   *  <code>replacement</code>.
   *
   * @param singleMatch input String to be replaced
   * @param replacement output String
   */
  public void add(String singleMatch, String replacement) {
    NormalizeCharMap currMap = this;
    for(int i = 0; i < singleMatch.length(); i++) {
      char c = singleMatch.charAt(i);
      if (currMap.submap == null) {
        currMap.submap = new HashMap<Character, NormalizeCharMap>(1);
      }
      NormalizeCharMap map = currMap.submap.get(Character.valueOf(c));
      if (map == null) {
        map = new NormalizeCharMap();
        currMap.submap.put(Character.valueOf(c), map);
      }
      currMap = map;
    }
    if (currMap.normStr != null) {
      throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch);
    }
    currMap.normStr = replacement;
    currMap.diff = singleMatch.length() - replacement.length();
  }
}
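A sketch exercising NormalizeCharMap together with the MappingCharFilter above (the mappings, class name, and sample text are illustrative; WhitespaceTokenizer is assumed from core). Note the two offset branches: "&" -> "and" grows the stream (diff < 0), while "ph" -> "f" shrinks it (diff > 0):

import java.io.StringReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class MappingCharFilterDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("&", "and");   // expansion: replacement longer than match
    map.add("ph", "f");    // contraction: replacement shorter than match

    // The char filter rewrites the raw character stream before tokenization.
    CharStream cs = new MappingCharFilter(map, new StringReader("fish & phones"));
    TokenStream ts = new WhitespaceTokenizer(cs);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // fish, and, fones
    }
  }
}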
@@ -0,0 +1,252 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.search.NumericRangeFilter; // for javadocs
import org.apache.lucene.search.SortField; // for javadocs
import org.apache.lucene.search.FieldCache; // javadocs
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * <b>Expert:</b> This class provides a {@link TokenStream}
 * for indexing numeric values that can be used by {@link
 * NumericRangeQuery} or {@link NumericRangeFilter}.
 *
 * <p>Note that for simple usage, {@link NumericField} is
 * recommended. {@link NumericField} disables norms and
 * term freqs, as they are not usually needed during
 * searching. If you need to change these settings, you
 * should use this class.
 *
 * <p>See {@link NumericField} for capabilities of fields
 * indexed numerically.</p>
 *
 * <p>Here's an example usage, for an <code>int</code> field:
 *
 * <pre>
 *  Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value));
 *  field.setOmitNorms(true);
 *  field.setOmitTermFreqAndPositions(true);
 *  document.add(field);
 * </pre>
 *
 * <p>For optimal performance, re-use the TokenStream and Field instance
 * for more than one document:
 *
 * <pre>
 *  NumericTokenStream stream = new NumericTokenStream(precisionStep);
 *  Field field = new Field(name, stream);
 *  field.setOmitNorms(true);
 *  field.setOmitTermFreqAndPositions(true);
 *  Document document = new Document();
 *  document.add(field);
 *
 *  for(all documents) {
 *    stream.setIntValue(value)
 *    writer.addDocument(document);
 *  }
 * </pre>
 *
 * <p>This stream is not intended to be used in analyzers;
 * it's more for iterating the different precisions during
 * indexing a specific numeric value.</p>
 *
 * <p><b>NOTE</b>: as token streams are only consumed once
 * the document is added to the index, if you index more
 * than one numeric field, use a separate <code>NumericTokenStream</code>
 * instance for each.</p>
 *
 * <p>See {@link NumericRangeQuery} for more details on the
 * <a
 * href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
 * parameter as well as how numeric fields work under the hood.</p>
 *
 * <p><font color="red"><b>NOTE:</b> This API is experimental and
 * might change in incompatible ways in the next release.</font>
 *
 * @since 2.9
 */
public final class NumericTokenStream extends TokenStream {

  /** The full precision token gets this token type assigned. */
  public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric";

  /** The lower precision tokens get this token type assigned. */
  public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric";

  /**
   * Creates a token stream for numeric values using the default <code>precisionStep</code>
   * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized;
   * before using, set a value using the various set<em>???</em>Value() methods.
   */
  public NumericTokenStream() {
    this(NumericUtils.PRECISION_STEP_DEFAULT);
  }

  /**
   * Creates a token stream for numeric values with the specified
   * <code>precisionStep</code>. The stream is not yet initialized;
   * before using, set a value using the various set<em>???</em>Value() methods.
   */
  public NumericTokenStream(final int precisionStep) {
    super();
    this.precisionStep = precisionStep;
    if (precisionStep < 1)
      throw new IllegalArgumentException("precisionStep must be >=1");
  }

  /**
   * Expert: Creates a token stream for numeric values with the specified
   * <code>precisionStep</code> using the given {@link AttributeSource}.
   * The stream is not yet initialized;
   * before using, set a value using the various set<em>???</em>Value() methods.
   */
  public NumericTokenStream(AttributeSource source, final int precisionStep) {
    super(source);
    this.precisionStep = precisionStep;
    if (precisionStep < 1)
      throw new IllegalArgumentException("precisionStep must be >=1");
  }

  /**
   * Expert: Creates a token stream for numeric values with the specified
   * <code>precisionStep</code> using the given
   * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
   * The stream is not yet initialized;
   * before using, set a value using the various set<em>???</em>Value() methods.
   */
  public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
    super(factory);
    this.precisionStep = precisionStep;
    if (precisionStep < 1)
      throw new IllegalArgumentException("precisionStep must be >=1");
  }

  /**
   * Initializes the token stream with the supplied <code>long</code> value.
   * @param value the value, for which this TokenStream should enumerate tokens.
   * @return this instance, because of this you can use it the following way:
   * <code>new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))</code>
   */
  public NumericTokenStream setLongValue(final long value) {
    this.value = value;
    valSize = 64;
    shift = 0;
    return this;
  }

  /**
   * Initializes the token stream with the supplied <code>int</code> value.
   * @param value the value, for which this TokenStream should enumerate tokens.
   * @return this instance, because of this you can use it the following way:
   * <code>new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))</code>
   */
  public NumericTokenStream setIntValue(final int value) {
    this.value = (long) value;
    valSize = 32;
    shift = 0;
    return this;
  }

  /**
   * Initializes the token stream with the supplied <code>double</code> value.
   * @param value the value, for which this TokenStream should enumerate tokens.
   * @return this instance, because of this you can use it the following way:
   * <code>new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))</code>
   */
  public NumericTokenStream setDoubleValue(final double value) {
    this.value = NumericUtils.doubleToSortableLong(value);
    valSize = 64;
    shift = 0;
    return this;
  }

  /**
   * Initializes the token stream with the supplied <code>float</code> value.
   * @param value the value, for which this TokenStream should enumerate tokens.
   * @return this instance, because of this you can use it the following way:
   * <code>new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))</code>
   */
  public NumericTokenStream setFloatValue(final float value) {
    this.value = (long) NumericUtils.floatToSortableInt(value);
    valSize = 32;
    shift = 0;
    return this;
  }

  @Override
  public void reset() {
    if (valSize == 0)
      throw new IllegalStateException("call set???Value() before usage");
    shift = 0;
  }

  @Override
  public boolean incrementToken() {
    if (valSize == 0)
      throw new IllegalStateException("call set???Value() before usage");
    if (shift >= valSize)
      return false;

    clearAttributes();
    final char[] buffer;
    switch (valSize) {
      case 64:
        buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
        termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
        break;

      case 32:
        buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
        termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
        break;

      default:
        // should not happen
        throw new IllegalArgumentException("valSize must be 32 or 64");
    }

    typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
    posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0);
    shift += precisionStep;
    return true;
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder("(numeric,valSize=").append(valSize);
    sb.append(",precisionStep=").append(precisionStep).append(')');
    return sb.toString();
  }

  // members
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  private int shift = 0, valSize = 0; // valSize==0 means not initialized
  private final int precisionStep;

  private long value = 0L;
}
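On the search side, the prefix-coded tokens this stream emits are consumed by NumericRangeQuery, which the javadoc above references. A small hedged sketch; the field name and bounds are illustrative, and the precisionStep must match the one used at index time:

import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;

public class NumericRangeDemo {
  public static void main(String[] args) {
    int precisionStep = 4; // must equal the precisionStep used by the NumericTokenStream at index time
    // Matches documents whose "price" field was indexed via
    // new NumericTokenStream(precisionStep).setIntValue(...)
    Query q = NumericRangeQuery.newIntRange("price", precisionStep, 10, 500, true, true);
    System.out.println(q);
  }
}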
@@ -0,0 +1,127 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;

/**
 * This analyzer is used to facilitate scenarios where different
 * fields require different analysis techniques.  Use {@link #addAnalyzer}
 * to add a non-default analyzer on a field name basis.
 *
 * <p>Example usage:
 *
 * <pre>
 *   PerFieldAnalyzerWrapper aWrapper =
 *     new PerFieldAnalyzerWrapper(new StandardAnalyzer());
 *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
 *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
 * </pre>
 *
 * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
 * and "lastname", for which KeywordAnalyzer will be used.
 *
 * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
 * and query parsing.
 */
public class PerFieldAnalyzerWrapper extends Analyzer {
  private Analyzer defaultAnalyzer;
  private Map<String,Analyzer> analyzerMap = new HashMap<String,Analyzer>();

  /**
   * Constructs with default analyzer.
   *
   * @param defaultAnalyzer Any fields not specifically
   * defined to use a different analyzer will use the one provided here.
   */
  public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer) {
    this(defaultAnalyzer, null);
  }

  /**
   * Constructs with default analyzer and a map of analyzers to use for
   * specific fields.
   *
   * @param defaultAnalyzer Any fields not specifically
   * defined to use a different analyzer will use the one provided here.
   * @param fieldAnalyzers a Map (String field name to the Analyzer) to be
   * used for those fields
   */
  public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer,
      Map<String,Analyzer> fieldAnalyzers) {
    this.defaultAnalyzer = defaultAnalyzer;
    if (fieldAnalyzers != null) {
      analyzerMap.putAll(fieldAnalyzers);
    }
    setOverridesTokenStreamMethod(PerFieldAnalyzerWrapper.class);
  }

  /**
   * Defines an analyzer to use for the specified field.
   *
   * @param fieldName field name requiring a non-default analyzer
   * @param analyzer non-default analyzer to use for field
   */
  public void addAnalyzer(String fieldName, Analyzer analyzer) {
    analyzerMap.put(fieldName, analyzer);
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    Analyzer analyzer = analyzerMap.get(fieldName);
    if (analyzer == null) {
      analyzer = defaultAnalyzer;
    }

    return analyzer.tokenStream(fieldName, reader);
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    if (overridesTokenStreamMethod) {
      // LUCENE-1678: force fallback to tokenStream() if we
      // have been subclassed and that subclass overrides
      // tokenStream but not reusableTokenStream
      return tokenStream(fieldName, reader);
    }
    Analyzer analyzer = analyzerMap.get(fieldName);
    if (analyzer == null)
      analyzer = defaultAnalyzer;

    return analyzer.reusableTokenStream(fieldName, reader);
  }

  /** Return the positionIncrementGap from the analyzer assigned to fieldName */
  @Override
  public int getPositionIncrementGap(String fieldName) {
    Analyzer analyzer = analyzerMap.get(fieldName);
    if (analyzer == null)
      analyzer = defaultAnalyzer;
    return analyzer.getPositionIncrementGap(fieldName);
  }

  @Override
  public String toString() {
    return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")";
  }
}
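Besides addAnalyzer(), the map-based constructor above lets the wrapper be built in one step. A brief sketch (the field names and demo class are illustrative), using KeywordAnalyzer and SimpleAnalyzer from this same commit:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.SimpleAnalyzer;

public class PerFieldDemo {
  public static void main(String[] args) {
    Map<String,Analyzer> perField = new HashMap<String,Analyzer>();
    perField.put("id", new KeywordAnalyzer()); // exact-match field keeps its value whole
    // Every other field falls back to the default analyzer.
    Analyzer analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), perField);
    System.out.println(analyzer); // toString() lists the map and the default
  }
}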
@@ -0,0 +1,61 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/** Transforms the token stream as per the Porter stemming algorithm.
    Note: the input to the stemming filter must already be in lower case,
    so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
    down the Tokenizer chain in order for this to work properly!
    <P>
    To use this filter with other analyzers, you'll want to write an
    Analyzer class that sets up the TokenStream chain as you want it.
    To use this with LowerCaseTokenizer, for example, you'd write an
    analyzer like this:
    <P>
    <PRE>
    class MyAnalyzer extends Analyzer {
      public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new PorterStemFilter(new LowerCaseTokenizer(reader));
      }
    }
    </PRE>
*/
public final class PorterStemFilter extends TokenFilter {
  private PorterStemmer stemmer;
  private TermAttribute termAtt;

  public PorterStemFilter(TokenStream in) {
    super(in);
    stemmer = new PorterStemmer();
    termAtt = addAttribute(TermAttribute.class);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (!input.incrementToken())
      return false;

    if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
      termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
    return true;
  }
}
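A runnable sketch of the filter in a chain (the demo class name and sample text are illustrative). As the javadoc warns, input must already be lower-cased, hence the LowerCaseTokenizer; note that Porter stems are roots, not dictionary words:

import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class PorterStemDemo {
  public static void main(String[] args) throws Exception {
    // Lower-case first, then stem each token in place.
    TokenStream ts = new PorterStemFilter(
        new LowerCaseTokenizer(new StringReader("Stemming reduces related words")));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // stem, reduc, relat, word
    }
  }
}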
@@ -0,0 +1,546 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

   Porter stemmer in Java. The original paper is in

       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
       no. 3, pp 130-137,

   See also http://www.tartarus.org/~martin/PorterStemmer/index.html

   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
   is then outside the bounds of b.

   Similarly,

   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
   b[j] is then outside the bounds of b.

   Release 3.

   [ This version is derived from Release 3, modified by Brian Goetz to
     optimize for fewer object creations. ]

*/

import java.io.*;

/**
 *
 * Stemmer, implementing the Porter Stemming Algorithm
 *
 * The Stemmer class transforms a word into its root form.  The input
 * word can be provided a character at time (by calling add()), or at once
 * by calling one of the various stem(something) methods.
 */

class PorterStemmer
{
  private char[] b;
  private int i,    /* offset into b */
    j, k, k0;
  private boolean dirty = false;
  private static final int INC = 50; /* unit of size whereby b is increased */
  private static final int EXTRA = 1;

  public PorterStemmer() {
    b = new char[INC];
    i = 0;
  }

  /**
   * reset() resets the stemmer so it can stem another word.  If you invoke
   * the stemmer by calling add(char) and then stem(), you must call reset()
   * before starting another word.
   */
  public void reset() { i = 0; dirty = false; }

  /**
   * Add a character to the word being stemmed.  When you are finished
   * adding characters, you can call stem(void) to process the word.
   */
  public void add(char ch) {
    if (b.length <= i + EXTRA) {
      char[] new_b = new char[b.length+INC];
      System.arraycopy(b, 0, new_b, 0, b.length);
      b = new_b;
    }
    b[i++] = ch;
  }

  /**
   * After a word has been stemmed, it can be retrieved by toString(),
   * or a reference to the internal buffer can be retrieved by getResultBuffer
   * and getResultLength (which is generally more efficient.)
   */
  @Override
  public String toString() { return new String(b,0,i); }

  /**
   * Returns the length of the word resulting from the stemming process.
   */
  public int getResultLength() { return i; }

  /**
   * Returns a reference to a character buffer containing the results of
   * the stemming process.  You also need to consult getResultLength()
   * to determine the length of the result.
   */
  public char[] getResultBuffer() { return b; }

  /* cons(i) is true <=> b[i] is a consonant. */

  private final boolean cons(int i) {
    switch (b[i]) {
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return false;
    case 'y':
      return (i==k0) ? true : !cons(i-1);
    default:
      return true;
    }
  }

  /* m() measures the number of consonant sequences between k0 and j. if c is
     a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     presence,

          <c><v>       gives 0
          <c>vc<v>     gives 1
          <c>vcvc<v>   gives 2
          <c>vcvcvc<v> gives 3
          ....
  */

  private final int m() {
    int n = 0;
    int i = k0;
    while(true) {
      if (i > j)
        return n;
      if (! cons(i))
        break;
      i++;
    }
    i++;
    while(true) {
      while(true) {
        if (i > j)
          return n;
        if (cons(i))
          break;
        i++;
      }
      i++;
      n++;
      while(true) {
        if (i > j)
          return n;
        if (! cons(i))
          break;
        i++;
      }
      i++;
    }
  }

  /* vowelinstem() is true <=> k0,...j contains a vowel */

  private final boolean vowelinstem() {
    int i;
    for (i = k0; i <= j; i++)
      if (! cons(i))
        return true;
    return false;
  }

  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

  private final boolean doublec(int j) {
    if (j < k0+1)
      return false;
    if (b[j] != b[j-1])
      return false;
    return cons(j);
  }

  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
     and also if the second c is not w,x or y. this is used when trying to
     restore an e at the end of a short word. e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.

  */

  private final boolean cvc(int i) {
    if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
      return false;
    else {
      int ch = b[i];
      if (ch == 'w' || ch == 'x' || ch == 'y') return false;
    }
    return true;
  }

  private final boolean ends(String s) {
    int l = s.length();
    int o = k-l+1;
    if (o < k0)
      return false;
    for (int i = 0; i < l; i++)
      if (b[o+i] != s.charAt(i))
        return false;
    j = k-l;
    return true;
  }

  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
     k. */

  void setto(String s) {
    int l = s.length();
    int o = j+1;
    for (int i = 0; i < l; i++)
      b[o+i] = s.charAt(i);
    k = j+l;
    dirty = true;
  }

  /* r(s) is used further down. */

  void r(String s) { if (m() > 0) setto(s); }

  /* step1() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet

  */

  private final void step1() {
    if (b[k] == 's') {
      if (ends("sses")) k -= 2;
      else if (ends("ies")) setto("i");
      else if (b[k-1] != 's') k--;
    }
    if (ends("eed")) {
      if (m() > 0)
        k--;
    }
    else if ((ends("ed") || ends("ing")) && vowelinstem()) {
      k = j;
      if (ends("at")) setto("ate");
      else if (ends("bl")) setto("ble");
      else if (ends("iz")) setto("ize");
      else if (doublec(k)) {
        int ch = b[k--];
        if (ch == 'l' || ch == 's' || ch == 'z')
          k++;
      }
      else if (m() == 1 && cvc(k))
        setto("e");
    }
  }

  /* step2() turns terminal y to i when there is another vowel in the stem. */

  private final void step2() {
    if (ends("y") && vowelinstem()) {
      b[k] = 'i';
      dirty = true;
    }
  }

  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
     -ation) maps to -ize etc. note that the string before the suffix must give
     m() > 0. */

  private final void step3() {
    if (k == k0) return; /* For Bug 1 */
    switch (b[k-1]) {
    case 'a':
      if (ends("ational")) { r("ate"); break; }
      if (ends("tional")) { r("tion"); break; }
      break;
    case 'c':
      if (ends("enci")) { r("ence"); break; }
      if (ends("anci")) { r("ance"); break; }
      break;
    case 'e':
      if (ends("izer")) { r("ize"); break; }
      break;
    case 'l':
      if (ends("bli")) { r("ble"); break; }
      if (ends("alli")) { r("al"); break; }
      if (ends("entli")) { r("ent"); break; }
      if (ends("eli")) { r("e"); break; }
      if (ends("ousli")) { r("ous"); break; }
      break;
    case 'o':
      if (ends("ization")) { r("ize"); break; }
      if (ends("ation")) { r("ate"); break; }
      if (ends("ator")) { r("ate"); break; }
      break;
    case 's':
      if (ends("alism")) { r("al"); break; }
      if (ends("iveness")) { r("ive"); break; }
      if (ends("fulness")) { r("ful"); break; }
      if (ends("ousness")) { r("ous"); break; }
      break;
    case 't':
      if (ends("aliti")) { r("al"); break; }
      if (ends("iviti")) { r("ive"); break; }
      if (ends("biliti")) { r("ble"); break; }
      break;
    case 'g':
      if (ends("logi")) { r("log"); break; }
    }
  }

  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

  private final void step4() {
    switch (b[k]) {
    case 'e':
      if (ends("icate")) { r("ic"); break; }
      if (ends("ative")) { r(""); break; }
      if (ends("alize")) { r("al"); break; }
      break;
    case 'i':
      if (ends("iciti")) { r("ic"); break; }
      break;
    case 'l':
      if (ends("ical")) { r("ic"); break; }
      if (ends("ful")) { r(""); break; }
      break;
    case 's':
      if (ends("ness")) { r(""); break; }
      break;
    }
  }

  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

  private final void step5() {
    if (k == k0) return; /* for Bug 1 */
    switch (b[k-1]) {
    case 'a':
      if (ends("al")) break;
      return;
    case 'c':
      if (ends("ance")) break;
      if (ends("ence")) break;
      return;
    case 'e':
      if (ends("er")) break; return;
    case 'i':
      if (ends("ic")) break; return;
    case 'l':
      if (ends("able")) break;
      if (ends("ible")) break; return;
    case 'n':
      if (ends("ant")) break;
      if (ends("ement")) break;
      if (ends("ment")) break;
      /* element etc. not stripped before the m */
      if (ends("ent")) break;
      return;
    case 'o':
      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
      /* j >= 0 fixes Bug 2 */
      if (ends("ou")) break;
      return;
      /* takes care of -ous */
    case 's':
      if (ends("ism")) break;
      return;
    case 't':
      if (ends("ate")) break;
      if (ends("iti")) break;
      return;
    case 'u':
      if (ends("ous")) break;
      return;
    case 'v':
      if (ends("ive")) break;
      return;
    case 'z':
      if (ends("ize")) break;
      return;
    default:
      return;
    }
    if (m() > 1)
      k = j;
  }

  /* step6() removes a final -e if m() > 1. */

  private final void step6() {
    j = k;
    if (b[k] == 'e') {
      int a = m();
      if (a > 1 || a == 1 && !cvc(k-1))
        k--;
    }
    if (b[k] == 'l' && doublec(k) && m() > 1)
      k--;
  }

  /**
   * Stem a word provided as a String.  Returns the result as a String.
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length()))
      return toString();
    else
      return s;
  }

  /** Stem a word contained in a char[].  Returns true if the stemming process
   * resulted in a word different from the input.  You can retrieve the
   * result with getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /** Stem a word contained in a portion of a char[] array.  Returns
   * true if the stemming process resulted in a word different from
   * the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    if (b.length < wordLen) {
      char[] new_b = new char[wordLen + EXTRA];
      b = new_b;
    }
    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
    i = wordLen;
    return stem(0);
  }

  /** Stem a word contained in a leading portion of a char[] array.
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word, int wordLen) {
    return stem(word, 0, wordLen);
  }

  /** Stem the word placed into the Stemmer buffer through calls to add().
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem() {
    return stem(0);
  }

  public boolean stem(int i0) {
    k = i - 1;
    k0 = i0;
    if (k > k0+1) {
      step1(); step2(); step3(); step4(); step5(); step6();
    }
    // Also, a word is considered dirty if we lopped off letters
    // Thanks to Ifigenia Vairelles for pointing this out.
    if (i != k+1)
      dirty = true;
    i = k+1;
    return dirty;
  }

  /** Test program for demonstrating the Stemmer.  It reads a file and
   * stems each word, writing the result to standard out.
   * Usage: Stemmer file-name
   */
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();

    for (int i = 0; i < args.length; i++) {
      try {
        InputStream in = new FileInputStream(args[i]);
        byte[] buffer = new byte[1024];
        int bufferLen, offset, ch;

        bufferLen = in.read(buffer);
        offset = 0;
        s.reset();

        while(true) {
          if (offset < bufferLen)
            ch = buffer[offset++];
          else {
            bufferLen = in.read(buffer);
            offset = 0;
            if (bufferLen < 0)
              ch = -1;
            else
              ch = buffer[offset++];
          }

          if (Character.isLetter((char) ch)) {
            s.add(Character.toLowerCase((char) ch));
          }
          else {
            s.stem();
            System.out.print(s.toString());
            s.reset();
            if (ch < 0)
              break;
            else {
              System.out.print((char) ch);
            }
          }
        }

        in.close();
      }
      catch (IOException e) {
        System.out.println("error reading " + args[i]);
      }
    }
  }
}
@@ -0,0 +1,42 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.IOException;

/** An {@link Analyzer} that filters {@link LetterTokenizer}
 *  with {@link LowerCaseFilter} */

public final class SimpleAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseTokenizer(reader);
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
      tokenizer = new LowerCaseTokenizer(reader);
      setPreviousTokenStream(tokenizer);
    } else
      tokenizer.reset(reader);
    return tokenizer;
  }
}
@@ -0,0 +1,119 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
import java.util.List;

import org.apache.lucene.util.Version;

/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopAnalyzer:
 * <ul>
 *   <li> As of 2.9, position increments are preserved
 * </ul>
 */

public final class StopAnalyzer extends Analyzer {
  private final Set<?> stopWords;
  private final boolean enablePositionIncrements;

  /** An unmodifiable set containing some common English words that are not usually useful
  for searching.*/
  public static final Set<?> ENGLISH_STOP_WORDS_SET;

  static {
    final List<String> stopWords = Arrays.asList(
      "a", "an", "and", "are", "as", "at", "be", "but", "by",
      "for", "if", "in", "into", "is", "it",
      "no", "not", "of", "on", "or", "such",
      "that", "the", "their", "then", "there", "these",
      "they", "this", "to", "was", "will", "with"
    );
    final CharArraySet stopSet = new CharArraySet(stopWords.size(), false);
    stopSet.addAll(stopWords);
    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
  }

  /** Builds an analyzer which removes words in
   *  {@link #ENGLISH_STOP_WORDS_SET}.
   * @param matchVersion See <a href="#version">above</a>
   */
  public StopAnalyzer(Version matchVersion) {
    stopWords = ENGLISH_STOP_WORDS_SET;
    enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
  }

  /** Builds an analyzer with the stop words from the given set.
   * @param matchVersion See <a href="#version">above</a>
   * @param stopWords Set of stop words */
  public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
    this.stopWords = stopWords;
    enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(File)
   * @param matchVersion See <a href="#version">above</a>
   * @param stopwordsFile File to load stop words from */
  public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwordsFile);
    this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader)
   * @param matchVersion See <a href="#version">above</a>
   * @param stopwords Reader to load stop words from */
  public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwords);
    this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
  }

  /** Filters LowerCaseTokenizer with StopFilter. */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
  }

  /** Filters LowerCaseTokenizer with StopFilter. */
  private class SavedStreams {
    Tokenizer source;
    TokenStream result;
  };
  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      streams.source = new LowerCaseTokenizer(reader);
      streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
      setPreviousTokenStream(streams);
    } else
      streams.source.reset(reader);
    return streams.result;
  }
}

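A sketch of the matchVersion behavior described in the class javadoc above (not part of the commit; Version.LUCENE_29 and the attribute classes are the 2.9-era API):

import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class StopAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // With LUCENE_29 or later, position increments are preserved across
    // removed stop words, per the <a name="version"> note above.
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_29);
    TokenStream ts = analyzer.tokenStream("f", new StringReader("the quick fox and the dog"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // "the" and "and" are dropped; survivors carry the skipped positions:
      // quick (posIncr=2), fox (posIncr=1), dog (posIncr=3)
      System.out.println(term.term() + " (posIncr=" + posIncr.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}
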
@@ -0,0 +1,191 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.List;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream.
 */

public final class StopFilter extends TokenFilter {

  private final CharArraySet stopWords;
  private boolean enablePositionIncrements = false;

  private TermAttribute termAtt;
  private PositionIncrementAttribute posIncrAtt;

  /**
   * Construct a token stream filtering the given input.
   * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
   * <code>makeStopSet()</code> was used to construct the set) it will be directly used
   * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
   * directly controls case sensitivity.
   * <p/>
   * If <code>stopWords</code> is not an instance of {@link CharArraySet},
   * a new CharArraySet will be constructed and <code>ignoreCase</code> will be
   * used to specify the case sensitivity of that set.
   *
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param input Input TokenStream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  {
    super(input);
    if (stopWords instanceof CharArraySet) {
      this.stopWords = (CharArraySet)stopWords;
    } else {
      this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
      this.stopWords.addAll(stopWords);
    }
    this.enablePositionIncrements = enablePositionIncrements;
    termAtt = addAttribute(TermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  }

  /**
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Set.
   *
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param in Input stream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @see #makeStopSet(java.lang.String[])
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
    this(enablePositionIncrements, in, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(String... stopWords) {
    return makeStopSet(stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(List<?> stopWords) {
    return makeStopSet(stopWords, false);
  }

  /**
   *
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }

  /**
   *
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }

  /**
   * Returns the next input Token whose term() is not a stop word.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // return the first non-stop word found
    int skippedPositions = 0;
    while (input.incrementToken()) {
      if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
        if (enablePositionIncrements) {
          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
        }
        return true;
      }
      skippedPositions += posIncrAtt.getPositionIncrement();
    }
    // reached EOS -- return false
    return false;
  }

  /**
   * Returns version-dependent default for
   * enablePositionIncrements.  Analyzers that embed
   * StopFilter use this method when creating the
   * StopFilter.  Prior to 2.9, this returns false.  On 2.9
   * or later, it returns true.
   */
  public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_29);
  }

  /**
   * @see #setEnablePositionIncrements(boolean)
   */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  /**
   * If <code>true</code>, this StopFilter will preserve
   * positions of the incoming tokens (i.e., accumulate and
   * set position increments of the removed stop tokens).
   * Generally, <code>true</code> is best as it does not
   * lose information (positions of the original tokens)
   * during indexing.
   *
   * <p> When set, when a token is stopped
   * (omitted), the position increment of the following
   * token is incremented.
   *
   * <p> <b>NOTE</b>: be sure to also
   * set {@link QueryParser#setEnablePositionIncrements} if
   * you use QueryParser to create queries.
   */
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
  }
}

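A sketch of wiring StopFilter up directly, using the makeStopSet helper defined in this file (not part of the commit; WhitespaceTokenizer is another tokenizer in this package):

import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class StopFilterDemo {
  public static void main(String[] args) throws Exception {
    // makeStopSet builds a CharArraySet, so the filter can test char[] terms
    // without allocating Strings; false = case-sensitive matching.
    Set<Object> stops = StopFilter.makeStopSet(new String[] {"of", "the"}, false);
    TokenStream ts = new StopFilter(true,   // true: record skipped positions
        new WhitespaceTokenizer(new StringReader("flight of the Navigator")), stops);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());      // flight, Navigator
    }
    ts.end();
    ts.close();
  }
}
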
@@ -0,0 +1,245 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * This TokenFilter provides the ability to set aside attribute states
 * that have already been analyzed.  This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();

TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);

TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
 * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the tee field names must be
 * less than the sink field names). If you are not sure which stream is consumed first, you can simply
 * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
 * This TokenFilter is exhausted after this. To do that, change the example above to:
 * <pre>
...
TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
TokenStream final2 = source2.newSinkTokenStream();
sink1.consumeAllTokens();
sink2.consumeAllTokens();
...
 * </pre>
 * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
 * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
 */
public final class TeeSinkTokenFilter extends TokenFilter {
  private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<WeakReference<SinkTokenStream>>();

  /**
   * Instantiates a new TeeSinkTokenFilter.
   */
  public TeeSinkTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
   */
  public SinkTokenStream newSinkTokenStream() {
    return newSinkTokenStream(ACCEPT_ALL_FILTER);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
   * that pass the supplied filter.
   * @see SinkFilter
   */
  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
    this.sinks.add(new WeakReference<SinkTokenStream>(sink));
    return sink;
  }

  /**
   * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
   * to this one. The supplied stream will also receive all consumed tokens.
   * This method can be used to pass tokens from two different tees to one sink.
   */
  public void addSinkTokenStream(final SinkTokenStream sink) {
    // check that sink has correct factory
    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
      throw new IllegalArgumentException("The supplied sink is not compatible with this tee");
    }
    // add any attribute impls missing from the existing sink
    for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
      sink.addAttributeImpl(it.next());
    }
    this.sinks.add(new WeakReference<SinkTokenStream>(sink));
  }

  /**
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
   * when it is consumed itself. To be sure that all tokens from the input
   * stream are passed to the sinks, you can call this method.
   * This instance is exhausted afterwards, but all sinks are immediately available.
   */
  public void consumeAllTokens() throws IOException {
    while (incrementToken());
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // capture state lazily - maybe no SinkFilter accepts this state
      AttributeSource.State state = null;
      for (WeakReference<SinkTokenStream> ref : sinks) {
        final SinkTokenStream sink = ref.get();
        if (sink != null) {
          if (sink.accept(this)) {
            if (state == null) {
              state = this.captureState();
            }
            sink.addState(state);
          }
        }
      }
      return true;
    }

    return false;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    AttributeSource.State finalState = captureState();
    for (WeakReference<SinkTokenStream> ref : sinks) {
      final SinkTokenStream sink = ref.get();
      if (sink != null) {
        sink.setFinalState(finalState);
      }
    }
  }

  /**
   * A filter that decides which {@link AttributeSource} states to store in the sink.
   */
  public static abstract class SinkFilter {
    /**
     * Returns true iff the current state of the passed-in {@link AttributeSource} shall be stored
     * in the sink.
     */
    public abstract boolean accept(AttributeSource source);

    /**
     * Called by {@link SinkTokenStream#reset()}. This method does nothing by default
     * and can optionally be overridden.
     */
    public void reset() throws IOException {
      // nothing to do; can be overridden
    }
  }

  public static final class SinkTokenStream extends TokenStream {
    private final List<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
    private AttributeSource.State finalState;
    private Iterator<AttributeSource.State> it = null;
    private SinkFilter filter;

    private SinkTokenStream(AttributeSource source, SinkFilter filter) {
      super(source);
      this.filter = filter;
    }

    private boolean accept(AttributeSource source) {
      return filter.accept(source);
    }

    private void addState(AttributeSource.State state) {
      if (it != null) {
        throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
      }
      cachedStates.add(state);
    }

    private void setFinalState(AttributeSource.State finalState) {
      this.finalState = finalState;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      // lazy init the iterator
      if (it == null) {
        it = cachedStates.iterator();
      }

      if (!it.hasNext()) {
        return false;
      }

      AttributeSource.State state = it.next();
      restoreState(state);
      return true;
    }

    @Override
    public final void end() throws IOException {
      if (finalState != null) {
        restoreState(finalState);
      }
    }

    @Override
    public final void reset() {
      it = cachedStates.iterator();
    }
  }

  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
    @Override
    public boolean accept(AttributeSource source) {
      return true;
    }
  };

}

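As a sketch of the SinkFilter hook described above (not part of the commit; it assumes the 2.9-era TermAttribute), a custom filter can route only some tokens to a sink, for example as a crude proper-noun side channel:

import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

// Accepts only tokens whose first character is upper case, so the sink
// collects capitalized tokens while the tee's main chain sees everything.
final class CapitalizedSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  @Override
  public boolean accept(AttributeSource source) {
    TermAttribute term = source.getAttribute(TermAttribute.class);
    return term.termLength() > 0 && Character.isUpperCase(term.termBuffer()[0]);
  }
}

Usage would then be source1.newSinkTokenStream(new CapitalizedSinkFilter()) in place of the unfiltered newSinkTokenStream() call in the example above.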
@@ -0,0 +1,811 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;

/**
  A Token is an occurrence of a term from the text of a field.  It consists of
  a term's text, the start and end offset of the term in the text of the field,
  and a type string.
  <p>
  The start and end offsets permit applications to re-associate a token with
  its source text, e.g., to display highlighted query terms in a document
  browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
  display, etc.
  <p>
  The type is a string, assigned by a lexical analyzer
  (a.k.a. tokenizer), naming the lexical or syntactic class that the token
  belongs to.  For example an end of sentence marker token might be implemented
  with type "eos".  The default token type is "word".
  <p>
  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
  length byte array. Use {@link TermPositions#getPayloadLength()} and
  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.

  <br><br>

  <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
  that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
  Even though it is not necessary to use Token anymore, with the new TokenStream API it can
  be used as a convenience class that implements all {@link Attribute}s, which is especially useful
  to easily switch from the old to the new TokenStream API.

  <br><br>

  <p>Tokenizers and TokenFilters should try to re-use a Token
  instance when possible for best performance, by
  implementing the {@link TokenStream#incrementToken()} API.
  Failing that, to create a new Token you should first use
  one of the constructors that starts with null text.  To load
  the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
  To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}.
  Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
  if you know that your text is shorter than the capacity of the termBuffer
  or {@link #resizeTermBuffer(int)}, if there is any possibility
  that you may need to grow the buffer. Fill in the characters of your term into this
  buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
  or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
  set the length of the term text.  See <a target="_top"
  href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
  for details.</p>
  <p>Typical Token reuse patterns:
  <ul>
  <li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(string, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
  </pre>
  </li>
  </ul>
  A few things to note:
  <ul>
  <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
  <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
  <li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li>
  <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
  </ul>
  </p>

  @see org.apache.lucene.index.Payload
*/
public class Token extends AttributeImpl
                   implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
                              FlagsAttribute, OffsetAttribute, PayloadAttribute {

  public static final String DEFAULT_TYPE = "word";

  private static int MIN_BUFFER_SIZE = 10;

  private char[] termBuffer;
  private int termLength;
  private int startOffset,endOffset;
  private String type = DEFAULT_TYPE;
  private int flags;
  private Payload payload;
  private int positionIncrement = 1;

  /** Constructs a Token with null text. */
  public Token() {
  }

  /** Constructs a Token with null text and start & end
   *  offsets.
   *  @param start start offset in the source text
   *  @param end end offset in the source text */
  public Token(int start, int end) {
    startOffset = start;
    endOffset = end;
  }

  /** Constructs a Token with null text and start & end
   *  offsets plus the Token type.
   *  @param start start offset in the source text
   *  @param end end offset in the source text
   *  @param typ the lexical type of this Token */
  public Token(int start, int end, String typ) {
    startOffset = start;
    endOffset = end;
    type = typ;
  }

  /**
   * Constructs a Token with null text and start & end
   * offsets plus flags. NOTE: flags is EXPERIMENTAL.
   * @param start start offset in the source text
   * @param end end offset in the source text
   * @param flags The bits to set for this token
   */
  public Token(int start, int end, int flags) {
    startOffset = start;
    endOffset = end;
    this.flags = flags;
  }

  /** Constructs a Token with the given term text, and start
   *  & end offsets.  The type defaults to "word."
   *  <b>NOTE:</b> for better indexing speed you should
   *  instead use the char[] termBuffer methods to set the
   *  term text.
   *  @param text term text
   *  @param start start offset
   *  @param end end offset
   */
  public Token(String text, int start, int end) {
    setTermBuffer(text);
    startOffset = start;
    endOffset = end;
  }

  /** Constructs a Token with the given text, start and end
   *  offsets, & type.  <b>NOTE:</b> for better indexing
   *  speed you should instead use the char[] termBuffer
   *  methods to set the term text.
   *  @param text term text
   *  @param start start offset
   *  @param end end offset
   *  @param typ token type
   */
  public Token(String text, int start, int end, String typ) {
    setTermBuffer(text);
    startOffset = start;
    endOffset = end;
    type = typ;
  }

  /**
   *  Constructs a Token with the given text, start and end
   *  offsets, & type.  <b>NOTE:</b> for better indexing
   *  speed you should instead use the char[] termBuffer
   *  methods to set the term text.
   * @param text
   * @param start
   * @param end
   * @param flags token type bits
   */
  public Token(String text, int start, int end, int flags) {
    setTermBuffer(text);
    startOffset = start;
    endOffset = end;
    this.flags = flags;
  }

  /**
   *  Constructs a Token with the given term buffer (offset
   *  & length), start and end
   *  offsets
   * @param startTermBuffer
   * @param termBufferOffset
   * @param termBufferLength
   * @param start
   * @param end
   */
  public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
    setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength);
    startOffset = start;
    endOffset = end;
  }

  /** Set the position increment.  This determines the position of this token
   * relative to the previous Token in a {@link TokenStream}, used in phrase
   * searching.
   *
   * <p>The default value is one.
   *
   * <p>Some common uses for this are:<ul>
   *
   * <li>Set it to zero to put multiple terms in the same position.  This is
   * useful if, e.g., a word has multiple stems.  Searches for phrases
   * including either stem will match.  In this case, all but the first stem's
   * increment should be set to zero: the increment of the first instance
   * should be one.  Repeating a token with an increment of zero can also be
   * used to boost the scores of matches on that token.
   *
   * <li>Set it to values greater than one to inhibit exact phrase matches.
   * If, for example, one does not want phrases to match across removed stop
   * words, then one could build a stop word filter that removes stop words and
   * also sets the increment to the number of stop words removed before each
   * non-stop word.  Then exact phrase queries will only match when the terms
   * occur with no intervening stop words.
   *
   * </ul>
   * @param positionIncrement the distance from the prior term
   * @see org.apache.lucene.index.TermPositions
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)
      throw new IllegalArgumentException
        ("Increment must be zero or greater: " + positionIncrement);
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  /** Returns the Token's term text.
   *
   * This method has a performance penalty
   * because the text is stored internally in a char[].  If
   * possible, use {@link #termBuffer()} and {@link
   * #termLength()} directly instead.  If you really need a
   * String, use this method, which is nothing more than
   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
   */
  public final String term() {
    initTermBuffer();
    return new String(termBuffer, 0, termLength);
  }

  /** Copies the contents of buffer, starting at offset for
   *  length characters, into the termBuffer array.
   *  @param buffer the buffer to copy
   *  @param offset the index in the buffer of the first character to copy
   *  @param length the number of characters to copy
   */
  public final void setTermBuffer(char[] buffer, int offset, int length) {
    growTermBuffer(length);
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  /** Copies the contents of buffer into the termBuffer array.
   *  @param buffer the buffer to copy
   */
  public final void setTermBuffer(String buffer) {
    final int length = buffer.length();
    growTermBuffer(length);
    buffer.getChars(0, length, termBuffer, 0);
    termLength = length;
  }

  /** Copies the contents of buffer, starting at offset and continuing
   *  for length characters, into the termBuffer array.
   *  @param buffer the buffer to copy
   *  @param offset the index in the buffer of the first character to copy
   *  @param length the number of characters to copy
   */
  public final void setTermBuffer(String buffer, int offset, int length) {
    assert offset <= buffer.length();
    assert offset + length <= buffer.length();
    growTermBuffer(length);
    buffer.getChars(offset, offset + length, termBuffer, 0);
    termLength = length;
  }

  /** Returns the internal termBuffer character array which
   *  you can then directly alter.  If the array is too
   *  small for your token, use {@link
   *  #resizeTermBuffer(int)} to increase it.  After
   *  altering the buffer be sure to call {@link
   *  #setTermLength} to record the number of valid
   *  characters that were placed into the termBuffer. */
  public final char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }

  /** Grows the termBuffer to at least size newSize, preserving the
   *  existing content. Note: If the next operation is to change
   *  the contents of the term buffer use
   *  {@link #setTermBuffer(char[], int, int)},
   *  {@link #setTermBuffer(String)}, or
   *  {@link #setTermBuffer(String, int, int)}
   *  to optimally combine the resize with the setting of the termBuffer.
   *  @param newSize minimum size of the new termBuffer
   *  @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    if (termBuffer == null) {
      // The buffer is always at least MIN_BUFFER_SIZE
      termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
    } else {
      if(termBuffer.length < newSize){
        // Not big enough; create a new array with slight
        // over allocation and preserve content
        final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
        System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
        termBuffer = newCharBuffer;
      }
    }
    return termBuffer;
  }

  /** Allocates a buffer char[] of at least newSize, without preserving the existing content.
   *  It is always used in places that set the content.
   *  @param newSize minimum size of the buffer
   */
  private void growTermBuffer(int newSize) {
    if (termBuffer == null) {
      // The buffer is always at least MIN_BUFFER_SIZE
      termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
    } else {
      if(termBuffer.length < newSize){
        // Not big enough; create a new array with slight
        // over allocation:
        termBuffer = new char[ArrayUtil.getNextSize(newSize)];
      }
    }
  }

  private void initTermBuffer() {
    if (termBuffer == null) {
      termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
      termLength = 0;
    }
  }

  /** Return number of valid characters (length of the term)
   *  in the termBuffer array. */
  public final int termLength() {
    initTermBuffer();
    return termLength;
  }

  /** Set number of valid characters (length of the term) in
   *  the termBuffer array. Use this to truncate the termBuffer
   *  or to synchronize with external manipulation of the termBuffer.
   *  Note: to grow the size of the array,
   *  use {@link #resizeTermBuffer(int)} first.
   *  @param length the truncated length
   */
  public final void setTermLength(int length) {
    initTermBuffer();
    if (length > termBuffer.length)
      throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
    termLength = length;
  }

  /** Returns this Token's starting offset, the position of the first character
    corresponding to this token in the source text.

    Note that the difference between endOffset() and startOffset() may not be
    equal to {@link #termLength}, as the term text may have been altered by a
    stemmer or some other filter. */
  public final int startOffset() {
    return startOffset;
  }

  /** Set the starting offset.
      @see #startOffset() */
  public void setStartOffset(int offset) {
    this.startOffset = offset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
    last character corresponding to this token in the source text. The length
    of the token in the source text is (endOffset - startOffset). */
  public final int endOffset() {
    return endOffset;
  }

  /** Set the ending offset.
      @see #endOffset() */
  public void setEndOffset(int offset) {
    this.endOffset = offset;
  }

  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset()*/
  public void setOffset(int startOffset, int endOffset) {
    this.startOffset = startOffset;
    this.endOffset = endOffset;
  }

  /** Returns this Token's lexical type.  Defaults to "word". */
  public final String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public final void setType(String type) {
    this.type = type;
  }

  /**
   * EXPERIMENTAL:  While we think this is here to stay, we may want to change it to be a long.
   * <p/>
   *
   * Get the bitset for any bits that have been set.  This is completely distinct from {@link #type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   *
   * @return The bits
   */
  public int getFlags() {
    return flags;
  }

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append('(');
    initTermBuffer();
    if (termBuffer == null)
      sb.append("null");
    else
      sb.append(termBuffer, 0, termLength);
    sb.append(',').append(startOffset).append(',').append(endOffset);
    if (!type.equals("word"))
      sb.append(",type=").append(type);
    if (positionIncrement != 1)
      sb.append(",posIncr=").append(positionIncrement);
    sb.append(')');
    return sb.toString();
  }

  /** Resets the term text, payload, flags, and positionIncrement,
   *  startOffset, endOffset and token type to default.
   */
  @Override
  public void clear() {
    payload = null;
    // Leave termBuffer to allow re-use
    termLength = 0;
    positionIncrement = 1;
    flags = 0;
    startOffset = endOffset = 0;
    type = DEFAULT_TYPE;
  }

  @Override
  public Object clone() {
    Token t = (Token)super.clone();
    // Do a deep clone
    if (termBuffer != null) {
      t.termBuffer = (char[]) termBuffer.clone();
    }
    if (payload != null) {
      t.payload = (Payload) payload.clone();
    }
    return t;
  }

  /** Makes a clone, but replaces the term buffer &
   * start/end offset in the process.  This is more
   * efficient than doing a full clone (and then calling
   * setTermBuffer) because it saves a wasted copy of the old
   * termBuffer. */
  public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
    final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
    t.positionIncrement = positionIncrement;
    t.flags = flags;
    t.type = type;
    if (payload != null)
      t.payload = (Payload) payload.clone();
    return t;
  }

  @Override
  public boolean equals(Object obj) {
    if (obj == this)
      return true;

    if (obj instanceof Token) {
      Token other = (Token) obj;

      initTermBuffer();
      other.initTermBuffer();

      if (termLength == other.termLength &&
          startOffset == other.startOffset &&
          endOffset == other.endOffset &&
          flags == other.flags &&
          positionIncrement == other.positionIncrement &&
          subEqual(type, other.type) &&
          subEqual(payload, other.payload)) {
        for(int i=0;i<termLength;i++)
          if (termBuffer[i] != other.termBuffer[i])
            return false;
        return true;
      } else
        return false;
    } else
      return false;
  }

  private boolean subEqual(Object o1, Object o2) {
    if (o1 == null)
      return o2 == null;
    else
      return o1.equals(o2);
  }

  @Override
  public int hashCode() {
    initTermBuffer();
    int code = termLength;
    code = code * 31 + startOffset;
    code = code * 31 + endOffset;
    code = code * 31 + flags;
    code = code * 31 + positionIncrement;
    code = code * 31 + type.hashCode();
    code = (payload == null ? code : code * 31 + payload.hashCode());
    code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
    return code;
  }

  // like clear() but doesn't clear termBuffer/text
  private void clearNoTermBuffer() {
    payload = null;
    positionIncrement = 1;
    flags = 0;
    startOffset = endOffset = 0;
    type = DEFAULT_TYPE;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(char[], int, int)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset},
   *  {@link #setType}
   *  @return this Token instance */
  public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
    clearNoTermBuffer();
    payload = null;
    positionIncrement = 1;
    setTermBuffer(newTermBuffer, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(char[], int, int)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset}
   *  {@link #setType} on Token.DEFAULT_TYPE
   *  @return this Token instance */
  public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
    clearNoTermBuffer();
    setTermBuffer(newTermBuffer, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(String)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset}
   *  {@link #setType}
   *  @return this Token instance */
  public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
    clearNoTermBuffer();
    setTermBuffer(newTerm);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(String, int, int)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset}
   *  {@link #setType}
   *  @return this Token instance */
  public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
    clearNoTermBuffer();
    setTermBuffer(newTerm, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(String)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset}
   *  {@link #setType} on Token.DEFAULT_TYPE
   *  @return this Token instance */
  public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
    clearNoTermBuffer();
    setTermBuffer(newTerm);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
  }

  /** Shorthand for calling {@link #clear},
   *  {@link #setTermBuffer(String, int, int)},
   *  {@link #setStartOffset},
   *  {@link #setEndOffset}
   *  {@link #setType} on Token.DEFAULT_TYPE
   *  @return this Token instance */
  public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
    clearNoTermBuffer();
    setTermBuffer(newTerm, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
  }

  /**
   * Copy the prototype token's fields into this one. Note: Payloads are shared.
   * @param prototype
   */
  public void reinit(Token prototype) {
    prototype.initTermBuffer();
    setTermBuffer(prototype.termBuffer, 0, prototype.termLength);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
  }

  /**
   * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
   * @param prototype
   * @param newTerm
   */
  public void reinit(Token prototype, String newTerm) {
    setTermBuffer(newTerm);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
  }

  /**
   * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
   * @param prototype
   * @param newTermBuffer
   * @param offset
   * @param length
   */
  public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
    setTermBuffer(newTermBuffer, offset, length);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    if (target instanceof Token) {
      final Token to = (Token) target;
      to.reinit(this);
      // reinit shares the payload, so clone it:
      if (payload !=null) {
        to.payload = (Payload) payload.clone();
      }
    } else {
      initTermBuffer();
      ((TermAttribute) target).setTermBuffer(termBuffer, 0, termLength);
      ((OffsetAttribute) target).setOffset(startOffset, endOffset);
      ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
      ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
      ((FlagsAttribute) target).setFlags(flags);
      ((TypeAttribute) target).setType(type);
    }
  }

  /** Convenience factory that returns <code>Token</code> as implementation for the basic
   *  attributes and returns the default impl (with "Impl" appended) for all other
   *  attributes.
   *  @since 3.0
   */
  public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
    new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);

  /** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
   *  and for all other attributes calls the given delegate factory.
   *  @since 3.0
   */
  public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {

    private final AttributeSource.AttributeFactory delegate;

    /** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
     *  and for all other attributes calls the given delegate factory. */
    public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
      this.delegate = delegate;
    }

    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      return attClass.isAssignableFrom(Token.class)
        ? new Token() : delegate.createAttributeInstance(attClass);
    }

    @Override
    public boolean equals(Object other) {
      if (this == other) return true;
      if (other instanceof TokenAttributeFactory) {
        final TokenAttributeFactory af = (TokenAttributeFactory) other;
        return this.delegate.equals(af.delegate);
      }
      return false;
    }

    @Override
    public int hashCode() {
      return delegate.hashCode() ^ 0x0a45aa31;
    }
  }

}

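A small sketch of the reuse and position-increment behavior the Token javadoc describes (not part of the commit; the term text is arbitrary):

import org.apache.lucene.analysis.Token;

public class TokenReuseDemo {
  public static void main(String[] args) {
    Token reusableToken = new Token();

    // Reuse the same Token for successive terms instead of allocating new ones.
    reusableToken.reinit("flowers", 0, 7);   // term text + start/end offset
    System.out.println(reusableToken);       // (flowers,0,7)

    // Inject a stem at the same position: posIncr=0 stacks it on the previous
    // token, so phrase queries match either surface form.
    reusableToken.reinit("flower", 0, 7);    // reinit resets posIncr to 1...
    reusableToken.setPositionIncrement(0);   // ...so set it afterwards
    System.out.println(reusableToken);       // (flower,0,7,posIncr=0)
  }
}
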
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

/** A TokenFilter is a TokenStream whose input is another TokenStream.
  <p>
  This is an abstract class; subclasses must override {@link #incrementToken()}.
  @see TokenStream
  */
public abstract class TokenFilter extends TokenStream {
  /** The source of tokens for this filter. */
  protected final TokenStream input;

  /** Construct a token stream filtering the given input. */
  protected TokenFilter(TokenStream input) {
    super(input);
    this.input = input;
  }

  /** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
   * input TokenStream.<p/>
   * <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
  @Override
  public void end() throws IOException {
    input.end();
  }

  /** Close the input TokenStream. */
  @Override
  public void close() throws IOException {
    input.close();
  }

  /** Reset the filter as well as the input TokenStream. */
  @Override
  public void reset() throws IOException {
    input.reset();
  }
}

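For illustration (a sketch, not part of the commit), a minimal filter built on this base class; it drops terms shorter than a given length via the 2.9-era TermAttribute, while end/close/reset chaining is inherited from TokenFilter:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Drops tokens whose term is shorter than minLength.
public final class MinLengthFilter extends TokenFilter {
  private final int minLength;
  private final TermAttribute termAtt;

  public MinLengthFilter(TokenStream input, int minLength) {
    super(input);
    this.minLength = minLength;
    this.termAtt = addAttribute(TermAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      if (termAtt.termLength() >= minLength) {
        return true;   // keep this token's attribute state as-is
      }
    }
    return false;      // input exhausted
  }
}
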
@@ -0,0 +1,161 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Closeable;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>
 * <code>TokenStream</code> now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false,
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
 * using the <code>TokenStream</code>.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * {@link TeeSinkTokenFilter}). For this usecase
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
 */
public abstract class TokenStream extends AttributeSource implements Closeable {

  /**
   * A TokenStream using the default attribute factory.
   */
  protected TokenStream() {
    super();
  }

  /**
   * A TokenStream that uses the same attributes as the supplied one.
   */
  protected TokenStream(AttributeSource input) {
    super(input);
  }

  /**
   * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
   */
  protected TokenStream(AttributeFactory factory) {
    super(factory);
  }

  /**
   * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
   * the next token. Implementing classes must implement this method and update
   * the appropriate {@link AttributeImpl}s with the attributes of the next
   * token.
   * <p>
   * The producer must make no assumptions about the attributes after the method
   * has returned: the caller may arbitrarily change it. If the producer
   * needs to preserve the state for subsequent calls, it can use
   * {@link #captureState} to create a copy of the current attribute state.
   * <p>
   * This method is called for every token of a document, so an efficient
   * implementation is crucial for good performance. To avoid calls to
   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
   * references to all {@link AttributeImpl}s that this stream uses should be
   * retrieved during instantiation.
   * <p>
   * To ensure that filters and consumers know which attributes are available,
   * the attributes must be added during instantiation. Filters and consumers
   * are not required to check for availability of attributes in
   * {@link #incrementToken()}.
   *
   * @return false for end of stream; true otherwise
   */
  public abstract boolean incrementToken() throws IOException;

  /**
   * This method is called by the consumer after the last token has been
   * consumed, after {@link #incrementToken()} returned <code>false</code>
   * (using the new <code>TokenStream</code> API). Streams implementing the old API
   * should upgrade to use this feature.
   * <p/>
   * This method can be used to perform any end-of-stream operations, such as
   * setting the final offset of a stream. The final offset of a stream might
   * differ from the offset of the last token, e.g., when one or more whitespace
   * characters followed the last token and a {@link WhitespaceTokenizer} was used.
   *
   * @throws IOException
   */
  public void end() throws IOException {
    // do nothing by default
  }

  /**
   * Resets this stream to the beginning. This is an optional operation, so
   * subclasses may or may not implement this method. {@link #reset()} is not needed for
   * the standard indexing process. However, if the tokens of a
   * <code>TokenStream</code> are intended to be consumed more than once, it is
   * necessary to implement {@link #reset()}. Note that if your TokenStream
   * caches tokens and feeds them back again after a reset, it is imperative
   * that you clone the tokens when you store them away (on the first pass) as
   * well as when you return them (on future passes after {@link #reset()}).
   */
  public void reset() throws IOException {}

  /** Releases resources associated with this stream. */
  public void close() throws IOException {}

}
@@ -0,0 +1,92 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.AttributeSource;

import java.io.Reader;
import java.io.IOException;

/** A Tokenizer is a TokenStream whose input is a Reader.
  <p>
  This is an abstract class; subclasses must override {@link #incrementToken()}.
  <p>
  NOTE: Subclasses overriding {@link #incrementToken()} must
  call {@link AttributeSource#clearAttributes()} before
  setting attributes.
  Subclasses overriding the old Token-based API must call
  {@link Token#clear()} before setting Token attributes.
 */
public abstract class Tokenizer extends TokenStream {
  /** The text source for this Tokenizer. */
  protected Reader input;

  /** Construct a tokenizer with null input. */
  protected Tokenizer() {}

  /** Construct a token stream processing the given input. */
  protected Tokenizer(Reader input) {
    this.input = CharReader.get(input);
  }

  /** Construct a tokenizer with null input using the given AttributeFactory. */
  protected Tokenizer(AttributeFactory factory) {
    super(factory);
  }

  /** Construct a token stream processing the given input using the given AttributeFactory. */
  protected Tokenizer(AttributeFactory factory, Reader input) {
    super(factory);
    this.input = CharReader.get(input);
  }

  /** Construct a tokenizer with null input using the given AttributeSource. */
  protected Tokenizer(AttributeSource source) {
    super(source);
  }

  /** Construct a token stream processing the given input using the given AttributeSource. */
  protected Tokenizer(AttributeSource source, Reader input) {
    super(source);
    this.input = CharReader.get(input);
  }

  /** By default, closes the input Reader. */
  @Override
  public void close() throws IOException {
    input.close();
  }

  /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
   * this method calls {@link CharStream#correctOffset}, else returns <code>currentOff</code>.
   * @param currentOff offset as seen in the output
   * @return corrected offset based on the input
   * @see CharStream#correctOffset
   */
  protected final int correctOffset(int currentOff) {
    return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
  }

  /** Expert: Reset the tokenizer to a new reader.  Typically, an
   *  analyzer (in its reusableTokenStream method) will use
   *  this to re-use a previously created tokenizer. */
  public void reset(Reader input) throws IOException {
    this.input = input;
  }
}
@@ -0,0 +1,41 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.IOException;

/** An Analyzer that uses {@link WhitespaceTokenizer}. */

public final class WhitespaceAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new WhitespaceTokenizer(reader);
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // reuse the tokenizer previously created for this thread, if any
    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
      tokenizer = new WhitespaceTokenizer(reader);
      setPreviousTokenStream(tokenizer);
    } else {
      // re-point the existing tokenizer at the new Reader
      tokenizer.reset(reader);
    }
    return tokenizer;
  }
}
@@ -0,0 +1,49 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;

/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-whitespace characters form tokens. */

public class WhitespaceTokenizer extends CharTokenizer {
  /** Construct a new WhitespaceTokenizer. */
  public WhitespaceTokenizer(Reader in) {
    super(in);
  }

  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
  public WhitespaceTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
  public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Collects only characters which do not satisfy
   * {@link Character#isWhitespace(char)}.*/
  @Override
  protected boolean isTokenChar(char c) {
    return !Character.isWhitespace(c);
  }
}
@@ -0,0 +1,177 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Loader for text files that represent a list of stopwords.
 */
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

  /**
   * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @param comment The comment string to ignore
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader, comment);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        result.add(word.trim());
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }

  /**
   * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @param comment The string representing a comment.
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        if (!word.startsWith(comment)) {
          result.add(word.trim());
        }
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }


  /**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab-separated words)
   *
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException
   */
  public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null)
      throw new NullPointerException("wordstemfile may not be null");
    HashMap<String, String> result = new HashMap<String, String>();
    BufferedReader br = null;
    FileReader fr = null;
    try {
      fr = new FileReader(wordstemfile);
      br = new BufferedReader(fr);
      String line;
      while ((line = br.readLine()) != null) {
        String[] wordstem = line.split("\t", 2);
        result.put(wordstem[0], wordstem[1]);
      }
    } finally {
      if (fr != null)
        fr.close();
      if (br != null)
        br.close();
    }
    return result;
  }

}
@@ -0,0 +1,635 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
<p>API and code to convert text into indexable/searchable tokens. Covers {@link org.apache.lucene.analysis.Analyzer} and related classes.</p>
<h2>Parsing? Tokenization? Analysis!</h2>
<p>
Lucene, an indexing and search library, accepts only plain text input.
<p>
<h2>Parsing</h2>
<p>
Applications that build their search capabilities upon Lucene may support documents in various formats – HTML, XML, PDF, Word – just to name a few.
Lucene does not care about the <i>Parsing</i> of these and other document formats, and it is the responsibility of the
application using Lucene to use an appropriate <i>Parser</i> to convert the original format into plain text before passing that plain text to Lucene.
<p>
<h2>Tokenization</h2>
<p>
Plain text passed to Lucene for indexing goes through a process generally called tokenization. Tokenization is the process
of breaking input text into small indexing elements – tokens.
The way input text is broken into tokens heavily influences how people will then be able to search for that text.
For instance, sentence beginnings and endings can be identified to provide for more accurate phrase
and proximity searches (though sentence identification is not provided by Lucene).
<p>
In some cases simply breaking the input text into tokens is not enough – a deeper <i>Analysis</i> may be needed.
There are many post-tokenization steps that can be done, including (but not limited to):
<ul>
  <li><a href="http://en.wikipedia.org/wiki/Stemming">Stemming</a> –
      Replacing words with their stems.
      For instance with English stemming "bikes" is replaced by "bike";
      now a query for "bike" can find both documents containing "bike" and those containing "bikes".
  </li>
  <li><a href="http://en.wikipedia.org/wiki/Stop_words">Stop Words Filtering</a> –
      Common words like "the", "and" and "a" rarely add any value to a search.
      Removing them shrinks the index size and increases performance.
      It may also reduce some "noise" and actually improve search quality.
  </li>
  <li><a href="http://en.wikipedia.org/wiki/Text_normalization">Text Normalization</a> –
      Stripping accents and other character markings can make for better searching.
  </li>
  <li><a href="http://en.wikipedia.org/wiki/Synonym">Synonym Expansion</a> –
      Adding in synonyms at the same token position as the current word can mean better
      matching when users search with words in the synonym set.
  </li>
</ul>
<p>
<h2>Core Analysis</h2>
<p>
The analysis package provides the mechanism to convert Strings and Readers into tokens that can be indexed by Lucene. There
are three main classes in the package from which all analysis processes are derived. These are:
<ul>
  <li>{@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
  by the indexing and searching processes. See below for more information on implementing your own Analyzer.</li>
  <li>{@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
  up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
  the analysis process.</li>
  <li>{@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
  for modifying tokens that have been created by the Tokenizer. Common modifications performed by a
  TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters.</li>
</ul>
<b>Lucene 2.9 introduces a new TokenStream API. Please see the section "New TokenStream API" below for more details.</b>
</p>
<h2>Hints, Tips and Traps</h2>
<p>
The synergy between {@link org.apache.lucene.analysis.Analyzer} and {@link org.apache.lucene.analysis.Tokenizer}
is sometimes confusing. To ease this confusion, some clarifications:
<ul>
  <li>The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of
      <u>creating</u> tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer}
      is only responsible for <u>breaking</u> the input text into tokens. Very likely, tokens created
      by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted
      by the {@link org.apache.lucene.analysis.Analyzer} (via one or more
      {@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
  </li>
  <li>{@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream},
      but {@link org.apache.lucene.analysis.Analyzer} is not.
  </li>
  <li>{@link org.apache.lucene.analysis.Analyzer} is "field aware", but
      {@link org.apache.lucene.analysis.Tokenizer} is not.
  </li>
</ul>
</p>
<p>
Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link
org.apache.lucene.analysis.standard.StandardAnalyzer}. Many applications will have a long and industrious life with nothing more
than the StandardAnalyzer. However, there are a few other classes/packages that are worth mentioning:
<ol>
  <li>{@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper} – Most Analyzers perform the same operation on all
    {@link org.apache.lucene.document.Field}s. The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different
    {@link org.apache.lucene.document.Field}s.</li>
  <li>The contrib/analyzers library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety
    of different problems related to searching. Many of the Analyzers are designed to analyze non-English languages.</li>
  <li>The contrib/snowball library
    located at the root of the Lucene distribution has Analyzer and TokenFilter
    implementations for a variety of Snowball stemmers.
    See <a href="http://snowball.tartarus.org">http://snowball.tartarus.org</a>
    for more information on Snowball stemmers.</li>
  <li>There are a variety of Tokenizer and TokenFilter implementations in this package. Take a look around, chances are someone has implemented what you need.</li>
</ol>
</p>
<p>
Analysis is one of the main causes of performance degradation during indexing. Simply put, the more you analyze the slower the indexing (in most cases).
Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a
{@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.
</p>
<h2>Invoking the Analyzer</h2>
<p>
Applications usually do not invoke analysis – Lucene does it for them:
<ul>
  <li>At indexing, as a consequence of
      {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document) addDocument(doc)},
      the Analyzer in effect for indexing is invoked for each indexed field of the added document.
  </li>
  <li>At search, as a consequence of
      {@link org.apache.lucene.queryParser.QueryParser#parse(java.lang.String) QueryParser.parse(queryText)},
      the QueryParser may invoke the Analyzer in effect.
      Note that for some queries analysis does not take place, e.g. wildcard queries.
  </li>
</ul>
However, an application might invoke analysis of any text for testing or for any other purpose, something like:
<PRE>
    Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
    TokenStream ts = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println("token: " + ts);
    }
    ts.end();
    ts.close();
</PRE>
</p>
<h2>Indexing Analysis vs. Search Analysis</h2>
<p>
Selecting the "correct" analyzer is crucial
for search quality, and can also affect indexing and search performance.
The "correct" analyzer differs between applications.
Lucene Java's wiki page
<a href="http://wiki.apache.org/lucene-java/AnalysisParalysis">AnalysisParalysis</a>
provides some data on "analyzing your analyzer".
Here are some rules of thumb:
<ol>
  <li>Test test test... (did we say test?)</li>
  <li>Beware of over-analysis – it might hurt indexing performance.</li>
  <li>Start with the same analyzer for indexing and search; otherwise searches would not find what they are supposed to...</li>
  <li>In some cases a different analyzer is required for indexing and search, for instance:
      <ul>
         <li>Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)</li>
         <li>Query expansion by synonyms, acronyms, auto spell correction, etc.</li>
      </ul>
      This might sometimes require a modified analyzer – see the next section on how to do that; a small sketch also follows this list.
  </li>
</ol>
</p>
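<p>
For illustration only, here is a rough sketch of using different analyzers at indexing and search time. The identifiers <code>dir</code> and <code>queryText</code> are placeholders, and the exact constructors to prefer depend on your Lucene version; this is not the only way to set this up:
<PRE>
    // index time: keep all tokens
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),
                                         IndexWriter.MaxFieldLength.UNLIMITED);
    // ... add documents ...

    // search time: additionally filter stop words out of the query
    QueryParser parser = new QueryParser("f", new StopAnalyzer());
    Query query = parser.parse(queryText);
</PRE>
Note that the index and the query must still agree on the tokens they have in common; the search-time analyzer may only be "stricter", as in the stop-word example above.
</p>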
<h2>Implementing your own Analyzer</h2>
<p>Creating your own Analyzer is straightforward. It usually involves either wrapping an existing Tokenizer and set of TokenFilters to create a new Analyzer
or creating both the Analyzer and a Tokenizer or TokenFilter. Before pursuing this approach, you may find it worthwhile
to explore the contrib/analyzers library and/or ask on the java-user@lucene.apache.org mailing list first to see if what you need already exists.
If you are still committed to creating your own Analyzer or TokenStream derivation (Tokenizer or TokenFilter), have a look at
the source code of any one of the many samples located in this package.
</p>
<p>
The following sections discuss some aspects of implementing your own analyzer.
</p>
<h3>Field Section Boundaries</h3>
<p>
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
is called multiple times for the same field name, we could say that each such call creates a new
section for that field in that document.
In fact, a separate call to
{@link org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) tokenStream(field,reader)}
would take place for each of these so called "sections".
However, the default Analyzer behavior is to treat all these sections as one large section.
This allows phrase search and proximity search to seamlessly cross
boundaries between these "sections".
In other words, if a certain field "f" is added like this:
<PRE>
    document.add(new Field("f", "first ends", ...));
    document.add(new Field("f", "starts two", ...));
    indexWriter.addDocument(document);
</PRE>
Then, a phrase search for "ends starts" would find that document.
Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections",
simply by overriding
{@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
<PRE>
  Analyzer myAnalyzer = new StandardAnalyzer() {
    public int getPositionIncrementGap(String fieldName) {
      return 10;
    }
  };
</PRE>
</p>
<h3>Token Position Increments</h3>
<p>
By default, all tokens created by Analyzers and Tokenizers have a
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
This means that the position stored for that token in the index would be one more than
that of the previous token.
Recall that phrase and proximity searches rely on position info.
</p>
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
that query. The phrase query "blue sky" would also find that document.
</p>
<p>
If this behavior does not fit the application needs,
a modified analyzer can be used, that would increment further the positions of
tokens following a removed stop word, using
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like:
<PRE>
  public TokenStream tokenStream(final String fieldName, Reader reader) {
    final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
    TokenStream res = new TokenStream() {
      TermAttribute termAtt = addAttribute(TermAttribute.class);
      PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

      public boolean incrementToken() throws IOException {
        int extraIncrement = 0;
        while (true) {
          boolean hasNext = ts.incrementToken();
          if (hasNext) {
            if (stopWords.contains(termAtt.term())) {
              extraIncrement++; // filter this word
              continue;
            }
            if (extraIncrement > 0) {
              posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + extraIncrement);
            }
          }
          return hasNext;
        }
      }
    };
    return res;
  }
</PRE>
Now, with this modified analyzer, the phrase query "blue sky" would find that document.
But note that this is not yet a perfect solution, because any phrase query "blue w1 w2 sky"
where both w1 and w2 are stop words would match that document.
</p>
<p>
A few more use cases for modifying position increments are:
<ol>
  <li>Inhibiting phrase and proximity matches in sentence boundaries – for this, a tokenizer that
      identifies a new sentence can add 1 to the position increment of the first token of the new sentence.</li>
  <li>Injecting synonyms – here, synonyms of a token should be added after that token,
      and their position increment should be set to 0.
      As result, all synonyms of a token would be considered to appear in exactly the
      same position as that token, and so they would be seen by phrase and proximity searches
      (see the sketch after this list).</li>
</ol>
</p>
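<p>
For the synonym injection case, here is a minimal sketch. It assumes a simple <code>java.util.Map</code> from a term to a single synonym (named <code>synonyms</code>, a placeholder); a real filter would support multiple synonyms per term. The filter buffers the state of a matching token and replays it once with a position increment of zero:
<PRE>
  public final class SingleSynonymFilter extends TokenFilter {
    private final Map synonyms; // assumed lookup table: term -> synonym
    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private State save;         // captured state of the token that still owes a synonym
    private String pendingSyn;

    public SingleSynonymFilter(TokenStream in, Map synonyms) {
      super(in);
      this.synonyms = synonyms;
    }

    public boolean incrementToken() throws IOException {
      if (save != null) {
        restoreState(save);                 // reuse offsets/type of the original token
        save = null;
        termAtt.setTermBuffer(pendingSyn);
        posIncrAtt.setPositionIncrement(0); // same position as the original token
        return true;
      }
      if (!input.incrementToken()) return false;
      pendingSyn = (String) synonyms.get(termAtt.term());
      if (pendingSyn != null) save = captureState();
      return true;
    }

    public void reset() throws IOException {
      super.reset();
      save = null;
      pendingSyn = null;
    }
  }
</PRE>
Because the synonym token restores the captured state, it keeps the original token's offsets and type; only the term text and the position increment change.
</p>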
<h2>New TokenStream API</h2>
<p>
With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
has getter and setter methods for different properties like positionIncrement and termText.
While this approach was sufficient for the default indexing format, it is not versatile enough for
Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
index formats.
</p>
<p>
A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
is necessary that can transport custom types of data from the documents to the indexer.
</p>
<h3>Attribute and AttributeSource</h3>
Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
AttributeSources.
<p>
Lucene now provides six Attributes out of the box, which replace the member variables of the Token class:
<ul>
  <li>{@link org.apache.lucene.analysis.tokenattributes.TermAttribute}<p>The term text of a token.</p></li>
  <li>{@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}<p>The start and end offset of a token in characters.</p></li>
  <li>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}<p>See above for detailed information about position increment.</p></li>
  <li>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}<p>The payload that a Token can optionally have.</p></li>
  <li>{@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}<p>The type of the token. Default is 'word'.</p></li>
  <li>{@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}<p>Optional flags a token can have.</p></li>
</ul>
</p>
<h3>Using the new TokenStream API</h3>
There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
to walk through the example below first and come back to this section afterwards.
<ol><li>
Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
with the TokenStream.
</li>
<br>
<li>
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
the Attribute instances.
</li>
<br>
<li>
For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
in incrementToken() will avoid attribute lookups for every token in the document.
</li>
<br>
<li>
All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
of an Attribute as an argument and returns an <b>instance</b>. If an Attribute of the same type was previously added, then
the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
can safely call addAttribute() with the same Attribute type multiple times. Even consumers of TokenStreams should
normally call addAttribute() instead of getAttribute(), because it would not fail if the TokenStream does not have this
Attribute (getAttribute() would throw an IllegalArgumentException, if the Attribute is missing). More advanced code
could simply check with hasAttribute() whether a TokenStream has it, and may conditionally leave out processing for
extra performance (see the sketch after this list).
</li></ol>
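To illustrate the last point, here is a sketch of consumer code (with <code>stream</code> standing for any TokenStream) that uses addAttribute() for a required attribute and hasAttribute()/getAttribute() for an optional one:
<pre>
  // required: returns the existing TermAttribute instance if one was
  // already added, otherwise creates and registers a new one
  TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

  // optional: only process payloads if the stream actually provides them
  PayloadAttribute payloadAtt = null;
  if (stream.hasAttribute(PayloadAttribute.class)) {
    payloadAtt = stream.getAttribute(PayloadAttribute.class);
  }

  stream.reset();
  while (stream.incrementToken()) {
    // termAtt and payloadAtt always reference the same instances;
    // only their contents change from token to token
    System.out.println(termAtt.term());
  }
  stream.end();
  stream.close();
</pre>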
<h3>Example</h3>
In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that
have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained
here to illustrate the usage of the new TokenStream API.<br>
Then we will develop a custom Attribute, a PartOfSpeechAttribute, add another filter to the chain that
utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
<h4>Whitespace tokenization</h4>
<pre>
public class MyAnalyzer extends Analyzer {

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new WhitespaceTokenizer(reader);
    return stream;
  }

  public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the new TokenStream API";

    MyAnalyzer analyzer = new MyAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
      System.out.println(termAtt.term());
    }

    stream.end();
    stream.close();
  }
}
</pre>
In this simple example, plain whitespace tokenization is performed. In main() a loop consumes the stream and
prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
Here is the output:
<pre>
This
is
a
demo
of
the
new
TokenStream
API
</pre>
<h4>Adding a LengthFilter</h4>
We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
to the chain. Only the tokenStream() method in our analyzer needs to be changed:
<pre>
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new WhitespaceTokenizer(reader);
    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
    return stream;
  }
</pre>
Note how now only words with 3 or more characters are contained in the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
<pre>
public final class LengthFilter extends TokenFilter {

  final int min;
  final int max;

  private TermAttribute termAtt;

  /**
   * Build a filter that removes words that are too long or too
   * short from the text.
   */
  public LengthFilter(TokenStream in, int min, int max)
  {
    super(in);
    this.min = min;
    this.max = max;
    termAtt = addAttribute(TermAttribute.class);
  }

  /**
   * Returns the next input token whose term length is within the bounds
   */
  public final boolean incrementToken() throws IOException
  {
    assert termAtt != null;
    // return the first token whose length is in range
    while (input.incrementToken()) {
      int len = termAtt.termLength();
      if (len >= min && len <= max) {
        return true;
      }
      // note: else we ignore it but should we index each part of it?
    }
    // reached EOS -- return false
    return false;
  }
}
</pre>
The TermAttribute is added in the constructor and stored in the instance variable <code>termAtt</code>.
Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
<code>addAttribute()</code> call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
are retrieved from the input stream in the <code>incrementToken()</code> method. By looking at the term text
in the TermAttribute, the length of the term can be determined and too short or too long tokens are skipped.
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.

<h4>Adding a custom Attribute</h4>
Now we're going to implement our own custom Attribute for part-of-speech tagging and, consequently, call it
<code>PartOfSpeechAttribute</code>. First we need to define the interface of the new Attribute:
<pre>
  public interface PartOfSpeechAttribute extends Attribute {
    public static enum PartOfSpeech {
      Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
    }

    public void setPartOfSpeech(PartOfSpeech pos);

    public PartOfSpeech getPartOfSpeech();
  }
</pre>

Now we also need to write the implementing class. The name of that class is important here: By default, Lucene
checks if there is a class with the name of the Attribute with the postfix 'Impl'. In this example, we would
consequently call the implementing class <code>PartOfSpeechAttributeImpl</code>. <br/>
This should be the usual behavior. However, there is also an expert-API that allows changing these naming conventions:
{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument
and returns an actual instance. You can implement your own factory if you need to change the default behavior. <br/><br/>

Now here is the actual class that implements our new Attribute. Notice that the class has to extend
{@link org.apache.lucene.util.AttributeImpl}:

<pre>
public final class PartOfSpeechAttributeImpl extends AttributeImpl
                            implements PartOfSpeechAttribute {

  private PartOfSpeech pos = PartOfSpeech.Unknown;

  public void setPartOfSpeech(PartOfSpeech pos) {
    this.pos = pos;
  }

  public PartOfSpeech getPartOfSpeech() {
    return pos;
  }

  public void clear() {
    pos = PartOfSpeech.Unknown;
  }

  public void copyTo(AttributeImpl target) {
    ((PartOfSpeechAttributeImpl) target).pos = pos;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PartOfSpeechAttributeImpl) {
      return pos == ((PartOfSpeechAttributeImpl) other).pos;
    }

    return false;
  }

  public int hashCode() {
    return pos.ordinal();
  }
}
</pre>
This simple Attribute implementation has only a single variable that stores the part-of-speech of a token. It extends the
new <code>AttributeImpl</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode()</code>.
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
<pre>
  public static class PartOfSpeechTaggingFilter extends TokenFilter {
    PartOfSpeechAttribute posAtt;
    TermAttribute termAtt;

    protected PartOfSpeechTaggingFilter(TokenStream input) {
      super(input);
      posAtt = addAttribute(PartOfSpeechAttribute.class);
      termAtt = addAttribute(TermAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {return false;}
      posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
      return true;
    }

    // determine the part of speech for the given term
    protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
      // naive implementation that tags every uppercased word as noun
      if (length > 0 && Character.isUpperCase(term[0])) {
        return PartOfSpeech.Noun;
      }
      return PartOfSpeech.Unknown;
    }
  }
</pre>
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
stores references in instance variables. Notice how you only need to pass in the interface of the new
Attribute; instantiating the correct implementing class is taken care of automatically.
Now we need to add the filter to the chain:
<pre>
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new WhitespaceTokenizer(reader);
    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
    stream = new PartOfSpeechTaggingFilter(stream);
    return stream;
  }
</pre>
Now let's look at the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
affect any existing consumers, simply because they don't know about the new Attribute. Now let's change the consumer
to make use of the new PartOfSpeechAttribute and print it out:
<pre>
  public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the new TokenStream API";

    MyAnalyzer analyzer = new MyAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

    // get the PartOfSpeechAttribute from the TokenStream
    PartOfSpeechAttribute posAtt = stream.addAttribute(PartOfSpeechAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
      System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
    }

    stream.end();
    stream.close();
  }
</pre>
The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
the while loop that consumes the stream. Here is the new output:
<pre>
This: Noun
demo: Unknown
the: Unknown
new: Unknown
TokenStream: Noun
API: Noun
</pre>
Each word is now followed by its assigned PartOfSpeech tag. Of course this is naive
part-of-speech tagging. The word 'This' should not even be tagged as a noun; it is only capitalized because it
is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new
API, the reader could now write an Attribute and TokenFilter that can specify for each word whether it was the first token
of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
as nouns if they are not the first word of a sentence (we know this is still not correct behavior, but hey, it's a good exercise).
As a small hint, this is how the new Attribute class could begin:
<pre>
  public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
                   implements FirstTokenOfSentenceAttribute {

    private boolean firstToken;

    public void setFirstToken(boolean firstToken) {
      this.firstToken = firstToken;
    }

    public boolean getFirstToken() {
      return firstToken;
    }

    public void clear() {
      firstToken = false;
    }

  ...
</pre>
</body>
</html>
@@ -0,0 +1,25 @@
/*
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements.  See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License.  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/


WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4!
This grammar currently uses constructs (e.g. :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For backwards compatibility it is currently necessary to support
only Java 1.4 - this will change in Lucene 3.1.
@ -0,0 +1,161 @@
|
|||
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
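 *
 * <p>A minimal usage sketch (the field name and sample text are illustrative,
 * and the token loop assumes the attribute-based API shown elsewhere in this
 * patch):
 * <pre>
 *   Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
 *   TokenStream ts = analyzer.tokenStream("content", new StringReader("AT&amp;T spokesman"));
 *   TermAttribute term = ts.addAttribute(TermAttribute.class);
 *   while (ts.incrementToken()) {
 *     System.out.println(term.term());  // prints "at&amp;t", then "spokesman"
 *   }
 * </pre>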
 */
public class StandardAnalyzer extends Analyzer {
  private Set<?> stopSet;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>.
   */
  private final boolean replaceInvalidAcronym, enableStopPositionIncrements;

  /** An unmodifiable set containing some common English words that are usually not
  useful for searching. */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  private final Version matchVersion;

  /** Builds an analyzer with the default stop words ({@link
   * #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match - see <a href="#version">above</a>
   */
  public StandardAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match - see <a href="#version">above</a>
   * @param stopWords stop words */
  public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
    stopSet = stopWords;
    setOverridesTokenStreamMethod(StandardAnalyzer.class);
    enableStopPositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
    this.matchVersion = matchVersion;
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(File)
   * @param matchVersion Lucene version to match - see <a href="#version">above</a>
   * @param stopwords File to read stop words from */
  public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader)
   * @param matchVersion Lucene version to match - see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from */
  public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /** Constructs a {@link StandardTokenizer} filtered by a {@link
  StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
    tokenStream.setMaxTokenLength(maxTokenLength);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(enableStopPositionIncrements, result, stopSet);
    return result;
  }

  private static final class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream or
   * reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    if (overridesTokenStreamMethod) {
      // LUCENE-1678: force fallback to tokenStream() if we
      // have been subclassed and that subclass overrides
      // tokenStream but not reusableTokenStream
      return tokenStream(fieldName, reader);
    }
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      setPreviousTokenStream(streams);
      streams.tokenStream = new StandardTokenizer(matchVersion, reader);
      streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
      streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
      streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
                                                   streams.filteredTokenStream, stopSet);
    } else {
      streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);

    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

    return streams.filteredTokenStream;
  }
}
@@ -0,0 +1,76 @@
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/** Normalizes tokens extracted with {@link StandardTokenizer}. */
public final class StandardFilter extends TokenFilter {

  /** Construct filtering <i>in</i>. */
  public StandardFilter(TokenStream in) {
    super(in);
    termAtt = addAttribute(TermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
  }

  private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
  private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];

  // this filter uses the type and term attributes
  private TypeAttribute typeAtt;
  private TermAttribute termAtt;

  /** Advances to the next token in the stream, returning false at end-of-stream.
   * <p>Removes <tt>'s</tt> from the end of words.
   * <p>Removes dots from acronyms.
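   * <p>For example (illustrative): "O'Reilly's" becomes "O'Reilly" and
   * "I.B.M." becomes "IBM"; tokens of other types pass through unchanged.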
   */
  @Override
  public final boolean incrementToken() throws java.io.IOException {
    if (!input.incrementToken()) {
      return false;
    }

    char[] buffer = termAtt.termBuffer();
    final int bufferLength = termAtt.termLength();
    final String type = typeAtt.type();

    if (type == APOSTROPHE_TYPE &&      // remove 's
        bufferLength >= 2 &&
        buffer[bufferLength-2] == '\'' &&
        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
      // Strip last 2 characters off
      termAtt.setTermLength(bufferLength - 2);
    } else if (type == ACRONYM_TYPE) {  // remove dots
      int upto = 0;
      for (int i = 0; i < bufferLength; i++) {
        char c = buffer[i];
        if (c != '.')
          buffer[upto++] = c;
      }
      termAtt.setTermLength(upto);
    }

    return true;
  }
}
@@ -0,0 +1,244 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.standard;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/** A grammar-based tokenizer constructed with JFlex.
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *     dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *     the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
 * <p>Many applications have specific tokenizer needs. If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardTokenizer:
 * <ul>
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
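 *
 * <p>For example (illustrative, not normative): the input
 * "Visit lucene.apache.org or mail dev&#64;lucene.apache.org" is tokenized as
 * "Visit" (&lt;ALPHANUM&gt;), "lucene.apache.org" (&lt;HOST&gt;), "or" (&lt;ALPHANUM&gt;),
 * "mail" (&lt;ALPHANUM&gt;) and "dev&#64;lucene.apache.org" (&lt;EMAIL&gt;).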
 */

public final class StandardTokenizer extends Tokenizer {
  /** A private instance of the JFlex-constructed scanner */
  private final StandardTokenizerImpl scanner;

  public static final int ALPHANUM   = 0;
  public static final int APOSTROPHE = 1;
  public static final int ACRONYM    = 2;
  public static final int COMPANY    = 3;
  public static final int EMAIL      = 4;
  public static final int HOST       = 5;
  public static final int NUM        = 6;
  public static final int CJ         = 7;

  /**
   * @deprecated this solves a bug where HOSTs that end with '.' are identified
   * as ACRONYMs.
   */
  public static final int ACRONYM_DEP = 8;

  /** String token types that correspond to token type int constants */
  public static final String [] TOKEN_TYPES = new String [] {
    "<ALPHANUM>",
    "<APOSTROPHE>",
    "<ACRONYM>",
    "<COMPANY>",
    "<EMAIL>",
    "<HOST>",
    "<NUM>",
    "<CJ>",
    "<ACRONYM_DEP>"
  };

  private boolean replaceInvalidAcronym;

  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

  /** Set the max allowed token length. Any token longer
   * than this is skipped. */
  public void setMaxTokenLength(int length) {
    this.maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  /**
   * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
   * the <code>input</code> to the newly created JFlex scanner.
   *
   * @param matchVersion Lucene version to match - see <a href="#version">above</a>
   * @param input The input reader
   *
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
  public StandardTokenizer(Version matchVersion, Reader input) {
    super();
    this.scanner = new StandardTokenizerImpl(input);
    init(input, matchVersion);
  }

  /**
   * Creates a new StandardTokenizer with a given {@link AttributeSource}.
   */
  public StandardTokenizer(Version matchVersion, AttributeSource source, Reader input) {
    super(source);
    this.scanner = new StandardTokenizerImpl(input);
    init(input, matchVersion);
  }

  /**
   * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
   */
  public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(factory);
    this.scanner = new StandardTokenizerImpl(input);
    init(input, matchVersion);
  }

  private void init(Reader input, Version matchVersion) {
    if (matchVersion.onOrAfter(Version.LUCENE_24)) {
      replaceInvalidAcronym = true;
    } else {
      replaceInvalidAcronym = false;
    }
    this.input = input;
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
  }

  // this tokenizer generates four attributes:
  // term, offset, positionIncrement and type
  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private PositionIncrementAttribute posIncrAtt;
  private TypeAttribute typeAtt;

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerImpl.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(posIncr);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
          if (replaceInvalidAcronym) {
            typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
            termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
          } else {
            typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
          }
        } else {
          typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        posIncr++;
    }
  }

  @Override
  public final void end() {
    // set final offset
    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#reset()
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    scanner.yyreset(input);
  }

  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    reset();
  }

  /**
   * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
   * when they should have been labeled as hosts instead.
   * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
   *
   * @deprecated Remove in 3.X and make true the only valid value
   */
  public boolean isReplaceInvalidAcronym() {
    return replaceInvalidAcronym;
  }

  /**
   * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
   * @deprecated Remove in 3.X and make true the only valid value
   *
   * See https://issues.apache.org/jira/browse/LUCENE-1068
   */
  public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
    this.replaceInvalidAcronym = replaceInvalidAcronym;
  }
}
@@ -0,0 +1,723 @@
/* The following code was generated by JFlex 1.4.1 on 9/4/08 6:49 PM */

package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4 !!!
This grammar currently uses constructs (e.g. :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For backwards compatibility it is currently necessary to support
only Java 1.4 - this will change in Lucene 3.1.

*/

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;


/**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
 * on 9/4/08 6:49 PM from the specification file
 * <tt>/tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
 */
class StandardTokenizerImpl {

  /** This character denotes the end of file */
  public static final int YYEOF = -1;

  /** initial size of the lookahead buffer */
  private static final int ZZ_BUFFERSIZE = 16384;

  /** lexical states */
  public static final int YYINITIAL = 0;

  /**
   * Translates characters to character classes
   */
  private static final String ZZ_CMAP_PACKED =
    "\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
    "\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
    "\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
    "\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12\34\0\136\12"+
    "\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12\11\0\1\12"+
    "\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12\1\0\24\12"+
    "\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12\12\0\71\12"+
    "\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12\67\0\46\12"+
    "\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12\56\0\32\12"+
    "\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12\17\0\2\12"+
    "\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0\46\12\u015f\0"+
    "\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2\25\0"+
    "\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12\3\0"+
    "\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12\23\0\6\12"+
    "\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12\1\0\2\12"+
    "\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2\2\0\3\12"+
    "\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12\1\0\7\12"+
    "\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0\1\12"+
    "\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12"+
    "\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12"+
    "\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12\3\0\2\12"+
    "\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12\3\0\10\12"+
    "\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12\1\0\27\12"+
    "\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2\25\0\10\12"+
    "\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\44\0\1\12"+
    "\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12"+
    "\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12\3\0\30\12"+
    "\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1\60\12\1\1"+
    "\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0\1\12\2\0"+
    "\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0\7\12\1\0"+
    "\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0\4\12\1\0"+
    "\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0\12\2\2\0"+
    "\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0\42\12\35\0"+
    "\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0\12\2\6\0"+
    "\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0\104\12\5\0"+
    "\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0\4\12\2\0"+
    "\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0\1\12\1\0"+
    "\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
    "\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0\27\12\1\0"+
    "\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\47\12\1\0"+
    "\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0"+
    "\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0\12\2\6\0"+
    "\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0\26\12\2\0"+
    "\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0\1\12\1\0"+
    "\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0\7\12\1\0"+
    "\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0\6\12\4\0"+
    "\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0\1\12\4\0"+
    "\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0\1\12\1\0"+
    "\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0\7\12\u0ecb\0"+
    "\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13\2\13\132\13"+
    "\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
    "\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
    "\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12\5\0\1\12"+
    "\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12\1\0\2\12"+
    "\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12\2\0\66\12"+
    "\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12\23\0\12\2"+
    "\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
    "\2\0\6\12\2\0\6\12\2\0\3\12\43\0";

  /**
   * Translates characters to character classes
   */
  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);

  /**
   * Translates DFA states to action switch labels.
   */
  private static final int [] ZZ_ACTION = zzUnpackAction();

  private static final String ZZ_ACTION_PACKED_0 =
    "\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
    "\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
    "\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
    "\1\4";

  private static int [] zzUnpackAction() {
    int [] result = new int[51];
    int offset = 0;
    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackAction(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }


  /**
   * Translates a state to a row index in the transition table
   */
  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();

  private static final String ZZ_ROWMAP_PACKED_0 =
    "\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
    "\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
    "\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
    "\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
    "\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
    "\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
    "\0\u0268\0\u0276\0\u0284";

  private static int [] zzUnpackRowMap() {
    int [] result = new int[51];
    int offset = 0;
    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int high = packed.charAt(i++) << 16;
      result[j++] = high | packed.charAt(i++);
    }
    return j;
  }

  /**
   * The transition table of the DFA
   */
  private static final int [] ZZ_TRANS = zzUnpackTrans();

  private static final String ZZ_TRANS_PACKED_0 =
    "\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
    "\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
    "\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
    "\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
    "\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
    "\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
    "\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
    "\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
    "\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
    "\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
    "\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
    "\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
    "\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
    "\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
    "\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
    "\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
    "\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
    "\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
    "\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
    "\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
    "\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
    "\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
    "\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
    "\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
    "\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
    "\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
    "\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
    "\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
    "\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
    "\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
    "\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
    "\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
    "\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
    "\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
    "\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
    "\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
    "\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
    "\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
    "\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
    "\1\11\2\52\1\0\1\24\3\0";

  private static int [] zzUnpackTrans() {
    int [] result = new int[658];
    int offset = 0;
    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackTrans(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      value--;
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }


  /* error codes */
  private static final int ZZ_UNKNOWN_ERROR = 0;
  private static final int ZZ_NO_MATCH = 1;
  private static final int ZZ_PUSHBACK_2BIG = 2;

  /* error messages for the codes above */
  private static final String ZZ_ERROR_MSG[] = {
    "Unknown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
   */
  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();

  private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
    "\1\1\1\0\17\1\1\0\1\1\3\0\5\1";

  private static int [] zzUnpackAttribute() {
    int [] result = new int[51];
    int offset = 0;
    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }

  /** the input device */
  private java.io.Reader zzReader;

  /** the current state of the DFA */
  private int zzState;

  /** the current lexical state */
  private int zzLexicalState = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];

  /** the text position at the last accepting state */
  private int zzMarkedPos;

  /** the text position at the last state to be included in yytext */
  private int zzPushbackPos;

  /** the current text position in the buffer */
  private int zzCurrentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int zzStartRead;

  /** endRead marks the last character in the buffer that has been read
      from input */
  private int zzEndRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the
   * matched text
   */
  private int yycolumn;

  /**
   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean zzAtBOL = true;

  /** zzAtEOF == true <=> the scanner is at the EOF */
  private boolean zzAtEOF;

  /* user code: */

  public static final int ALPHANUM   = StandardTokenizer.ALPHANUM;
  public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
  public static final int ACRONYM    = StandardTokenizer.ACRONYM;
  public static final int COMPANY    = StandardTokenizer.COMPANY;
  public static final int EMAIL      = StandardTokenizer.EMAIL;
  public static final int HOST       = StandardTokenizer.HOST;
  public static final int NUM        = StandardTokenizer.NUM;
  public static final int CJ         = StandardTokenizer.CJ;
  /**
   * @deprecated this solves a bug where HOSTs that end with '.' are identified
   * as ACRONYMs.
   */
  public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;

  public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;

  public final int yychar()
  {
    return yychar;
  }

  /**
   * Fills Lucene token with the current token text.
   */
  final void getText(Token t) {
    t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  }

  /**
   * Fills TermAttribute with the current token text.
   */
  final void getText(TermAttribute t) {
    t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  }


  /**
   * Creates a new scanner.
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param in  the java.io.Reader to read input from.
   */
  StandardTokenizerImpl(java.io.Reader in) {
    this.zzReader = in;
  }

  /**
   * Creates a new scanner.
   * There is also a java.io.Reader version of this constructor.
   *
   * @param in  the java.io.InputStream to read input from.
   */
  StandardTokenizerImpl(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /**
   * Unpacks the compressed character translation table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table
   */
  private static char [] zzUnpackCMap(String packed) {
    char [] map = new char[0x10000];
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    while (i < 1154) {
      int  count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * @return      <code>false</code>, iff there was new input.
   *
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean zzRefill() throws java.io.IOException {

    /* first: make room (if you can) */
    if (zzStartRead > 0) {
      System.arraycopy(zzBuffer, zzStartRead,
                       zzBuffer, 0,
                       zzEndRead-zzStartRead);

      /* translate stored positions */
      zzEndRead-= zzStartRead;
      zzCurrentPos-= zzStartRead;
      zzMarkedPos-= zzStartRead;
      zzPushbackPos-= zzStartRead;
      zzStartRead = 0;
    }

    /* is the buffer big enough? */
    if (zzCurrentPos >= zzBuffer.length) {
      /* if not: blow it up */
      char newBuffer[] = new char[zzCurrentPos*2];
      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
      zzBuffer = newBuffer;
    }

    /* finally: fill the buffer with new input */
    int numRead = zzReader.read(zzBuffer, zzEndRead,
                                zzBuffer.length-zzEndRead);

    if (numRead < 0) {
      return true;
    }
    else {
      zzEndRead+= numRead;
      return false;
    }
  }


  /**
   * Closes the input stream.
   */
  public final void yyclose() throws java.io.IOException {
    zzAtEOF = true;            /* indicate end of file */
    zzEndRead = zzStartRead;   /* invalidate buffer    */

    if (zzReader != null)
      zzReader.close();
  }


  /**
   * Resets the scanner to read from a new input stream.
   * Does not close the old reader.
   *
   * All internal variables are reset, the old input stream
   * <b>cannot</b> be reused (internal buffer is discarded and lost).
   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
   *
   * @param reader   the new input stream
   */
  public final void yyreset(java.io.Reader reader) {
    zzReader = reader;
    zzAtBOL  = true;
    zzAtEOF  = false;
    zzEndRead = zzStartRead = 0;
    zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
    yyline = yychar = yycolumn = 0;
    zzLexicalState = YYINITIAL;
  }


  /**
   * Returns the current lexical state.
   */
  public final int yystate() {
    return zzLexicalState;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  public final void yybegin(int newState) {
    zzLexicalState = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  public final String yytext() {
    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
  }


  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  public final char yycharat(int pos) {
    return zzBuffer[zzStartRead+pos];
  }


  /**
   * Returns the length of the matched text region.
   */
  public final int yylength() {
    return zzMarkedPos-zzStartRead;
  }


  /**
   * Reports an error that occurred while scanning.
   *
   * In a well-formed scanner (no or only correct usage of
   * yypushback(int) and a match-all fallback rule) this method
   * will only be called with things that "Can't Possibly Happen".
   * If this method is called, something is seriously wrong
   * (e.g. a JFlex bug producing a faulty scanner etc.).
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * @param errorCode  the code of the error message to display
   */
  private void zzScanError(int errorCode) {
    String message;
    try {
      message = ZZ_ERROR_MSG[errorCode];
    }
    catch (ArrayIndexOutOfBoundsException e) {
      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
    }

    throw new Error(message);
  }


  /**
   * Pushes the specified number of characters back into the input stream.
   *
   * They will be read again by the next call of the scanning method.
   *
   * @param number  the number of characters to be read again.
   *                This number must not be greater than yylength()!
   */
  public void yypushback(int number)  {
    if ( number > yylength() )
      zzScanError(ZZ_PUSHBACK_2BIG);

    zzMarkedPos -= number;
  }


  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   *
   * @return      the next token
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  public int getNextToken() throws java.io.IOException {
    int zzInput;
    int zzAction;

    // cached fields:
    int zzCurrentPosL;
    int zzMarkedPosL;
    int zzEndReadL = zzEndRead;
    char [] zzBufferL = zzBuffer;
    char [] zzCMapL = ZZ_CMAP;

    int [] zzTransL = ZZ_TRANS;
    int [] zzRowMapL = ZZ_ROWMAP;
    int [] zzAttrL = ZZ_ATTRIBUTE;

    while (true) {
      zzMarkedPosL = zzMarkedPos;

      yychar+= zzMarkedPosL-zzStartRead;

      zzAction = -1;

      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;

      zzState = zzLexicalState;


      zzForAction: {
        while (true) {

          if (zzCurrentPosL < zzEndReadL)
            zzInput = zzBufferL[zzCurrentPosL++];
          else if (zzAtEOF) {
            zzInput = YYEOF;
            break zzForAction;
          }
          else {
            // store back cached positions
            zzCurrentPos  = zzCurrentPosL;
            zzMarkedPos   = zzMarkedPosL;
            boolean eof = zzRefill();
            // get translated positions and possibly new buffer
            zzCurrentPosL  = zzCurrentPos;
            zzMarkedPosL   = zzMarkedPos;
            zzBufferL      = zzBuffer;
            zzEndReadL     = zzEndRead;
            if (eof) {
              zzInput = YYEOF;
              break zzForAction;
            }
            else {
              zzInput = zzBufferL[zzCurrentPosL++];
            }
          }
          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;

          int zzAttributes = zzAttrL[zzState];
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
            if ( (zzAttributes & 8) == 8 ) break zzForAction;
          }

        }
      }

      // store back cached position
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 4:
          { return HOST;
          }
        case 11: break;
        case 9:
          { return ACRONYM;
          }
        case 12: break;
        case 8:
          { return ACRONYM_DEP;
          }
        case 13: break;
        case 1:
          { /* ignore */
          }
        case 14: break;
        case 5:
          { return NUM;
          }
        case 15: break;
        case 3:
          { return CJ;
          }
        case 16: break;
        case 2:
          { return ALPHANUM;
          }
        case 17: break;
        case 7:
          { return COMPANY;
          }
        case 18: break;
        case 6:
          { return APOSTROPHE;
          }
        case 19: break;
        case 10:
          { return EMAIL;
          }
        case 20: break;
        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
            zzAtEOF = true;
            return YYEOF;
          }
          else {
            zzScanError(ZZ_NO_MATCH);
          }
      }
    }
  }

}
@@ -0,0 +1,145 @@
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4 !!!
This grammar currently uses constructs (e.g. :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For backwards compatibility it is currently necessary to support
only Java 1.4 - this will change in Lucene 3.1.

*/

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

%%

%class StandardTokenizerImpl
%unicode
%integer
%function getNextToken
%pack
%char

%{

public static final int ALPHANUM   = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM    = StandardTokenizer.ACRONYM;
public static final int COMPANY    = StandardTokenizer.COMPANY;
public static final int EMAIL      = StandardTokenizer.EMAIL;
public static final int HOST       = StandardTokenizer.HOST;
public static final int NUM        = StandardTokenizer.NUM;
public static final int CJ         = StandardTokenizer.CJ;
/**
 * @deprecated this solves a bug where HOSTs that end with '.' are identified
 * as ACRONYMs.
 */
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;

public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;

public final int yychar()
{
  return yychar;
}

/**
 * Fills Lucene token with the current token text.
 */
final void getText(Token t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}

/**
 * Fills TermAttribute with the current token text.
 */
final void getText(TermAttribute t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}

%}

THAI       = [\u0E00-\u0E59]

// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+

// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM    = {LETTER} "." ({LETTER} ".")+

ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
COMPANY    = {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL      = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname
HOST       = {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
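           // e.g. (illustrative): "2-by-4" and "mark-1.5" match NUM, while
           // "R2D2" (no punctuation) is a plain ALPHANUM token and "1.2.3"
           // is claimed by the earlier HOST rule instead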

// punctuation
P          = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

ALPHA      = ({LETTER})+

// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
LETTER     = !(![:letter:]|{CJ})

// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

WHITESPACE = \r\n | [ \r\n\t\f]

%%

{ALPHANUM}    { return ALPHANUM; }
{APOSTROPHE}  { return APOSTROPHE; }
{ACRONYM}     { return ACRONYM; }
{COMPANY}     { return COMPANY; }
{EMAIL}       { return EMAIL; }
{HOST}        { return HOST; }
{NUM}         { return NUM; }
{CJ}          { return CJ; }
{ACRONYM_DEP} { return ACRONYM_DEP; }

/** Ignore the rest */
. | {WHITESPACE} { /* ignore */ }
@@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
A fast grammar-based tokenizer constructed with JFlex.
</body>
</html>
@@ -0,0 +1,44 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;

/**
 * This attribute can be used to pass different flags down the {@link Tokenizer} chain,
 * e.g. from one TokenFilter to another.
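 *
 * <p>A minimal sketch of the intended use from a custom TokenFilter (the flag
 * constant is hypothetical, not part of Lucene):
 * <pre>
 *   static final int SENTENCE_START_FLAG = 1;
 *   FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
 *   flagsAtt.setFlags(flagsAtt.getFlags() | SENTENCE_START_FLAG);         // producer
 *   boolean first = (flagsAtt.getFlags() &amp; SENTENCE_START_FLAG) != 0; // consumer
 * </pre>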
 */
public interface FlagsAttribute extends Attribute {
  /**
   * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
   * <p/>
   * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   * @return The bits
   */
  public int getFlags();

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags);
}
@@ -0,0 +1,80 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.AttributeImpl;

/**
 * This attribute can be used to pass different flags down the tokenizer chain,
 * e.g. from one TokenFilter to another.
 */
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
  private int flags = 0;

  /**
   * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
   * <p/>
   * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   * @return The bits
   */
  public int getFlags() {
    return flags;
  }

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  @Override
  public void clear() {
    flags = 0;
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }

    if (other instanceof FlagsAttributeImpl) {
      return ((FlagsAttributeImpl) other).flags == flags;
    }

    return false;
  }

  @Override
  public int hashCode() {
    return flags;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    FlagsAttribute t = (FlagsAttribute) target;
    t.setFlags(flags);
  }
}
@@ -0,0 +1,44 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Attribute;

/**
 * The start and end character offset of a Token.
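 *
 * <p>For example (illustrative): in the source text "foo bar" the token "bar"
 * has startOffset() == 4 and endOffset() == 7; endOffset - startOffset is the
 * token's length in the source text.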
 */
public interface OffsetAttribute extends Attribute {
  /** Returns this Token's starting offset, the position of the first character
    corresponding to this token in the source text.

    Note that the difference between endOffset() and startOffset() may not be
    equal to termText.length(), as the term text may have been altered by a
    stemmer or some other filter. */
  public int startOffset();


  /** Set the starting and ending offset.
    @see #startOffset()
    @see #endOffset() */
  public void setOffset(int startOffset, int endOffset);


  /** Returns this Token's ending offset, one greater than the position of the
    last character corresponding to this token in the source text. The length
    of the token in the source text is (endOffset - startOffset). */
  public int endOffset();
}
@@ -0,0 +1,90 @@
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/**
|
||||
* The start and end character offset of a Token.
|
||||
*/
|
||||
public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable {
|
||||
private int startOffset;
|
||||
private int endOffset;
|
||||
|
||||
/** Returns this Token's starting offset, the position of the first character
|
||||
corresponding to this token in the source text.
|
||||
|
||||
Note that the difference between endOffset() and startOffset() may not be
|
||||
equal to termText.length(), as the term text may have been altered by a
|
||||
stemmer or some other filter. */
|
||||
public int startOffset() {
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
|
||||
/** Set the starting and ending offset.
|
||||
@see #startOffset() and #endOffset()*/
|
||||
public void setOffset(int startOffset, int endOffset) {
|
||||
this.startOffset = startOffset;
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
|
||||
|
||||
/** Returns this Token's ending offset, one greater than the position of the
|
||||
last character corresponding to this token in the source text. The length
|
||||
of the token in the source text is (endOffset - startOffset). */
|
||||
public int endOffset() {
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
startOffset = 0;
|
||||
endOffset = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof OffsetAttributeImpl) {
|
||||
OffsetAttributeImpl o = (OffsetAttributeImpl) other;
|
||||
return o.startOffset == startOffset && o.endOffset == endOffset;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int code = startOffset;
|
||||
code = code * 31 + endOffset;
|
||||
return code;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
OffsetAttribute t = (OffsetAttribute) target;
|
||||
t.setOffset(startOffset, endOffset);
|
||||
}
|
||||
}
|
|
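For orientation, here is a minimal consumption sketch (not part of this commit) showing how offsets are typically read through the attribute API. It assumes the 3.x-era no-argument WhitespaceAnalyzer constructor; the demo class name is hypothetical.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetDemo {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer().tokenStream("body", new StringReader("foo bar"));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // half-open character range [startOffset, endOffset) into the source text
      System.out.println("[" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    ts.close();
  }
}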
@@ -0,0 +1,36 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;

/**
 * The payload of a Token. See also {@link Payload}.
 */
public interface PayloadAttribute extends Attribute {
  /**
   * Returns this Token's payload.
   */
  public Payload getPayload();

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload);
}
@@ -0,0 +1,101 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.index.Payload;
import org.apache.lucene.util.AttributeImpl;

/**
 * The payload of a Token. See also {@link Payload}.
 */
public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable {
  private Payload payload;

  /**
   * Initialize this attribute with no payload.
   */
  public PayloadAttributeImpl() {}

  /**
   * Initialize this attribute with the given payload.
   */
  public PayloadAttributeImpl(Payload payload) {
    this.payload = payload;
  }

  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  @Override
  public void clear() {
    payload = null;
  }

  @Override
  public Object clone() {
    PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone();
    if (payload != null) {
      clone.payload = (Payload) payload.clone();
    }
    return clone;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    // test against the implementation class, since the body casts to it
    if (other instanceof PayloadAttributeImpl) {
      PayloadAttributeImpl o = (PayloadAttributeImpl) other;
      if (o.payload == null || payload == null) {
        return o.payload == null && payload == null;
      }

      return o.payload.equals(payload);
    }

    return false;
  }

  @Override
  public int hashCode() {
    return (payload == null) ? 0 : payload.hashCode();
  }

  @Override
  public void copyTo(AttributeImpl target) {
    PayloadAttribute t = (PayloadAttribute) target;
    t.setPayload((payload == null) ? null : (Payload) payload.clone());
  }

}
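As a quick illustration of this interface (not part of this commit), a hypothetical TokenFilter can attach a payload to every token it passes through; only the core Payload and TokenFilter classes are assumed.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;

/** Hypothetical filter that tags every token with a one-byte payload. */
public final class MarkerPayloadFilter extends TokenFilter {
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  public MarkerPayloadFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // attach a marker byte; a real filter would encode something useful here
    payloadAtt.setPayload(new Payload(new byte[] { 0x1 }));
    return true;
  }
}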
@@ -0,0 +1,59 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Attribute;

/** The positionIncrement determines the position of this token
 * relative to the previous Token in a TokenStream, used in phrase
 * searching.
 *
 * <p>The default value is one.
 *
 * <p>Some common uses for this are:<ul>
 *
 * <li>Set it to zero to put multiple terms in the same position. This is
 * useful if, e.g., a word has multiple stems. Searches for phrases
 * including either stem will match. In this case, all but the first stem's
 * increment should be set to zero: the increment of the first instance
 * should be one. Repeating a token with an increment of zero can also be
 * used to boost the scores of matches on that token.
 *
 * <li>Set it to values greater than one to inhibit exact phrase matches.
 * If, for example, one does not want phrases to match across removed stop
 * words, then one could build a stop word filter that removes stop words and
 * also sets the increment to the number of stop words removed before each
 * non-stop word. Then exact phrase queries will only match when the terms
 * occur with no intervening stop words.
 *
 * </ul>
 *
 * @see org.apache.lucene.index.TermPositions
 */
public interface PositionIncrementAttribute extends Attribute {
  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   */
  public void setPositionIncrement(int positionIncrement);

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement();
}
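The increment semantics described in the javadoc above are easiest to see in a filter that removes tokens. The sketch below is illustrative only, not part of this commit; the class name and its stop-word set are hypothetical, but it follows the stop-word recipe from the javadoc: accumulate the increments of removed tokens and add them to the next surviving token.

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/** Hypothetical stop filter that leaves "holes" at removed positions. */
public final class HoleAwareStopFilter extends TokenFilter {
  private final Set<String> stopWords;
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  public HoleAwareStopFilter(TokenStream input, Set<String> stopWords) {
    super(input);
    this.stopWords = stopWords;
  }

  @Override
  public boolean incrementToken() throws IOException {
    int skipped = 0;
    while (input.incrementToken()) {
      if (!stopWords.contains(termAtt.term())) {
        // preserve the gap left by the removed stop words, so exact
        // phrase queries do not match across them
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skipped);
        return true;
      }
      skipped += posIncrAtt.getPositionIncrement();
    }
    return false;
  }
}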
@@ -0,0 +1,99 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;

/** The positionIncrement determines the position of this token
 * relative to the previous Token in a {@link TokenStream}, used in phrase
 * searching.
 *
 * <p>The default value is one.
 *
 * <p>Some common uses for this are:<ul>
 *
 * <li>Set it to zero to put multiple terms in the same position. This is
 * useful if, e.g., a word has multiple stems. Searches for phrases
 * including either stem will match. In this case, all but the first stem's
 * increment should be set to zero: the increment of the first instance
 * should be one. Repeating a token with an increment of zero can also be
 * used to boost the scores of matches on that token.
 *
 * <li>Set it to values greater than one to inhibit exact phrase matches.
 * If, for example, one does not want phrases to match across removed stop
 * words, then one could build a stop word filter that removes stop words and
 * also sets the increment to the number of stop words removed before each
 * non-stop word. Then exact phrase queries will only match when the terms
 * occur with no intervening stop words.
 *
 * </ul>
 */
public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable {
  private int positionIncrement = 1;

  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)
      throw new IllegalArgumentException(
          "Increment must be zero or greater: " + positionIncrement);
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  @Override
  public void clear() {
    this.positionIncrement = 1;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PositionIncrementAttributeImpl) {
      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
    }

    return false;
  }

  @Override
  public int hashCode() {
    return positionIncrement;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    PositionIncrementAttribute t = (PositionIncrementAttribute) target;
    t.setPositionIncrement(positionIncrement);
  }

}
@@ -0,0 +1,91 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Attribute;

/**
 * The term text of a Token.
 */
public interface TermAttribute extends Attribute {
  /** Returns the Token's term text.
   *
   * This method has a performance penalty
   * because the text is stored internally in a char[]. If
   * possible, use {@link #termBuffer()} and {@link
   * #termLength()} directly instead. If you really need a
   * String, use this method, which is nothing more than
   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
   */
  public String term();

  /** Copies the contents of buffer, starting at offset for
   * length characters, into the termBuffer array.
   * @param buffer the buffer to copy
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(char[] buffer, int offset, int length);

  /** Copies the contents of buffer into the termBuffer array.
   * @param buffer the buffer to copy
   */
  public void setTermBuffer(String buffer);

  /** Copies the contents of buffer, starting at offset and continuing
   * for length characters, into the termBuffer array.
   * @param buffer the buffer to copy
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(String buffer, int offset, int length);

  /** Returns the internal termBuffer character array which
   * you can then directly alter. If the array is too
   * small for your token, use {@link
   * #resizeTermBuffer(int)} to increase it. After
   * altering the buffer be sure to call {@link
   * #setTermLength} to record the number of valid
   * characters that were placed into the termBuffer. */
  public char[] termBuffer();

  /** Grows the termBuffer to at least size newSize, preserving the
   * existing content. Note: If the next operation is to change
   * the contents of the term buffer use
   * {@link #setTermBuffer(char[], int, int)},
   * {@link #setTermBuffer(String)}, or
   * {@link #setTermBuffer(String, int, int)}
   * to optimally combine the resize with the setting of the termBuffer.
   * @param newSize minimum size of the new termBuffer
   * @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize);

  /** Return number of valid characters (length of the term)
   * in the termBuffer array. */
  public int termLength();

  /** Set number of valid characters (length of the term) in
   * the termBuffer array. Use this to truncate the termBuffer
   * or to synchronize with external manipulation of the termBuffer.
   * Note: to grow the size of the array,
   * use {@link #resizeTermBuffer(int)} first.
   * @param length the truncated length
   */
  public void setTermLength(int length);
}
@@ -0,0 +1,226 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;

/**
 * The term text of a Token.
 */
public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable {
  private static final int MIN_BUFFER_SIZE = 10;

  private char[] termBuffer;
  private int termLength;

  /** Returns the Token's term text.
   *
   * This method has a performance penalty
   * because the text is stored internally in a char[]. If
   * possible, use {@link #termBuffer()} and {@link
   * #termLength()} directly instead. If you really need a
   * String, use this method, which is nothing more than
   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
   */
  public String term() {
    initTermBuffer();
    return new String(termBuffer, 0, termLength);
  }

  /** Copies the contents of buffer, starting at offset for
   * length characters, into the termBuffer array.
   * @param buffer the buffer to copy
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(char[] buffer, int offset, int length) {
    growTermBuffer(length);
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  /** Copies the contents of buffer into the termBuffer array.
   * @param buffer the buffer to copy
   */
  public void setTermBuffer(String buffer) {
    int length = buffer.length();
    growTermBuffer(length);
    buffer.getChars(0, length, termBuffer, 0);
    termLength = length;
  }

  /** Copies the contents of buffer, starting at offset and continuing
   * for length characters, into the termBuffer array.
   * @param buffer the buffer to copy
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(String buffer, int offset, int length) {
    assert offset <= buffer.length();
    assert offset + length <= buffer.length();
    growTermBuffer(length);
    buffer.getChars(offset, offset + length, termBuffer, 0);
    termLength = length;
  }

  /** Returns the internal termBuffer character array which
   * you can then directly alter. If the array is too
   * small for your token, use {@link
   * #resizeTermBuffer(int)} to increase it. After
   * altering the buffer be sure to call {@link
   * #setTermLength} to record the number of valid
   * characters that were placed into the termBuffer. */
  public char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }

  /** Grows the termBuffer to at least size newSize, preserving the
   * existing content. Note: If the next operation is to change
   * the contents of the term buffer use
   * {@link #setTermBuffer(char[], int, int)},
   * {@link #setTermBuffer(String)}, or
   * {@link #setTermBuffer(String, int, int)}
   * to optimally combine the resize with the setting of the termBuffer.
   * @param newSize minimum size of the new termBuffer
   * @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    if (termBuffer == null) {
      // The buffer is always at least MIN_BUFFER_SIZE
      termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
    } else {
      if (termBuffer.length < newSize) {
        // Not big enough; create a new array with slight
        // over allocation and preserve content
        final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
        System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
        termBuffer = newCharBuffer;
      }
    }
    return termBuffer;
  }


  /** Allocates a buffer char[] of at least newSize, without preserving the existing content.
   * It is always used in places that set the content afterwards.
   * @param newSize minimum size of the buffer
   */
  private void growTermBuffer(int newSize) {
    if (termBuffer == null) {
      // The buffer is always at least MIN_BUFFER_SIZE
      termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
    } else {
      if (termBuffer.length < newSize) {
        // Not big enough; create a new array with slight
        // over allocation:
        termBuffer = new char[ArrayUtil.getNextSize(newSize)];
      }
    }
  }

  private void initTermBuffer() {
    if (termBuffer == null) {
      termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
      termLength = 0;
    }
  }

  /** Return number of valid characters (length of the term)
   * in the termBuffer array. */
  public int termLength() {
    return termLength;
  }

  /** Set number of valid characters (length of the term) in
   * the termBuffer array. Use this to truncate the termBuffer
   * or to synchronize with external manipulation of the termBuffer.
   * Note: to grow the size of the array,
   * use {@link #resizeTermBuffer(int)} first.
   * @param length the truncated length
   */
  public void setTermLength(int length) {
    initTermBuffer();
    if (length > termBuffer.length)
      throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
    termLength = length;
  }

  @Override
  public int hashCode() {
    initTermBuffer();
    int code = termLength;
    code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
    return code;
  }

  @Override
  public void clear() {
    termLength = 0;
  }

  @Override
  public Object clone() {
    TermAttributeImpl t = (TermAttributeImpl) super.clone();
    // Do a deep clone
    if (termBuffer != null) {
      t.termBuffer = (char[]) termBuffer.clone();
    }
    return t;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    // test against the implementation class, since the body casts to it
    if (other instanceof TermAttributeImpl) {
      initTermBuffer();
      TermAttributeImpl o = (TermAttributeImpl) other;
      o.initTermBuffer();

      if (termLength != o.termLength)
        return false;
      for (int i = 0; i < termLength; i++) {
        if (termBuffer[i] != o.termBuffer[i]) {
          return false;
        }
      }
      return true;
    }

    return false;
  }

  @Override
  public String toString() {
    initTermBuffer();
    return "term=" + new String(termBuffer, 0, termLength);
  }

  @Override
  public void copyTo(AttributeImpl target) {
    initTermBuffer();
    TermAttribute t = (TermAttribute) target;
    t.setTermBuffer(termBuffer, 0, termLength);
  }
}
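A short sketch of the documented buffer contract (illustrative, not part of this commit): grow the buffer with resizeTermBuffer, write characters directly, then record the valid length with setTermLength. The helper class and method names are hypothetical.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

final class TermBufferDemo {
  /** Writes {@code text} into the term buffer without allocating a String per token. */
  static void setTerm(TokenStream stream, String text) {
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    char[] buffer = termAtt.resizeTermBuffer(text.length()); // grow first if needed
    text.getChars(0, text.length(), buffer, 0);              // alter the buffer in place
    termAtt.setTermLength(text.length());                    // then record the valid length
  }
}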
@@ -0,0 +1,32 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Attribute;

/**
 * A Token's lexical type. The default value is "word".
 */
public interface TypeAttribute extends Attribute {
  /** Returns this Token's lexical type. Defaults to "word". */
  public String type();

  /** Set the lexical type.
      @see #type() */
  public void setType(String type);
}
@@ -0,0 +1,78 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.AttributeImpl;

/**
 * A Token's lexical type. The default value is "word".
 */
public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable {
  private String type;
  public static final String DEFAULT_TYPE = "word";

  public TypeAttributeImpl() {
    this(DEFAULT_TYPE);
  }

  public TypeAttributeImpl(String type) {
    this.type = type;
  }

  /** Returns this Token's lexical type. Defaults to "word". */
  public String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public void setType(String type) {
    this.type = type;
  }

  @Override
  public void clear() {
    type = DEFAULT_TYPE;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TypeAttributeImpl) {
      return type.equals(((TypeAttributeImpl) other).type);
    }

    return false;
  }

  @Override
  public int hashCode() {
    return type.hashCode();
  }

  @Override
  public void copyTo(AttributeImpl target) {
    TypeAttribute t = (TypeAttribute) target;
    t.setType(type);
  }
}
@@ -0,0 +1,294 @@
package org.apache.lucene.document;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.search.PhraseQuery; // for javadocs
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.StringHelper; // for javadocs


/**
 *
 *
 **/
public abstract class AbstractField implements Fieldable {

  protected String name = "body";
  protected boolean storeTermVector = false;
  protected boolean storeOffsetWithTermVector = false;
  protected boolean storePositionWithTermVector = false;
  protected boolean omitNorms = false;
  protected boolean isStored = false;
  protected boolean isIndexed = true;
  protected boolean isTokenized = true;
  protected boolean isBinary = false;
  protected boolean lazy = false;
  protected boolean omitTermFreqAndPositions = false;
  protected float boost = 1.0f;
  // the data object for all different kinds of field values
  protected Object fieldsData = null;
  // pre-analyzed tokenStream for indexed fields
  protected TokenStream tokenStream;
  // length/offset for all primitive types
  protected int binaryLength;
  protected int binaryOffset;

  protected AbstractField() {
  }

  protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) {
    if (name == null)
      throw new NullPointerException("name cannot be null");
    this.name = StringHelper.intern(name); // field names are interned

    this.isStored = store.isStored();
    this.isIndexed = index.isIndexed();
    this.isTokenized = index.isAnalyzed();
    this.omitNorms = index.omitNorms();

    this.isBinary = false;

    setStoreTermVector(termVector);
  }

  /** Sets the boost factor for hits on this field. This value will be
   * multiplied into the score of all hits on this field of this
   * document.
   *
   * <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
   * containing this field. If a document has multiple fields with the same
   * name, all such values are multiplied together. This product is then
   * used to compute the norm factor for the field. By
   * default, in the {@link
   * org.apache.lucene.search.Similarity#computeNorm(String,
   * FieldInvertState)} method, the boost value is multiplied
   * by the {@link
   * org.apache.lucene.search.Similarity#lengthNorm(String,
   * int)} and then
   * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
   * index. One should attempt to ensure that this product does not overflow
   * the range of that encoding.
   *
   * @see org.apache.lucene.document.Document#setBoost(float)
   * @see org.apache.lucene.search.Similarity#computeNorm(String, org.apache.lucene.index.FieldInvertState)
   * @see org.apache.lucene.search.Similarity#encodeNorm(float)
   */
  public void setBoost(float boost) {
    this.boost = boost;
  }

  /** Returns the boost factor for hits for this field.
   *
   * <p>The default value is 1.0.
   *
   * <p>Note: this value is not stored directly with the document in the index.
   * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
   * {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
   * this field was indexed.
   *
   * @see #setBoost(float)
   */
  public float getBoost() {
    return boost;
  }

  /** Returns the name of the field as an interned string.
   * For example "date", "title", "body", ...
   */
  public String name() { return name; }

  protected void setStoreTermVector(Field.TermVector termVector) {
    this.storeTermVector = termVector.isStored();
    this.storePositionWithTermVector = termVector.withPositions();
    this.storeOffsetWithTermVector = termVector.withOffsets();
  }

  /** True iff the value of the field is to be stored in the index for return
      with search hits. It is an error for this to be true if a field is
      Reader-valued. */
  public final boolean isStored() { return isStored; }

  /** True iff the value of the field is to be indexed, so that it may be
      searched on. */
  public final boolean isIndexed() { return isIndexed; }

  /** True iff the value of the field should be tokenized as text prior to
      indexing. Un-tokenized fields are indexed as a single word and may not be
      Reader-valued. */
  public final boolean isTokenized() { return isTokenized; }

  /** True iff the term or terms used to index this field are stored as a term
   * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
   * These methods do not provide access to the original content of the field,
   * only to terms used to index it. If the original content must be
   * preserved, use the <code>stored</code> attribute instead.
   *
   * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String)
   */
  public final boolean isTermVectorStored() { return storeTermVector; }

  /**
   * True iff terms are stored as term vector together with their offsets
   * (start and end position in source text).
   */
  public boolean isStoreOffsetWithTermVector() {
    return storeOffsetWithTermVector;
  }

  /**
   * True iff terms are stored as term vector together with their token positions.
   */
  public boolean isStorePositionWithTermVector() {
    return storePositionWithTermVector;
  }

  /** True iff the value of the field is stored as binary */
  public final boolean isBinary() {
    return isBinary;
  }


  /**
   * Return the raw byte[] for the binary field. Note that
   * you must also call {@link #getBinaryLength} and {@link
   * #getBinaryOffset} to know which range of bytes in this
   * returned array belong to the field.
   * @return reference to the Field value as byte[].
   */
  public byte[] getBinaryValue() {
    return getBinaryValue(null);
  }

  public byte[] getBinaryValue(byte[] result) {
    if (isBinary || fieldsData instanceof byte[])
      return (byte[]) fieldsData;
    else
      return null;
  }

  /**
   * Returns the length of the byte[] segment that is used as value; if the Field
   * is not binary, the returned value is undefined.
   * @return length of byte[] segment that represents this Field value
   */
  public int getBinaryLength() {
    if (isBinary) {
      return binaryLength;
    } else if (fieldsData instanceof byte[])
      return ((byte[]) fieldsData).length;
    else
      return 0;
  }

  /**
   * Returns the offset into the byte[] segment that is used as value; if the Field
   * is not binary, the returned value is undefined.
   * @return index of the first character in byte[] segment that represents this Field value
   */
  public int getBinaryOffset() {
    return binaryOffset;
  }

  /** True if norms are omitted for this indexed field */
  public boolean getOmitNorms() { return omitNorms; }

  /** @see #setOmitTermFreqAndPositions */
  public boolean getOmitTermFreqAndPositions() { return omitTermFreqAndPositions; }

  /** Expert:
   *
   * If set, omit normalization factors associated with this indexed field.
   * This effectively disables indexing boosts and length normalization for this field.
   */
  public void setOmitNorms(boolean omitNorms) { this.omitNorms = omitNorms; }

  /** Expert:
   *
   * If set, omit term freq, positions and payloads from
   * postings for this field.
   *
   * <p><b>NOTE</b>: While this option reduces storage space
   * required in the index, it also means any query
   * requiring positional information, such as {@link
   * PhraseQuery} or {@link SpanQuery} subclasses will
   * silently fail to find results.
   */
  public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions = omitTermFreqAndPositions; }

  public boolean isLazy() {
    return lazy;
  }

  /** Prints a Field for human consumption. */
  @Override
  public final String toString() {
    StringBuilder result = new StringBuilder();
    if (isStored) {
      result.append("stored");
    }
    if (isIndexed) {
      if (result.length() > 0)
        result.append(",");
      result.append("indexed");
    }
    if (isTokenized) {
      if (result.length() > 0)
        result.append(",");
      result.append("tokenized");
    }
    if (storeTermVector) {
      if (result.length() > 0)
        result.append(",");
      result.append("termVector");
    }
    if (storeOffsetWithTermVector) {
      if (result.length() > 0)
        result.append(",");
      result.append("termVectorOffsets");
    }
    if (storePositionWithTermVector) {
      if (result.length() > 0)
        result.append(",");
      result.append("termVectorPosition");
    }
    if (isBinary) {
      if (result.length() > 0)
        result.append(",");
      result.append("binary");
    }
    if (omitNorms) {
      result.append(",omitNorms");
    }
    if (omitTermFreqAndPositions) {
      result.append(",omitTermFreqAndPositions");
    }
    if (lazy) {
      result.append(",lazy");
    }
    result.append('<');
    result.append(name);
    result.append(':');

    if (fieldsData != null && !lazy) {
      result.append(fieldsData);
    }

    result.append('>');
    return result.toString();
  }
}
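For reference, a minimal sketch (not part of this commit) showing how these flags surface through the concrete Field subclass and the toString format above; the demo class name is hypothetical.

import org.apache.lucene.document.Field;

public class FieldFlagsDemo {
  public static void main(String[] args) {
    Field f = new Field("title", "Lucene in Action", Field.Store.YES, Field.Index.ANALYZED);
    f.setBoost(2.0f);
    // prints: stored,indexed,tokenized<title:Lucene in Action>
    System.out.println(f);
  }
}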
@@ -0,0 +1,124 @@
package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.DataFormatException;
import java.io.ByteArrayOutputStream;
import org.apache.lucene.util.UnicodeUtil;

/** Simple utility class providing static methods to
 * compress and decompress binary data for stored fields.
 * This class uses the java.util.zip.Deflater and Inflater
 * classes to compress and decompress.
 */

public class CompressionTools {

  // Export only static methods
  private CompressionTools() {}

  /** Compresses the specified byte range using the
   * specified compressionLevel (constants are defined in
   * java.util.zip.Deflater). */
  public static byte[] compress(byte[] value, int offset, int length, int compressionLevel) {

    /* Create an expandable byte array to hold the compressed data.
     * You cannot use an array that's the same size as the original because
     * there is no guarantee that the compressed data will be smaller than
     * the uncompressed data. */
    ByteArrayOutputStream bos = new ByteArrayOutputStream(length);

    Deflater compressor = new Deflater();

    try {
      compressor.setLevel(compressionLevel);
      compressor.setInput(value, offset, length);
      compressor.finish();

      // Compress the data
      final byte[] buf = new byte[1024];
      while (!compressor.finished()) {
        int count = compressor.deflate(buf);
        bos.write(buf, 0, count);
      }
    } finally {
      compressor.end();
    }

    return bos.toByteArray();
  }

  /** Compresses the specified byte range, with default BEST_COMPRESSION level */
  public static byte[] compress(byte[] value, int offset, int length) {
    return compress(value, offset, length, Deflater.BEST_COMPRESSION);
  }

  /** Compresses all bytes in the array, with default BEST_COMPRESSION level */
  public static byte[] compress(byte[] value) {
    return compress(value, 0, value.length, Deflater.BEST_COMPRESSION);
  }

  /** Compresses the String value, with default BEST_COMPRESSION level */
  public static byte[] compressString(String value) {
    return compressString(value, Deflater.BEST_COMPRESSION);
  }

  /** Compresses the String value using the specified
   * compressionLevel (constants are defined in
   * java.util.zip.Deflater). */
  public static byte[] compressString(String value, int compressionLevel) {
    UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result();
    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
    return compress(result.result, 0, result.length, compressionLevel);
  }

  /** Decompress the byte array previously returned by
   * compress */
  public static byte[] decompress(byte[] value) throws DataFormatException {
    // Create an expandable byte array to hold the decompressed data
    ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);

    Inflater decompressor = new Inflater();

    try {
      decompressor.setInput(value);

      // Decompress the data
      final byte[] buf = new byte[1024];
      while (!decompressor.finished()) {
        int count = decompressor.inflate(buf);
        bos.write(buf, 0, count);
      }
    } finally {
      decompressor.end();
    }

    return bos.toByteArray();
  }

  /** Decompress the byte array previously returned by
   * compressString back into a String */
  public static String decompressString(byte[] value) throws DataFormatException {
    UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
    final byte[] bytes = decompress(value);
    UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
    return new String(result.result, 0, result.length);
  }
}
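A minimal round-trip sketch (not part of this commit), assuming only this class; the demo class name is hypothetical.

import java.util.zip.DataFormatException;

import org.apache.lucene.document.CompressionTools;

public class CompressionDemo {
  public static void main(String[] args) throws DataFormatException {
    byte[] packed = CompressionTools.compressString("some stored field value");
    String unpacked = CompressionTools.decompressString(packed);
    System.out.println(unpacked); // prints the original string
  }
}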
@@ -0,0 +1,122 @@
package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.util.NumericUtils; // for javadocs

import java.util.Date;     // for javadoc
import java.util.Calendar; // for javadoc

// do not remove in 3.0, needed for reading old indexes!

/**
 * Provides support for converting dates to strings and vice-versa.
 * The strings are structured so that lexicographic sorting orders by date,
 * which makes them suitable for use as field values and search terms.
 *
 * <P>Note that this class saves dates with millisecond granularity,
 * which is bad for {@link TermRangeQuery} and {@link PrefixQuery}, as those
 * queries are expanded to a BooleanQuery with a potentially large number
 * of terms when searching. Thus you might want to use
 * {@link DateTools} instead.
 *
 * <P>
 * Note: dates before 1970 cannot be used, and therefore cannot be
 * indexed when using this class. See {@link DateTools} for an
 * alternative without such a limitation.
 *
 * <P>
 * Another approach is {@link NumericUtils}, which provides
 * a sortable binary representation (prefix encoded) of numeric values, which
 * date/time are.
 * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as
 * <code>long</code> using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and
 * index this as a numeric value with {@link NumericField}
 * and use {@link NumericRangeQuery} to query it.
 *
 * @deprecated If you build a new index, use {@link DateTools} or
 * {@link NumericField} instead.
 * This class is included for use with existing
 * indices and will be removed in a future release (possibly Lucene 4.0).
 */
public class DateField {

  private DateField() {}

  // make date strings long enough to last a millennium
  private static final int DATE_LEN = Long.toString(1000L * 365 * 24 * 60 * 60 * 1000,
                                                    Character.MAX_RADIX).length();

  public static String MIN_DATE_STRING() {
    return timeToString(0);
  }

  public static String MAX_DATE_STRING() {
    char[] buffer = new char[DATE_LEN];
    char c = Character.forDigit(Character.MAX_RADIX - 1, Character.MAX_RADIX);
    for (int i = 0; i < DATE_LEN; i++)
      buffer[i] = c;
    return new String(buffer);
  }

  /**
   * Converts a Date to a string suitable for indexing.
   * @throws RuntimeException if the date specified in the
   * method argument is before 1970
   */
  public static String dateToString(Date date) {
    return timeToString(date.getTime());
  }

  /**
   * Converts a millisecond time to a string suitable for indexing.
   * @throws RuntimeException if the time specified in the
   * method argument is negative, that is, before 1970
   */
  public static String timeToString(long time) {
    if (time < 0)
      throw new RuntimeException("time '" + time + "' is too early, must be >= 0");

    String s = Long.toString(time, Character.MAX_RADIX);

    if (s.length() > DATE_LEN)
      throw new RuntimeException("time '" + time + "' is too late, length of string " +
          "representation must be <= " + DATE_LEN);

    // Pad with leading zeros
    if (s.length() < DATE_LEN) {
      StringBuilder sb = new StringBuilder(s);
      while (sb.length() < DATE_LEN)
        sb.insert(0, 0);
      s = sb.toString();
    }

    return s;
  }

  /** Converts a string-encoded date into a millisecond time. */
  public static long stringToTime(String s) {
    return Long.parseLong(s, Character.MAX_RADIX);
  }

  /** Converts a string-encoded date into a Date object. */
  public static Date stringToDate(String s) {
    return new Date(stringToTime(s));
  }
}
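The lexicographic-equals-chronological property claimed in the javadoc above can be checked directly; a small sketch (not part of this commit, demo class name hypothetical):

import org.apache.lucene.document.DateField;

public class DateFieldOrderDemo {
  public static void main(String[] args) {
    String earlier = DateField.timeToString(1000L);
    String later = DateField.timeToString(2000L);
    // zero-padded radix-36 strings: string order matches time order
    System.out.println(earlier.compareTo(later) < 0); // true
  }
}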
@ -0,0 +1,256 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.TimeZone;
|
||||
import java.util.Locale;
|
||||
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
|
||||
import org.apache.lucene.util.NumericUtils; // for javadocs
|
||||
|
||||
/**
|
||||
* Provides support for converting dates to strings and vice-versa.
|
||||
* The strings are structured so that lexicographic sorting orders
|
||||
* them by date, which makes them suitable for use as field values
|
||||
* and search terms.
|
||||
*
|
||||
* <P>This class also helps you to limit the resolution of your dates. Do not
|
||||
* save dates with a finer resolution than you really need, as then
|
||||
* RangeQuery and PrefixQuery will require more memory and become slower.
|
||||
*
|
||||
* <P>Compared to {@link DateField} the strings generated by the methods
|
||||
* in this class take slightly more space, unless your selected resolution
|
||||
* is set to <code>Resolution.DAY</code> or lower.
|
||||
*
|
||||
* <P>
|
||||
* Another approach is {@link NumericUtils}, which provides
|
||||
* a sortable binary representation (prefix encoded) of numeric values, which
|
||||
* date/time are.
|
||||
* For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as
|
||||
* <code>long</code> using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and
|
||||
* index this as a numeric value with {@link NumericField}
|
||||
* and use {@link NumericRangeQuery} to query it.
|
||||
*/
|
||||
public class DateTools {
|
||||
|
||||
private final static TimeZone GMT = TimeZone.getTimeZone("GMT");
|
||||
|
||||
private static final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat("yyyy", Locale.US);
|
||||
private static final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("yyyyMM", Locale.US);
|
||||
private static final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("yyyyMMdd", Locale.US);
|
||||
private static final SimpleDateFormat HOUR_FORMAT = new SimpleDateFormat("yyyyMMddHH", Locale.US);
|
||||
private static final SimpleDateFormat MINUTE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm", Locale.US);
|
||||
private static final SimpleDateFormat SECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
|
||||
private static final SimpleDateFormat MILLISECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US);
|
||||
static {
|
||||
// times need to be normalized so the value doesn't depend on the
|
||||
// location the index is created/used:
|
||||
YEAR_FORMAT.setTimeZone(GMT);
|
||||
MONTH_FORMAT.setTimeZone(GMT);
|
||||
DAY_FORMAT.setTimeZone(GMT);
|
||||
HOUR_FORMAT.setTimeZone(GMT);
|
||||
MINUTE_FORMAT.setTimeZone(GMT);
|
||||
SECOND_FORMAT.setTimeZone(GMT);
|
||||
MILLISECOND_FORMAT.setTimeZone(GMT);
|
||||
}
|
||||
|
||||
private static final Calendar calInstance = Calendar.getInstance(GMT);
|
||||
|
||||
// cannot create, the class has static methods only
|
||||
private DateTools() {}
|
||||
|
||||
/**
|
||||
* Converts a Date to a string suitable for indexing.
|
||||
*
|
||||
* @param date the date to be converted
|
||||
* @param resolution the desired resolution, see
|
||||
* {@link #round(Date, DateTools.Resolution)}
|
||||
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
|
||||
* depending on <code>resolution</code>; using GMT as timezone
|
||||
*/
|
||||
public static synchronized String dateToString(Date date, Resolution resolution) {
|
||||
return timeToString(date.getTime(), resolution);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a millisecond time to a string suitable for indexing.
|
||||
*
|
||||
* @param time the date expressed as milliseconds since January 1, 1970, 00:00:00 GMT
|
||||
* @param resolution the desired resolution, see
|
||||
* {@link #round(long, DateTools.Resolution)}
|
||||
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
|
||||
* depending on <code>resolution</code>; using GMT as timezone
|
||||
*/
|
||||
public static synchronized String timeToString(long time, Resolution resolution) {
|
||||
calInstance.setTimeInMillis(round(time, resolution));
|
||||
Date date = calInstance.getTime();
|
||||
|
||||
if (resolution == Resolution.YEAR) {
|
||||
return YEAR_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.MONTH) {
|
||||
return MONTH_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.DAY) {
|
||||
return DAY_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.HOUR) {
|
||||
return HOUR_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.MINUTE) {
|
||||
return MINUTE_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.SECOND) {
|
||||
return SECOND_FORMAT.format(date);
|
||||
} else if (resolution == Resolution.MILLISECOND) {
|
||||
return MILLISECOND_FORMAT.format(date);
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("unknown resolution " + resolution);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string produced by <code>timeToString</code> or
|
||||
* <code>dateToString</code> back to a time, represented as the
|
||||
* number of milliseconds since January 1, 1970, 00:00:00 GMT.
|
||||
*
|
||||
* @param dateString the date string to be converted
|
||||
* @return the number of milliseconds since January 1, 1970, 00:00:00 GMT
|
||||
* @throws ParseException if <code>dateString</code> is not in the
|
||||
* expected format
|
||||
*/
|
||||
public static synchronized long stringToTime(String dateString) throws ParseException {
|
||||
return stringToDate(dateString).getTime();
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string produced by <code>timeToString</code> or
|
||||
* <code>dateToString</code> back to a time, represented as a
|
||||
* Date object.
|
||||
*
|
||||
* @param dateString the date string to be converted
|
||||
* @return the parsed time as a Date object
|
||||
* @throws ParseException if <code>dateString</code> is not in the
|
||||
* expected format
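   * <p>For example, <code>stringToDate("200409")</code> returns the Date
   * for 2004-09-01 00:00:00 GMT.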
|
||||
*/
|
||||
public static synchronized Date stringToDate(String dateString) throws ParseException {
|
||||
if (dateString.length() == 4) {
|
||||
return YEAR_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 6) {
|
||||
return MONTH_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 8) {
|
||||
return DAY_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 10) {
|
||||
return HOUR_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 12) {
|
||||
return MINUTE_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 14) {
|
||||
return SECOND_FORMAT.parse(dateString);
|
||||
} else if (dateString.length() == 17) {
|
||||
return MILLISECOND_FORMAT.parse(dateString);
|
||||
}
|
||||
throw new ParseException("Input is not valid date string: " + dateString, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Limit a date's resolution. For example, the date <code>2004-09-21 13:50:11</code>
|
||||
* will be changed to <code>2004-09-01 00:00:00</code> when using
|
||||
* <code>Resolution.MONTH</code>.
|
||||
*
|
||||
* @param resolution The desired resolution of the date to be returned
|
||||
* @return the date with all values more precise than <code>resolution</code>
|
||||
* set to 0 or 1
|
||||
*/
|
||||
public static synchronized Date round(Date date, Resolution resolution) {
|
||||
return new Date(round(date.getTime(), resolution));
|
||||
}
|
||||
|
||||
/**
|
||||
* Limit a date's resolution. For example, the date <code>1095767411000</code>
|
||||
* (which represents 2004-09-21 13:50:11) will be changed to
|
||||
* <code>1093989600000</code> (2004-09-01 00:00:00) when using
|
||||
* <code>Resolution.MONTH</code>.
|
||||
*
|
||||
* @param resolution The desired resolution of the date to be returned
|
||||
* @return the date with all values more precise than <code>resolution</code>
|
||||
* set to 0 or 1, expressed as milliseconds since January 1, 1970, 00:00:00 GMT
|
||||
*/
|
||||
public static synchronized long round(long time, Resolution resolution) {
|
||||
calInstance.setTimeInMillis(time);
|
||||
|
||||
if (resolution == Resolution.YEAR) {
|
||||
calInstance.set(Calendar.MONTH, 0);
|
||||
calInstance.set(Calendar.DAY_OF_MONTH, 1);
|
||||
calInstance.set(Calendar.HOUR_OF_DAY, 0);
|
||||
calInstance.set(Calendar.MINUTE, 0);
|
||||
calInstance.set(Calendar.SECOND, 0);
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.MONTH) {
|
||||
calInstance.set(Calendar.DAY_OF_MONTH, 1);
|
||||
calInstance.set(Calendar.HOUR_OF_DAY, 0);
|
||||
calInstance.set(Calendar.MINUTE, 0);
|
||||
calInstance.set(Calendar.SECOND, 0);
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.DAY) {
|
||||
calInstance.set(Calendar.HOUR_OF_DAY, 0);
|
||||
calInstance.set(Calendar.MINUTE, 0);
|
||||
calInstance.set(Calendar.SECOND, 0);
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.HOUR) {
|
||||
calInstance.set(Calendar.MINUTE, 0);
|
||||
calInstance.set(Calendar.SECOND, 0);
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.MINUTE) {
|
||||
calInstance.set(Calendar.SECOND, 0);
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.SECOND) {
|
||||
calInstance.set(Calendar.MILLISECOND, 0);
|
||||
} else if (resolution == Resolution.MILLISECOND) {
|
||||
// don't cut off anything
|
||||
} else {
|
||||
throw new IllegalArgumentException("unknown resolution " + resolution);
|
||||
}
|
||||
return calInstance.getTimeInMillis();
|
||||
}
|
||||
|
||||
/** Specifies the time granularity. */
|
||||
public static class Resolution {
|
||||
|
||||
public static final Resolution YEAR = new Resolution("year");
|
||||
public static final Resolution MONTH = new Resolution("month");
|
||||
public static final Resolution DAY = new Resolution("day");
|
||||
public static final Resolution HOUR = new Resolution("hour");
|
||||
public static final Resolution MINUTE = new Resolution("minute");
|
||||
public static final Resolution SECOND = new Resolution("second");
|
||||
public static final Resolution MILLISECOND = new Resolution("millisecond");
|
||||
|
||||
private String resolution;
|
||||
|
||||
private Resolution() {
|
||||
}
|
||||
|
||||
private Resolution(String resolution) {
|
||||
this.resolution = resolution;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return resolution;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,305 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.*; // for javadoc
|
||||
import org.apache.lucene.search.ScoreDoc; // for javadoc
|
||||
import org.apache.lucene.search.Searcher; // for javadoc
|
||||
import org.apache.lucene.index.IndexReader; // for javadoc
|
||||
|
||||
/** Documents are the unit of indexing and search.
|
||||
*
|
||||
* A Document is a set of fields. Each field has a name and a textual value.
|
||||
* A field may be {@link Fieldable#isStored() stored} with the document, in which
|
||||
* case it is returned with search hits on the document. Thus each document
|
||||
* should typically contain one or more stored fields which uniquely identify
|
||||
* it.
|
||||
*
|
||||
* <p>Note that fields which are <i>not</i> {@link Fieldable#isStored() stored} are
|
||||
* <i>not</i> available in documents retrieved from the index, e.g. with {@link
|
||||
* ScoreDoc#doc}, {@link Searcher#doc(int)} or {@link
|
||||
* IndexReader#document(int)}.
|
||||
*/
|
||||
|
||||
public final class Document implements java.io.Serializable {
|
||||
List<Fieldable> fields = new ArrayList<Fieldable>();
|
||||
private float boost = 1.0f;
|
||||
|
||||
/** Constructs a new document with no fields. */
|
||||
public Document() {}
|
||||
|
||||
|
||||
/** Sets a boost factor for hits on any field of this document. This value
|
||||
* will be multiplied into the score of all hits on this document.
|
||||
*
|
||||
* <p>The default value is 1.0.
|
||||
*
|
||||
* <p>Values are multiplied into the value of {@link Fieldable#getBoost()} of
|
||||
* each field in this document. Thus, this method in effect sets a default
|
||||
* boost for the fields of this document.
|
||||
*
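   * <p>For example (a sketch), <code>doc.setBoost(2.0f)</code> doubles the
   * effective index-time boost of every field in <code>doc</code>.
   *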
|
||||
* @see Fieldable#setBoost(float)
|
||||
*/
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
/** Returns, at indexing time, the boost factor as set by {@link #setBoost(float)}.
|
||||
*
|
||||
* <p>Note that once a document is indexed this value is no longer available
|
||||
* from the index. At search time, for retrieved documents, this method always
|
||||
* returns 1. This however does not mean that the boost value set at indexing
|
||||
* time was ignored - it was just combined with other indexing time factors and
|
||||
* stored elsewhere, for better indexing and search performance. (For more
|
||||
* information see the "norm(t,d)" part of the scoring formula in
|
||||
* {@link org.apache.lucene.search.Similarity Similarity}.)
|
||||
*
|
||||
* @see #setBoost(float)
|
||||
*/
|
||||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Adds a field to a document. Several fields may be added with
|
||||
* the same name. In this case, if the fields are indexed, their text is
|
||||
* treated as though appended for the purposes of search.</p>
|
||||
   * <p> Note that add, like the removeField(s) methods, only makes sense
|
||||
* prior to adding a document to an index. These methods cannot
|
||||
* be used to change the content of an existing index! In order to achieve this,
|
||||
* a document has to be deleted from an index and a new changed version of that
|
||||
* document has to be added.</p>
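   * <p>A sketch of a multi-valued field (two values added under one name):
   * <pre>
   *   doc.add(new Field("author", "smith", Field.Store.YES, Field.Index.ANALYZED));
   *   doc.add(new Field("author", "jones", Field.Store.YES, Field.Index.ANALYZED));
   * </pre>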
|
||||
*/
|
||||
public final void add(Fieldable field) {
|
||||
fields.add(field);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Removes field with the specified name from the document.
|
||||
* If multiple fields exist with this name, this method removes the first field that has been added.
|
||||
* If there is no field with the specified name, the document remains unchanged.</p>
|
||||
   * <p> Note that the removeField(s) methods, like the add method, only make sense
|
||||
* prior to adding a document to an index. These methods cannot
|
||||
* be used to change the content of an existing index! In order to achieve this,
|
||||
* a document has to be deleted from an index and a new changed version of that
|
||||
* document has to be added.</p>
|
||||
*/
|
||||
public final void removeField(String name) {
|
||||
Iterator<Fieldable> it = fields.iterator();
|
||||
while (it.hasNext()) {
|
||||
Fieldable field = it.next();
|
||||
if (field.name().equals(name)) {
|
||||
it.remove();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Removes all fields with the given name from the document.
|
||||
* If there is no field with the specified name, the document remains unchanged.</p>
|
||||
   * <p> Note that the removeField(s) methods, like the add method, only make sense
|
||||
* prior to adding a document to an index. These methods cannot
|
||||
* be used to change the content of an existing index! In order to achieve this,
|
||||
* a document has to be deleted from an index and a new changed version of that
|
||||
* document has to be added.</p>
|
||||
*/
|
||||
public final void removeFields(String name) {
|
||||
Iterator<Fieldable> it = fields.iterator();
|
||||
while (it.hasNext()) {
|
||||
Fieldable field = it.next();
|
||||
if (field.name().equals(name)) {
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a field with the given name if any exist in this document, or
|
||||
   * null. If multiple fields exist with this name, this method returns the
|
||||
* first value added.
|
||||
* Do not use this method with lazy loaded fields.
|
||||
*/
|
||||
public final Field getField(String name) {
|
||||
return (Field) getFieldable(name);
|
||||
}
|
||||
|
||||
|
||||
/** Returns a field with the given name if any exist in this document, or
|
||||
   * null. If multiple fields exist with this name, this method returns the
|
||||
* first value added.
|
||||
*/
|
||||
public Fieldable getFieldable(String name) {
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name))
|
||||
return field;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns the string value of the field with the given name if any exist in
|
||||
* this document, or null. If multiple fields exist with this name, this
|
||||
* method returns the first value added. If only binary fields with this name
|
||||
* exist, returns null.
|
||||
*/
|
||||
public final String get(String name) {
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name) && (!field.isBinary()))
|
||||
return field.stringValue();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns a List of all the fields in a document.
|
||||
* <p>Note that fields which are <i>not</i> {@link Fieldable#isStored() stored} are
|
||||
* <i>not</i> available in documents retrieved from the
|
||||
* index, e.g. {@link Searcher#doc(int)} or {@link
|
||||
* IndexReader#document(int)}.
|
||||
*/
|
||||
public final List<Fieldable> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
private final static Field[] NO_FIELDS = new Field[0];
|
||||
|
||||
/**
|
||||
* Returns an array of {@link Field}s with the given name.
|
||||
* Do not use with lazy loaded fields.
|
||||
* This method returns an empty array when there are no
|
||||
* matching fields. It never returns null.
|
||||
*
|
||||
* @param name the name of the field
|
||||
* @return a <code>Field[]</code> array
|
||||
*/
|
||||
public final Field[] getFields(String name) {
|
||||
List<Field> result = new ArrayList<Field>();
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name)) {
|
||||
result.add((Field) field);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() == 0)
|
||||
return NO_FIELDS;
|
||||
|
||||
return result.toArray(new Field[result.size()]);
|
||||
}
|
||||
|
||||
|
||||
private final static Fieldable[] NO_FIELDABLES = new Fieldable[0];
|
||||
|
||||
/**
|
||||
* Returns an array of {@link Fieldable}s with the given name.
|
||||
* This method returns an empty array when there are no
|
||||
* matching fields. It never returns null.
|
||||
*
|
||||
* @param name the name of the field
|
||||
* @return a <code>Fieldable[]</code> array
|
||||
*/
|
||||
public Fieldable[] getFieldables(String name) {
|
||||
List<Fieldable> result = new ArrayList<Fieldable>();
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name)) {
|
||||
result.add(field);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() == 0)
|
||||
return NO_FIELDABLES;
|
||||
|
||||
return result.toArray(new Fieldable[result.size()]);
|
||||
}
|
||||
|
||||
|
||||
private final static String[] NO_STRINGS = new String[0];
|
||||
|
||||
/**
|
||||
* Returns an array of values of the field specified as the method parameter.
|
||||
* This method returns an empty array when there are no
|
||||
* matching fields. It never returns null.
|
||||
* @param name the name of the field
|
||||
* @return a <code>String[]</code> of field values
|
||||
*/
|
||||
public final String[] getValues(String name) {
|
||||
List<String> result = new ArrayList<String>();
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name) && (!field.isBinary()))
|
||||
result.add(field.stringValue());
|
||||
}
|
||||
|
||||
if (result.size() == 0)
|
||||
return NO_STRINGS;
|
||||
|
||||
return result.toArray(new String[result.size()]);
|
||||
}
|
||||
|
||||
private final static byte[][] NO_BYTES = new byte[0][];
|
||||
|
||||
/**
|
||||
   * Returns an array of byte arrays for all of the fields that have the name specified
|
||||
* as the method parameter. This method returns an empty
|
||||
* array when there are no matching fields. It never
|
||||
* returns null.
|
||||
*
|
||||
* @param name the name of the field
|
||||
* @return a <code>byte[][]</code> of binary field values
|
||||
*/
|
||||
public final byte[][] getBinaryValues(String name) {
|
||||
List<byte[]> result = new ArrayList<byte[]>();
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name) && (field.isBinary()))
|
||||
result.add(field.getBinaryValue());
|
||||
}
|
||||
|
||||
if (result.size() == 0)
|
||||
return NO_BYTES;
|
||||
|
||||
return result.toArray(new byte[result.size()][]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of bytes for the first (or only) field that has the name
|
||||
* specified as the method parameter. This method will return <code>null</code>
|
||||
* if no binary fields with the specified name are available.
|
||||
* There may be non-binary fields with the same name.
|
||||
*
|
||||
* @param name the name of the field.
|
||||
* @return a <code>byte[]</code> containing the binary field value or <code>null</code>
|
||||
*/
|
||||
public final byte[] getBinaryValue(String name) {
|
||||
for (Fieldable field : fields) {
|
||||
if (field.name().equals(name) && (field.isBinary()))
|
||||
return field.getBinaryValue();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Prints the fields of a document for human consumption. */
|
||||
@Override
|
||||
public final String toString() {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("Document<");
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
Fieldable field = fields.get(i);
|
||||
buffer.append(field.toString());
|
||||
if (i != fields.size()-1)
|
||||
buffer.append(" ");
|
||||
}
|
||||
buffer.append(">");
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,566 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.IndexWriter; // for javadoc
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
A field is a section of a Document. Each field has two parts, a name and a
|
||||
value. Values may be free text, provided as a String or as a Reader, or they
|
||||
may be atomic keywords, which are not further processed. Such keywords may
|
||||
  be used to represent dates, URLs, etc. Fields are optionally stored in the
|
||||
index, so that they may be returned with hits on the document.
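
  A sketch of the two common cases (values are illustrative):
  <pre>
    new Field("body", "free text to analyze", Field.Store.NO, Field.Index.ANALYZED);
    new Field("isbn", "9783161484100", Field.Store.YES, Field.Index.NOT_ANALYZED);
  </pre>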
|
||||
*/
|
||||
|
||||
public final class Field extends AbstractField implements Fieldable, Serializable {
|
||||
|
||||
/** Specifies whether and how a field should be stored. */
|
||||
public static enum Store {
|
||||
|
||||
/** Store the original field value in the index. This is useful for short texts
|
||||
* like a document's title which should be displayed with the results. The
|
||||
* value is stored in its original form, i.e. no analyzer is used before it is
|
||||
* stored.
|
||||
*/
|
||||
YES {
|
||||
@Override
|
||||
public boolean isStored() { return true; }
|
||||
},
|
||||
|
||||
/** Do not store the field value in the index. */
|
||||
NO {
|
||||
@Override
|
||||
public boolean isStored() { return false; }
|
||||
};
|
||||
|
||||
public abstract boolean isStored();
|
||||
}
|
||||
|
||||
/** Specifies whether and how a field should be indexed. */
|
||||
public static enum Index {
|
||||
|
||||
/** Do not index the field value. This field can thus not be searched,
|
||||
* but one can still access its contents provided it is
|
||||
* {@link Field.Store stored}. */
|
||||
NO {
|
||||
@Override
|
||||
public boolean isIndexed() { return false; }
|
||||
@Override
|
||||
public boolean isAnalyzed() { return false; }
|
||||
@Override
|
||||
public boolean omitNorms() { return true; }
|
||||
},
|
||||
|
||||
/** Index the tokens produced by running the field's
|
||||
* value through an Analyzer. This is useful for
|
||||
* common text. */
|
||||
ANALYZED {
|
||||
@Override
|
||||
public boolean isIndexed() { return true; }
|
||||
@Override
|
||||
public boolean isAnalyzed() { return true; }
|
||||
@Override
|
||||
public boolean omitNorms() { return false; }
|
||||
},
|
||||
|
||||
/** Index the field's value without using an Analyzer, so it can be searched.
|
||||
* As no analyzer is used the value will be stored as a single term. This is
|
||||
* useful for unique Ids like product numbers.
|
||||
*/
|
||||
NOT_ANALYZED {
|
||||
@Override
|
||||
public boolean isIndexed() { return true; }
|
||||
@Override
|
||||
public boolean isAnalyzed() { return false; }
|
||||
@Override
|
||||
public boolean omitNorms() { return false; }
|
||||
},
|
||||
|
||||
/** Expert: Index the field's value without an Analyzer,
|
||||
* and also disable the storing of norms. Note that you
|
||||
* can also separately enable/disable norms by calling
|
||||
* {@link Field#setOmitNorms}. No norms means that
|
||||
* index-time field and document boosting and field
|
||||
* length normalization are disabled. The benefit is
|
||||
* less memory usage as norms take up one byte of RAM
|
||||
* per indexed field for every document in the index,
|
||||
* during searching. Note that once you index a given
|
||||
* field <i>with</i> norms enabled, disabling norms will
|
||||
* have no effect. In other words, for this to have the
|
||||
* above described effect on a field, all instances of
|
||||
* that field must be indexed with NOT_ANALYZED_NO_NORMS
|
||||
* from the beginning. */
|
||||
NOT_ANALYZED_NO_NORMS {
|
||||
@Override
|
||||
public boolean isIndexed() { return true; }
|
||||
@Override
|
||||
public boolean isAnalyzed() { return false; }
|
||||
@Override
|
||||
public boolean omitNorms() { return true; }
|
||||
},
|
||||
|
||||
/** Expert: Index the tokens produced by running the
|
||||
* field's value through an Analyzer, and also
|
||||
* separately disable the storing of norms. See
|
||||
* {@link #NOT_ANALYZED_NO_NORMS} for what norms are
|
||||
* and why you may want to disable them. */
|
||||
ANALYZED_NO_NORMS {
|
||||
@Override
|
||||
public boolean isIndexed() { return true; }
|
||||
@Override
|
||||
public boolean isAnalyzed() { return true; }
|
||||
@Override
|
||||
public boolean omitNorms() { return true; }
|
||||
};
|
||||
|
||||
/** Get the best representation of the index given the flags. */
|
||||
public static Index toIndex(boolean indexed, boolean analyzed) {
|
||||
return toIndex(indexed, analyzed, false);
|
||||
}
|
||||
|
||||
/** Expert: Get the best representation of the index given the flags. */
|
||||
public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
|
||||
|
||||
// If it is not indexed nothing else matters
|
||||
if (!indexed) {
|
||||
return Index.NO;
|
||||
}
|
||||
|
||||
// typical, non-expert
|
||||
if (!omitNorms) {
|
||||
if (analyzed) {
|
||||
return Index.ANALYZED;
|
||||
}
|
||||
return Index.NOT_ANALYZED;
|
||||
}
|
||||
|
||||
// Expert: Norms omitted
|
||||
if (analyzed) {
|
||||
return Index.ANALYZED_NO_NORMS;
|
||||
}
|
||||
return Index.NOT_ANALYZED_NO_NORMS;
|
||||
}
|
||||
|
||||
public abstract boolean isIndexed();
|
||||
public abstract boolean isAnalyzed();
|
||||
public abstract boolean omitNorms();
|
||||
}
|
||||
|
||||
/** Specifies whether and how a field should have term vectors. */
|
||||
public static enum TermVector {
|
||||
|
||||
/** Do not store term vectors.
|
||||
*/
|
||||
NO {
|
||||
@Override
|
||||
public boolean isStored() { return false; }
|
||||
@Override
|
||||
public boolean withPositions() { return false; }
|
||||
@Override
|
||||
public boolean withOffsets() { return false; }
|
||||
},
|
||||
|
||||
/** Store the term vectors of each document. A term vector is a list
|
||||
* of the document's terms and their number of occurrences in that document. */
|
||||
YES {
|
||||
@Override
|
||||
public boolean isStored() { return true; }
|
||||
@Override
|
||||
public boolean withPositions() { return false; }
|
||||
@Override
|
||||
public boolean withOffsets() { return false; }
|
||||
},
|
||||
|
||||
/**
|
||||
* Store the term vector + token position information
|
||||
*
|
||||
* @see #YES
|
||||
*/
|
||||
WITH_POSITIONS {
|
||||
@Override
|
||||
public boolean isStored() { return true; }
|
||||
@Override
|
||||
public boolean withPositions() { return true; }
|
||||
@Override
|
||||
public boolean withOffsets() { return false; }
|
||||
},
|
||||
|
||||
/**
|
||||
* Store the term vector + Token offset information
|
||||
*
|
||||
* @see #YES
|
||||
*/
|
||||
WITH_OFFSETS {
|
||||
@Override
|
||||
public boolean isStored() { return true; }
|
||||
@Override
|
||||
public boolean withPositions() { return false; }
|
||||
@Override
|
||||
public boolean withOffsets() { return true; }
|
||||
},
|
||||
|
||||
/**
|
||||
* Store the term vector + Token position and offset information
|
||||
*
|
||||
* @see #YES
|
||||
* @see #WITH_POSITIONS
|
||||
* @see #WITH_OFFSETS
|
||||
*/
|
||||
WITH_POSITIONS_OFFSETS {
|
||||
@Override
|
||||
public boolean isStored() { return true; }
|
||||
@Override
|
||||
public boolean withPositions() { return true; }
|
||||
@Override
|
||||
public boolean withOffsets() { return true; }
|
||||
};
|
||||
|
||||
/** Get the best representation of a TermVector given the flags. */
|
||||
public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
|
||||
|
||||
// If it is not stored, nothing else matters.
|
||||
if (!stored) {
|
||||
return TermVector.NO;
|
||||
}
|
||||
|
||||
if (withOffsets) {
|
||||
if (withPositions) {
|
||||
return Field.TermVector.WITH_POSITIONS_OFFSETS;
|
||||
}
|
||||
return Field.TermVector.WITH_OFFSETS;
|
||||
}
|
||||
|
||||
if (withPositions) {
|
||||
return Field.TermVector.WITH_POSITIONS;
|
||||
}
|
||||
return Field.TermVector.YES;
|
||||
}
|
||||
|
||||
public abstract boolean isStored();
|
||||
public abstract boolean withPositions();
|
||||
public abstract boolean withOffsets();
|
||||
}
|
||||
|
||||
|
||||
/** The value of the field as a String, or null. If null, the Reader value or
|
||||
* binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), and getBinaryValue() must be set. */
|
||||
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
|
||||
|
||||
/** The value of the field as a Reader, or null. If null, the String value or
|
||||
* binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), and getBinaryValue() must be set. */
|
||||
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
|
||||
|
||||
  /** The TokenStream for this field to be used when indexing, or null. If null, the Reader value
|
||||
* or String value is analyzed to produce the indexed tokens. */
|
||||
public TokenStream tokenStreamValue() { return tokenStream; }
|
||||
|
||||
|
||||
/** <p>Expert: change the value of this field. This can
|
||||
* be used during indexing to re-use a single Field
|
||||
* instance to improve indexing speed by avoiding GC cost
|
||||
* of new'ing and reclaiming Field instances. Typically
|
||||
* a single {@link Document} instance is re-used as
|
||||
* well. This helps most on small documents.</p>
|
||||
*
|
||||
* <p>Each Field instance should only be used once
|
||||
* within a single {@link Document} instance. See <a
|
||||
* href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
|
||||
* for details.</p> */
|
||||
public void setValue(String value) {
|
||||
if (isBinary) {
|
||||
throw new IllegalArgumentException("cannot set a String value on a binary field");
|
||||
}
|
||||
fieldsData = value;
|
||||
}
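
  // A reuse sketch (illustrative only; "ids", "doc" and "writer" are assumed
  // to exist in the caller):
  //   Field f = new Field("id", "0", Field.Store.YES, Field.Index.NOT_ANALYZED);
  //   doc.add(f);
  //   for (String id : ids) { f.setValue(id); writer.addDocument(doc); }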
|
||||
|
||||
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
|
||||
public void setValue(Reader value) {
|
||||
if (isBinary) {
|
||||
throw new IllegalArgumentException("cannot set a Reader value on a binary field");
|
||||
}
|
||||
if (isStored) {
|
||||
throw new IllegalArgumentException("cannot set a Reader value on a stored field");
|
||||
}
|
||||
fieldsData = value;
|
||||
}
|
||||
|
||||
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
|
||||
public void setValue(byte[] value) {
|
||||
if (!isBinary) {
|
||||
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
|
||||
}
|
||||
fieldsData = value;
|
||||
binaryLength = value.length;
|
||||
binaryOffset = 0;
|
||||
}
|
||||
|
||||
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
|
||||
public void setValue(byte[] value, int offset, int length) {
|
||||
if (!isBinary) {
|
||||
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
|
||||
}
|
||||
fieldsData = value;
|
||||
binaryLength = length;
|
||||
binaryOffset = offset;
|
||||
}
|
||||
|
||||
/** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
|
||||
* May be combined with stored values from stringValue() or getBinaryValue() */
|
||||
public void setTokenStream(TokenStream tokenStream) {
|
||||
this.isIndexed = true;
|
||||
this.isTokenized = true;
|
||||
this.tokenStream = tokenStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a field by specifying its name, value and how it will
|
||||
* be saved in the index. Term vectors will not be stored in the index.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param value The string to process
|
||||
* @param store Whether <code>value</code> should be stored in the index
|
||||
* @param index Whether the field should be indexed, and if so, if it should
|
||||
* be tokenized before indexing
|
||||
* @throws NullPointerException if name or value is <code>null</code>
|
||||
* @throws IllegalArgumentException if the field is neither stored nor indexed
|
||||
*/
|
||||
public Field(String name, String value, Store store, Index index) {
|
||||
this(name, value, store, index, TermVector.NO);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a field by specifying its name, value and how it will
|
||||
* be saved in the index.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param value The string to process
|
||||
* @param store Whether <code>value</code> should be stored in the index
|
||||
* @param index Whether the field should be indexed, and if so, if it should
|
||||
* be tokenized before indexing
|
||||
* @param termVector Whether term vector should be stored
|
||||
* @throws NullPointerException if name or value is <code>null</code>
|
||||
* @throws IllegalArgumentException in any of the following situations:
|
||||
* <ul>
|
||||
* <li>the field is neither stored nor indexed</li>
|
||||
* <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
|
||||
* </ul>
|
||||
*/
|
||||
public Field(String name, String value, Store store, Index index, TermVector termVector) {
|
||||
this(name, true, value, store, index, termVector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a field by specifying its name, value and how it will
|
||||
* be saved in the index.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param internName Whether to .intern() name or not
|
||||
* @param value The string to process
|
||||
* @param store Whether <code>value</code> should be stored in the index
|
||||
* @param index Whether the field should be indexed, and if so, if it should
|
||||
* be tokenized before indexing
|
||||
* @param termVector Whether term vector should be stored
|
||||
* @throws NullPointerException if name or value is <code>null</code>
|
||||
* @throws IllegalArgumentException in any of the following situations:
|
||||
* <ul>
|
||||
* <li>the field is neither stored nor indexed</li>
|
||||
* <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
|
||||
* </ul>
|
||||
*/
|
||||
public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
|
||||
if (name == null)
|
||||
throw new NullPointerException("name cannot be null");
|
||||
if (value == null)
|
||||
throw new NullPointerException("value cannot be null");
|
||||
if (name.length() == 0 && value.length() == 0)
|
||||
throw new IllegalArgumentException("name and value cannot both be empty");
|
||||
if (index == Index.NO && store == Store.NO)
|
||||
throw new IllegalArgumentException("it doesn't make sense to have a field that "
|
||||
+ "is neither indexed nor stored");
|
||||
if (index == Index.NO && termVector != TermVector.NO)
|
||||
throw new IllegalArgumentException("cannot store term vector information "
|
||||
+ "for a field that is not indexed");
|
||||
|
||||
if (internName) // field names are optionally interned
|
||||
name = StringHelper.intern(name);
|
||||
|
||||
this.name = name;
|
||||
|
||||
this.fieldsData = value;
|
||||
|
||||
this.isStored = store.isStored();
|
||||
|
||||
this.isIndexed = index.isIndexed();
|
||||
this.isTokenized = index.isAnalyzed();
|
||||
this.omitNorms = index.omitNorms();
|
||||
if (index == Index.NO) {
|
||||
this.omitTermFreqAndPositions = false;
|
||||
}
|
||||
|
||||
this.isBinary = false;
|
||||
|
||||
setStoreTermVector(termVector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a tokenized and indexed field that is not stored. Term vectors will
|
||||
* not be stored. The Reader is read only when the Document is added to the index,
|
||||
* i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
|
||||
* has been called.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param reader The reader with the content
|
||||
* @throws NullPointerException if name or reader is <code>null</code>
|
||||
*/
|
||||
public Field(String name, Reader reader) {
|
||||
this(name, reader, TermVector.NO);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a tokenized and indexed field that is not stored, optionally with
|
||||
* storing term vectors. The Reader is read only when the Document is added to the index,
|
||||
* i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
|
||||
* has been called.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param reader The reader with the content
|
||||
* @param termVector Whether term vector should be stored
|
||||
* @throws NullPointerException if name or reader is <code>null</code>
|
||||
*/
|
||||
public Field(String name, Reader reader, TermVector termVector) {
|
||||
if (name == null)
|
||||
throw new NullPointerException("name cannot be null");
|
||||
if (reader == null)
|
||||
throw new NullPointerException("reader cannot be null");
|
||||
|
||||
this.name = StringHelper.intern(name); // field names are interned
|
||||
this.fieldsData = reader;
|
||||
|
||||
this.isStored = false;
|
||||
|
||||
this.isIndexed = true;
|
||||
this.isTokenized = true;
|
||||
|
||||
this.isBinary = false;
|
||||
|
||||
setStoreTermVector(termVector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a tokenized and indexed field that is not stored. Term vectors will
|
||||
* not be stored. This is useful for pre-analyzed fields.
|
||||
* The TokenStream is read only when the Document is added to the index,
|
||||
* i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
|
||||
* has been called.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param tokenStream The TokenStream with the content
|
||||
* @throws NullPointerException if name or tokenStream is <code>null</code>
|
||||
*/
|
||||
public Field(String name, TokenStream tokenStream) {
|
||||
this(name, tokenStream, TermVector.NO);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a tokenized and indexed field that is not stored, optionally with
|
||||
* storing term vectors. This is useful for pre-analyzed fields.
|
||||
* The TokenStream is read only when the Document is added to the index,
|
||||
* i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
|
||||
* has been called.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param tokenStream The TokenStream with the content
|
||||
* @param termVector Whether term vector should be stored
|
||||
* @throws NullPointerException if name or tokenStream is <code>null</code>
|
||||
*/
|
||||
public Field(String name, TokenStream tokenStream, TermVector termVector) {
|
||||
if (name == null)
|
||||
throw new NullPointerException("name cannot be null");
|
||||
if (tokenStream == null)
|
||||
throw new NullPointerException("tokenStream cannot be null");
|
||||
|
||||
this.name = StringHelper.intern(name); // field names are interned
|
||||
this.fieldsData = null;
|
||||
this.tokenStream = tokenStream;
|
||||
|
||||
this.isStored = false;
|
||||
|
||||
this.isIndexed = true;
|
||||
this.isTokenized = true;
|
||||
|
||||
this.isBinary = false;
|
||||
|
||||
setStoreTermVector(termVector);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a stored field with binary value. Optionally the value may be compressed.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param value The binary value
|
||||
* @param store How <code>value</code> should be stored (compressed or not)
|
||||
* @throws IllegalArgumentException if store is <code>Store.NO</code>
|
||||
*/
|
||||
public Field(String name, byte[] value, Store store) {
|
||||
this(name, value, 0, value.length, store);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a stored field with binary value. Optionally the value may be compressed.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param value The binary value
|
||||
* @param offset Starting offset in value where this Field's bytes are
|
||||
* @param length Number of bytes to use for this Field, starting at offset
|
||||
* @param store How <code>value</code> should be stored (compressed or not)
|
||||
* @throws IllegalArgumentException if store is <code>Store.NO</code>
|
||||
*/
|
||||
public Field(String name, byte[] value, int offset, int length, Store store) {
|
||||
|
||||
if (name == null)
|
||||
throw new IllegalArgumentException("name cannot be null");
|
||||
if (value == null)
|
||||
throw new IllegalArgumentException("value cannot be null");
|
||||
|
||||
this.name = StringHelper.intern(name); // field names are interned
|
||||
fieldsData = value;
|
||||
|
||||
if (store == Store.NO)
|
||||
throw new IllegalArgumentException("binary values can't be unstored");
|
||||
|
||||
isStored = store.isStored();
|
||||
isIndexed = false;
|
||||
isTokenized = false;
|
||||
omitTermFreqAndPositions = false;
|
||||
omitNorms = true;
|
||||
|
||||
isBinary = true;
|
||||
binaryLength = length;
|
||||
binaryOffset = offset;
|
||||
|
||||
setStoreTermVector(TermVector.NO);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
import java.io.Serializable;
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about
|
||||
 * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)}.
|
||||
*
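 * <p>A minimal sketch: load only a field named "title" (the name is illustrative):
 * <pre>
 *   FieldSelector onlyTitle = new FieldSelector() {
 *     public FieldSelectorResult accept(String fieldName) {
 *       return "title".equals(fieldName) ? FieldSelectorResult.LOAD
 *                                        : FieldSelectorResult.NO_LOAD;
 *     }
 *   };
 * </pre>
 *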
|
||||
**/
|
||||
public interface FieldSelector extends Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
* @param fieldName the field to accept or reject
|
||||
* @return an instance of {@link FieldSelectorResult}
|
||||
* if the {@link Field} named <code>fieldName</code> should be loaded.
|
||||
*/
|
||||
FieldSelectorResult accept(String fieldName);
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Provides information about what should be done with this Field
|
||||
*
|
||||
**/
|
||||
public enum FieldSelectorResult {
|
||||
|
||||
/**
|
||||
* Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encountered.
|
||||
* {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
|
||||
*<p/>
|
||||
* {@link Document#add(Fieldable)} should be called by the Reader.
|
||||
*/
|
||||
LOAD,
|
||||
|
||||
/**
|
||||
* Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until
|
||||
* invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should
|
||||
* return a valid instance of a {@link Fieldable}.
|
||||
*<p/>
|
||||
* {@link Document#add(Fieldable)} should be called by the Reader.
|
||||
*/
|
||||
LAZY_LOAD,
|
||||
|
||||
/**
|
||||
* Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null.
|
||||
* {@link Document#add(Fieldable)} is not called.
|
||||
* <p/>
|
||||
* {@link Document#add(Fieldable)} should not be called by the Reader.
|
||||
*/
|
||||
NO_LOAD,
|
||||
|
||||
/**
|
||||
* Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the
|
||||
* Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should
|
||||
* both be valid for this {@link Field}
|
||||
* <p/>
|
||||
* {@link Document#add(Fieldable)} should be called by the Reader.
|
||||
*/
|
||||
LOAD_AND_BREAK,
|
||||
|
||||
/** Expert: Load the size of this {@link Field} rather than its value.
|
||||
   * Size is measured as the number of bytes required to store the field: the raw byte count for a binary or compressed value, and 2*chars for a String value.
|
||||
* The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0]
|
||||
*/
|
||||
SIZE,
|
||||
|
||||
/** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */
|
||||
SIZE_AND_BREAK
|
||||
}
|
|
@ -0,0 +1,212 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.FieldInvertState; // for javadocs
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Synonymous with {@link Field}.
|
||||
*
|
||||
 * <p><b>WARNING</b>: This interface may change within minor versions, despite Lucene's backward compatibility requirements.
|
||||
* This means new methods may be added from version to version. This change only affects the Fieldable API; other backwards
|
||||
* compatibility promises remain intact. For example, Lucene can still
|
||||
* read and write indices created within the same major version.
|
||||
* </p>
|
||||
*
|
||||
**/
|
||||
public interface Fieldable extends Serializable {
|
||||
  /** Sets the boost factor for hits on this field. This value will be
|
||||
   * multiplied into the score of all hits on this field of this
|
||||
* document.
|
||||
*
|
||||
* <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
|
||||
* containing this field. If a document has multiple fields with the same
|
||||
* name, all such values are multiplied together. This product is then
|
||||
* used to compute the norm factor for the field. By
|
||||
* default, in the {@link
|
||||
* org.apache.lucene.search.Similarity#computeNorm(String,
|
||||
* FieldInvertState)} method, the boost value is multiplied
|
||||
* by the {@link
|
||||
* org.apache.lucene.search.Similarity#lengthNorm(String,
|
||||
* int)} and then rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
|
||||
* index. One should attempt to ensure that this product does not overflow
|
||||
* the range of that encoding.
|
||||
*
|
||||
* @see org.apache.lucene.document.Document#setBoost(float)
|
||||
* @see org.apache.lucene.search.Similarity#computeNorm(String, FieldInvertState)
|
||||
* @see org.apache.lucene.search.Similarity#encodeNorm(float)
|
||||
*/
|
||||
void setBoost(float boost);
|
||||
|
||||
/** Returns the boost factor for hits for this field.
|
||||
*
|
||||
* <p>The default value is 1.0.
|
||||
*
|
||||
* <p>Note: this value is not stored directly with the document in the index.
|
||||
* Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
|
||||
* {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
|
||||
* this field was indexed.
|
||||
*
|
||||
* @see #setBoost(float)
|
||||
*/
|
||||
float getBoost();
|
||||
|
||||
/** Returns the name of the field as an interned string.
|
||||
* For example "date", "title", "body", ...
|
||||
*/
|
||||
String name();
|
||||
|
||||
/** The value of the field as a String, or null.
|
||||
* <p>
|
||||
* For indexing, if isStored()==true, the stringValue() will be used as the stored field value
|
||||
* unless isBinary()==true, in which case getBinaryValue() will be used.
|
||||
*
|
||||
* If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
|
||||
* If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null,
|
||||
* else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens.
|
||||
*/
|
||||
public String stringValue();
|
||||
|
||||
/** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
|
||||
* @see #stringValue()
|
||||
*/
|
||||
public Reader readerValue();
|
||||
|
||||
/** The TokenStream for this field to be used when indexing, or null.
|
||||
* @see #stringValue()
|
||||
*/
|
||||
public TokenStream tokenStreamValue();
|
||||
|
||||
/** True if the value of the field is to be stored in the index for return
|
||||
with search hits. */
|
||||
boolean isStored();
|
||||
|
||||
/** True if the value of the field is to be indexed, so that it may be
|
||||
searched on. */
|
||||
boolean isIndexed();
|
||||
|
||||
/** True if the value of the field should be tokenized as text prior to
|
||||
indexing. Un-tokenized fields are indexed as a single word and may not be
|
||||
Reader-valued. */
|
||||
boolean isTokenized();
|
||||
|
||||
/** True if the term or terms used to index this field are stored as a term
|
||||
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
|
||||
* These methods do not provide access to the original content of the field,
|
||||
* only to terms used to index it. If the original content must be
|
||||
* preserved, use the <code>stored</code> attribute instead.
|
||||
*
|
||||
* @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String)
|
||||
*/
|
||||
boolean isTermVectorStored();
|
||||
|
||||
/**
|
||||
* True if terms are stored as term vector together with their offsets
|
||||
   * (start and end position in source text).
|
||||
*/
|
||||
boolean isStoreOffsetWithTermVector();
|
||||
|
||||
/**
|
||||
* True if terms are stored as term vector together with their token positions.
|
||||
*/
|
||||
boolean isStorePositionWithTermVector();
|
||||
|
||||
/** True if the value of the field is stored as binary */
|
||||
boolean isBinary();
|
||||
|
||||
/** True if norms are omitted for this indexed field */
|
||||
boolean getOmitNorms();
|
||||
|
||||
/** Expert:
|
||||
*
|
||||
* If set, omit normalization factors associated with this indexed field.
|
||||
* This effectively disables indexing boosts and length normalization for this field.
|
||||
*/
|
||||
void setOmitNorms(boolean omitNorms);
|
||||
|
||||
/**
|
||||
* Indicates whether a Field is Lazy or not. The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving
|
||||
   * its values via {@link #stringValue()} or {@link #getBinaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that
|
||||
* retrieved the {@link Document} is still open.
|
||||
*
|
||||
* @return true if this field can be loaded lazily
|
||||
*/
|
||||
boolean isLazy();
|
||||
|
||||
/**
|
||||
   * Returns the offset into the byte[] segment that is used as the value; if the Field is not binary,
|
||||
   * the returned value is undefined
|
||||
* @return index of the first character in byte[] segment that represents this Field value
|
||||
*/
|
||||
abstract int getBinaryOffset();
|
||||
|
||||
/**
|
||||
   * Returns the length of the byte[] segment that is used as the value; if the Field is not binary,
|
||||
   * the returned value is undefined
|
||||
* @return length of byte[] segment that represents this Field value
|
||||
*/
|
||||
abstract int getBinaryLength();
|
||||
|
||||
/**
|
||||
* Return the raw byte[] for the binary field. Note that
|
||||
* you must also call {@link #getBinaryLength} and {@link
|
||||
* #getBinaryOffset} to know which range of bytes in this
|
||||
* returned array belong to the field.
|
||||
* @return reference to the Field value as byte[].
|
||||
*/
|
||||
abstract byte[] getBinaryValue();
|
||||
|
||||
/**
|
||||
* Return the raw byte[] for the binary field. Note that
|
||||
* you must also call {@link #getBinaryLength} and {@link
|
||||
* #getBinaryOffset} to know which range of bytes in this
|
||||
* returned array belong to the field.<p>
|
||||
* About reuse: if you pass in the result byte[] and it is
|
||||
* used, likely the underlying implementation will hold
|
||||
* onto this byte[] and return it in future calls to
|
||||
* {@link #getBinaryValue()}.
|
||||
* So if you subsequently re-use the same byte[] elsewhere
|
||||
* it will alter this Fieldable's value.
|
||||
* @param result User defined buffer that will be used if
|
||||
* possible. If this is null or not large enough, a new
|
||||
* buffer is allocated
|
||||
* @return reference to the Field value as byte[].
|
||||
*/
|
||||
abstract byte[] getBinaryValue(byte[] result);
|
||||
|
||||
/** @see #setOmitTermFreqAndPositions */
|
||||
boolean getOmitTermFreqAndPositions();
|
||||
|
||||
/** Expert:
|
||||
*
|
||||
* If set, omit term freq, positions and payloads from
|
||||
* postings for this field.
|
||||
*
|
||||
* <p><b>NOTE</b>: While this option reduces storage space
|
||||
* required in the index, it also means any query
|
||||
* requiring positional information, such as {@link
|
||||
* PhraseQuery} or {@link SpanQuery} subclasses will
|
||||
* silently fail to find results.
|
||||
*/
|
||||
void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions);
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package org.apache.lucene.document;
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Load the First field and break.
|
||||
* <p/>
|
||||
* See {@link FieldSelectorResult#LOAD_AND_BREAK}
|
||||
*/
|
||||
public class LoadFirstFieldSelector implements FieldSelector {
|
||||
|
||||
public FieldSelectorResult accept(String fieldName) {
|
||||
return FieldSelectorResult.LOAD_AND_BREAK;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A {@link FieldSelector} based on a Map of field names to {@link FieldSelectorResult}s
|
||||
*
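 * <p>A usage sketch (field names and the reader variable are illustrative):
 * <pre>
 *   FieldSelector sel = new MapFieldSelector("id", "title"); // LOAD these, NO_LOAD the rest
 *   Document doc = reader.document(docNum, sel);
 * </pre>
 *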
|
||||
*/
|
||||
public class MapFieldSelector implements FieldSelector {
|
||||
|
||||
Map<String,FieldSelectorResult> fieldSelections;
|
||||
|
||||
    /** Create a MapFieldSelector
|
||||
* @param fieldSelections maps from field names (String) to {@link FieldSelectorResult}s
|
||||
*/
|
||||
public MapFieldSelector(Map<String,FieldSelectorResult> fieldSelections) {
|
||||
this.fieldSelections = fieldSelections;
|
||||
}
|
||||
|
||||
    /** Create a MapFieldSelector
|
||||
* @param fields fields to LOAD. List of Strings. All other fields are NO_LOAD.
|
||||
*/
|
||||
public MapFieldSelector(List<String> fields) {
|
||||
fieldSelections = new HashMap<String,FieldSelectorResult>(fields.size()*5/3);
|
||||
for (final String field : fields)
|
||||
fieldSelections.put(field, FieldSelectorResult.LOAD);
|
||||
}
|
||||
|
||||
/** Create a a MapFieldSelector
|
||||
* @param fields fields to LOAD. All other fields are NO_LOAD.
|
||||
*/
|
||||
public MapFieldSelector(String... fields) {
|
||||
this(Arrays.asList(fields));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Load field according to its associated value in fieldSelections
|
||||
* @param field a field name
|
||||
* @return the fieldSelections value that field maps to or NO_LOAD if none.
|
||||
*/
|
||||
public FieldSelectorResult accept(String field) {
|
||||
FieldSelectorResult selection = fieldSelections.get(field);
|
||||
return selection!=null ? selection : FieldSelectorResult.NO_LOAD;
|
||||
}
|
||||
|
||||
}
|
|
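FieldSelectors only take effect when handed to the document-loading call, so a minimal usage sketch may help (assuming the Lucene 2.9/3.0 IndexReader.document(int, FieldSelector) overload; reader, docID and the field names are hypothetical):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.MapFieldSelector;

    // Load only "title" and "date"; every other field is NO_LOAD.
    MapFieldSelector selector = new MapFieldSelector("title", "date");
    Document doc = reader.document(docID, selector);
    String title = doc.get("title"); // present
    String body  = doc.get("body");  // null: never read from the store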
@@ -0,0 +1,139 @@
package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.util.NumericUtils; // for javadocs

// do not remove this class in 3.0, it may be needed to decode old indexes!

/**
 * Provides support for converting longs to Strings, and back again. The strings
 * are structured so that lexicographic sorting order is preserved.
 *
 * <p>
 * That is, if l1 is less than l2 for any two longs l1 and l2, then
 * NumberTools.longToString(l1) is lexicographically less than
 * NumberTools.longToString(l2). (Similarly for "greater than" and "equals".)
 *
 * <p>
 * This class handles <b>all</b> long values (unlike
 * {@link org.apache.lucene.document.DateField}).
 *
 * @deprecated For new indexes use {@link NumericUtils} instead, which
 * provides a sortable binary representation (prefix encoded) of numeric
 * values.
 * To index and efficiently query numeric values use {@link NumericField}
 * and {@link NumericRangeQuery}.
 * This class is included for use with existing
 * indices and will be removed in a future release (possibly Lucene 4.0).
 */
public class NumberTools {

  private static final int RADIX = 36;

  private static final char NEGATIVE_PREFIX = '-';

  // NB: NEGATIVE_PREFIX must be < POSITIVE_PREFIX
  private static final char POSITIVE_PREFIX = '0';

  //NB: this must be less than
  /**
   * Equivalent to longToString(Long.MIN_VALUE)
   */
  public static final String MIN_STRING_VALUE = NEGATIVE_PREFIX
      + "0000000000000";

  /**
   * Equivalent to longToString(Long.MAX_VALUE)
   */
  public static final String MAX_STRING_VALUE = POSITIVE_PREFIX
      + "1y2p0ij32e8e7";

  /**
   * The length of (all) strings returned by {@link #longToString}
   */
  public static final int STR_SIZE = MIN_STRING_VALUE.length();

  /**
   * Converts a long to a String suitable for indexing.
   */
  public static String longToString(long l) {

    if (l == Long.MIN_VALUE) {
      // special case, because long is not symmetric around zero
      return MIN_STRING_VALUE;
    }

    StringBuilder buf = new StringBuilder(STR_SIZE);

    if (l < 0) {
      buf.append(NEGATIVE_PREFIX);
      l = Long.MAX_VALUE + l + 1;
    } else {
      buf.append(POSITIVE_PREFIX);
    }
    String num = Long.toString(l, RADIX);

    int padLen = STR_SIZE - num.length() - buf.length();
    while (padLen-- > 0) {
      buf.append('0');
    }
    buf.append(num);

    return buf.toString();
  }

  /**
   * Converts a String that was returned by {@link #longToString} back to a
   * long.
   *
   * @throws NullPointerException
   *             if the input is null
   * @throws NumberFormatException
   *             if the input does not parse (it was not a String returned by
   *             longToString()).
   */
  public static long stringToLong(String str) {
    if (str == null) {
      throw new NullPointerException("string cannot be null");
    }
    if (str.length() != STR_SIZE) {
      throw new NumberFormatException("string is the wrong size");
    }

    if (str.equals(MIN_STRING_VALUE)) {
      return Long.MIN_VALUE;
    }

    char prefix = str.charAt(0);
    long l = Long.parseLong(str.substring(1), RADIX);

    if (prefix == POSITIVE_PREFIX) {
      // nop
    } else if (prefix == NEGATIVE_PREFIX) {
      l = l - Long.MAX_VALUE - 1;
    } else {
      throw new NumberFormatException(
          "string does not begin with the correct prefix");
    }

    return l;
  }
}
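A standalone check of the order-preservation claim, using only the class above (values chosen arbitrarily):

    long[] values = { Long.MIN_VALUE, -42L, -1L, 0L, 1L, 42L, Long.MAX_VALUE };
    for (int i = 1; i < values.length; i++) {
      String prev = NumberTools.longToString(values[i - 1]);
      String cur  = NumberTools.longToString(values[i]);
      // numeric order matches lexicographic order of the encodings...
      assert prev.compareTo(cur) < 0;
      // ...and the encoding round-trips
      assert NumberTools.stringToLong(cur) == values[i];
    }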
@@ -0,0 +1,277 @@
package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.search.NumericRangeQuery; // javadocs
import org.apache.lucene.search.NumericRangeFilter; // javadocs
import org.apache.lucene.search.SortField; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs

/**
 * <p>This class provides a {@link Field} that enables indexing
 * of numeric values for efficient range filtering and
 * sorting.  Here's an example usage, adding an int value:
 * <pre>
 *  document.add(new NumericField(name).setIntValue(value));
 * </pre>
 *
 * For optimal performance, re-use the
 * <code>NumericField</code> and {@link Document} instance for more than
 * one document:
 *
 * <pre>
 *  NumericField field = new NumericField(name);
 *  Document document = new Document();
 *  document.add(field);
 *
 *  for(all documents) {
 *    ...
 *    field.setIntValue(value)
 *    writer.addDocument(document);
 *    ...
 *  }
 * </pre>
 *
 * <p>The Java native types <code>int</code>, <code>long</code>,
 * <code>float</code> and <code>double</code> are
 * directly supported.  However, any value that can be
 * converted into these native types can also be indexed.
 * For example, date/time values represented by a
 * {@link java.util.Date} can be translated into a long
 * value using the {@link java.util.Date#getTime} method.  If you
 * don't need millisecond precision, you can quantize the
 * value, either by dividing the result of
 * {@link java.util.Date#getTime} or using the separate getters
 * (for year, month, etc.) to construct an <code>int</code> or
 * <code>long</code> value.</p>
 *
 * <p>To perform range querying or filtering against a
 * <code>NumericField</code>, use {@link NumericRangeQuery} or {@link
 * NumericRangeFilter}.  To sort according to a
 * <code>NumericField</code>, use the normal numeric sort types, e.g.
 * {@link SortField#INT}. <code>NumericField</code> values
 * can also be loaded directly from {@link FieldCache}.</p>
 *
 * <p>By default, a <code>NumericField</code>'s value is not stored but
 * is indexed for range filtering and sorting.  You can use
 * the {@link #NumericField(String,Field.Store,boolean)}
 * constructor if you need to change these defaults.</p>
 *
 * <p>You may add the same field name as a <code>NumericField</code> to
 * the same document more than once.  Range querying and
 * filtering will be the logical OR of all values; so a range query
 * will hit all documents that have at least one value in
 * the range.  However, sort behavior is not defined.  If you need to sort,
 * you should separately index a single-valued <code>NumericField</code>.</p>
 *
 * <p>A <code>NumericField</code> will consume somewhat more disk space
 * in the index than an ordinary single-valued field.
 * However, for a typical index that includes substantial
 * textual content per document, this increase will likely
 * be in the noise. </p>
 *
 * <p>Within Lucene, each numeric value is indexed as a
 * <em>trie</em> structure, where each term is logically
 * assigned to larger and larger pre-defined brackets (which
 * are simply lower-precision representations of the value).
 * The step size between each successive bracket is called the
 * <code>precisionStep</code>, measured in bits.  Smaller
 * <code>precisionStep</code> values result in a larger number
 * of brackets, which consumes more disk space in the index
 * but may result in faster range search performance.  The
 * default value, 4, was selected for a reasonable tradeoff
 * of disk space consumption versus performance.  You can
 * use the expert constructor {@link
 * #NumericField(String,int,Field.Store,boolean)} if you'd
 * like to change the value.  Note that you must also
 * specify a congruent value when creating {@link
 * NumericRangeQuery} or {@link NumericRangeFilter}.
 * For low cardinality fields larger precision steps are good.
 * If the cardinality is &lt; 100, it is fair
 * to use {@link Integer#MAX_VALUE}, which produces one
 * term per value.
 *
 * <p>For more information on the internals of numeric trie
 * indexing, including the <a
 * href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
 * configuration, see {@link NumericRangeQuery}. The format of
 * indexed values is described in {@link NumericUtils}.
 *
 * <p>If you only need to sort by numeric value, and never
 * run range querying/filtering, you can index using a
 * <code>precisionStep</code> of {@link Integer#MAX_VALUE}.
 * This will minimize disk space consumed. </p>
 *
 * <p>More advanced users can instead use {@link
 * NumericTokenStream} directly, when indexing numbers. This
 * class is a wrapper around this token stream type for
 * easier, more intuitive usage.</p>
 *
 * <p><b>NOTE:</b> This class is only used during
 * indexing. When retrieving the stored field value from a
 * {@link Document} instance after search, you will get a
 * conventional {@link Fieldable} instance where the numeric
 * values are returned as {@link String}s (according to
 * <code>toString(value)</code> of the used data type).
 *
 * <p><font color="red"><b>NOTE:</b> This API is
 * experimental and might change in incompatible ways in the
 * next release.</font>
 *
 * @since 2.9
 */
public final class NumericField extends AbstractField {

  private final NumericTokenStream tokenStream;

  /**
   * Creates a field for numeric values using the default <code>precisionStep</code>
   * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with
   * a numeric value; before indexing a document containing this field,
   * set a value using the various set<em>???</em>Value() methods.
   * This constructor creates an indexed, but not stored field.
   * @param name the field name
   */
  public NumericField(String name) {
    this(name, NumericUtils.PRECISION_STEP_DEFAULT, Field.Store.NO, true);
  }

  /**
   * Creates a field for numeric values using the default <code>precisionStep</code>
   * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with
   * a numeric value; before indexing a document containing this field,
   * set a value using the various set<em>???</em>Value() methods.
   * @param name the field name
   * @param store if the field should be stored in plain text form
   *  (according to <code>toString(value)</code> of the used data type)
   * @param index if the field should be indexed using {@link NumericTokenStream}
   */
  public NumericField(String name, Field.Store store, boolean index) {
    this(name, NumericUtils.PRECISION_STEP_DEFAULT, store, index);
  }

  /**
   * Creates a field for numeric values with the specified
   * <code>precisionStep</code>. The instance is not yet initialized with
   * a numeric value; before indexing a document containing this field,
   * set a value using the various set<em>???</em>Value() methods.
   * This constructor creates an indexed, but not stored field.
   * @param name the field name
   * @param precisionStep the used <a href="../search/NumericRangeQuery.html#precisionStepDesc">precision step</a>
   */
  public NumericField(String name, int precisionStep) {
    this(name, precisionStep, Field.Store.NO, true);
  }

  /**
   * Creates a field for numeric values with the specified
   * <code>precisionStep</code>. The instance is not yet initialized with
   * a numeric value; before indexing a document containing this field,
   * set a value using the various set<em>???</em>Value() methods.
   * @param name the field name
   * @param precisionStep the used <a href="../search/NumericRangeQuery.html#precisionStepDesc">precision step</a>
   * @param store if the field should be stored in plain text form
   *  (according to <code>toString(value)</code> of the used data type)
   * @param index if the field should be indexed using {@link NumericTokenStream}
   */
  public NumericField(String name, int precisionStep, Field.Store store, boolean index) {
    super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO);
    setOmitTermFreqAndPositions(true);
    tokenStream = new NumericTokenStream(precisionStep);
  }

  /** Returns a {@link NumericTokenStream} for indexing the numeric value. */
  public TokenStream tokenStreamValue() {
    return isIndexed() ? tokenStream : null;
  }

  /** Always returns <code>null</code> for numeric fields */
  @Override
  public byte[] getBinaryValue(byte[] result){
    return null;
  }

  /** Always returns <code>null</code> for numeric fields */
  public Reader readerValue() {
    return null;
  }

  /** Returns the numeric value as a string (how it is stored, when {@link Field.Store#YES} is chosen). */
  public String stringValue() {
    return (fieldsData == null) ? null : fieldsData.toString();
  }

  /** Returns the current numeric value as a subclass of {@link Number}, <code>null</code> if not yet initialized. */
  public Number getNumericValue() {
    return (Number) fieldsData;
  }

  /**
   * Initializes the field with the supplied <code>long</code> value.
   * @param value the numeric value
   * @return this instance, because of this you can use it the following way:
   * <code>document.add(new NumericField(name, precisionStep).setLongValue(value))</code>
   */
  public NumericField setLongValue(final long value) {
    tokenStream.setLongValue(value);
    fieldsData = Long.valueOf(value);
    return this;
  }

  /**
   * Initializes the field with the supplied <code>int</code> value.
   * @param value the numeric value
   * @return this instance, because of this you can use it the following way:
   * <code>document.add(new NumericField(name, precisionStep).setIntValue(value))</code>
   */
  public NumericField setIntValue(final int value) {
    tokenStream.setIntValue(value);
    fieldsData = Integer.valueOf(value);
    return this;
  }

  /**
   * Initializes the field with the supplied <code>double</code> value.
   * @param value the numeric value
   * @return this instance, because of this you can use it the following way:
   * <code>document.add(new NumericField(name, precisionStep).setDoubleValue(value))</code>
   */
  public NumericField setDoubleValue(final double value) {
    tokenStream.setDoubleValue(value);
    fieldsData = Double.valueOf(value);
    return this;
  }

  /**
   * Initializes the field with the supplied <code>float</code> value.
   * @param value the numeric value
   * @return this instance, because of this you can use it the following way:
   * <code>document.add(new NumericField(name, precisionStep).setFloatValue(value))</code>
   */
  public NumericField setFloatValue(final float value) {
    tokenStream.setFloatValue(value);
    fieldsData = Float.valueOf(value);
    return this;
  }

}
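To connect indexing and querying, a minimal end-to-end sketch (assuming the Lucene 2.9/3.0 APIs named in the javadoc; writer, the "price" field and the bounds are hypothetical). The precisionStep passed to the query must match the one used at index time:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericField;
    import org.apache.lucene.search.NumericRangeQuery;

    final int precisionStep = 4; // the default; must be identical on both sides

    NumericField price = new NumericField("price", precisionStep, Field.Store.NO, true);
    Document doc = new Document();
    doc.add(price.setIntValue(1299));
    writer.addDocument(doc); // writer: an open IndexWriter

    // matches all documents with 1000 <= price <= 2000, both ends inclusive
    NumericRangeQuery q = NumericRangeQuery.newIntRange(
        "price", precisionStep, Integer.valueOf(1000), Integer.valueOf(2000), true, true);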
@@ -0,0 +1,58 @@
package org.apache.lucene.document;

import java.util.Set;
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Declare what fields to load normally and what fields to load lazily
 *
 **/
public class SetBasedFieldSelector implements FieldSelector {

  private Set<String> fieldsToLoad;
  private Set<String> lazyFieldsToLoad;

  /**
   * Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily.  If both are null, the
   * Document will not have any {@link Field} on it.
   * @param fieldsToLoad A Set of {@link String} field names to load.  May be empty, but not null
   * @param lazyFieldsToLoad A Set of {@link String} field names to load lazily.  May be empty, but not null
   */
  public SetBasedFieldSelector(Set<String> fieldsToLoad, Set<String> lazyFieldsToLoad) {
    this.fieldsToLoad = fieldsToLoad;
    this.lazyFieldsToLoad = lazyFieldsToLoad;
  }

  /**
   * Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the
   * initializing Sets, then {@link org.apache.lucene.document.FieldSelectorResult#NO_LOAD} is returned.  If a Field name
   * is in both <code>fieldsToLoad</code> and <code>lazyFieldsToLoad</code>, lazy has precedence.
   *
   * @param fieldName The {@link Field} name to check
   * @return The {@link FieldSelectorResult}
   */
  public FieldSelectorResult accept(String fieldName) {
    FieldSelectorResult result = FieldSelectorResult.NO_LOAD;
    if (fieldsToLoad.contains(fieldName)) {
      result = FieldSelectorResult.LOAD;
    }
    if (lazyFieldsToLoad.contains(fieldName)) {
      result = FieldSelectorResult.LAZY_LOAD;
    }
    return result;
  }
}
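A sketch of the lazy-loading pattern this class enables (assuming Lucene 2.9/3.0 APIs; reader, docID and the field names are hypothetical):

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.SetBasedFieldSelector;

    Set<String> eager = new HashSet<String>(Arrays.asList("title", "url"));
    Set<String> lazy  = Collections.singleton("body"); // large field, fetch on demand

    Document doc = reader.document(docID, new SetBasedFieldSelector(eager, lazy));
    String title = doc.get("title");                        // loaded up front
    String body  = doc.getFieldable("body").stringValue();  // read only now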
@@ -0,0 +1,56 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
<p>The logical representation of a {@link org.apache.lucene.document.Document} for indexing and searching.</p>
<p>The document package provides the user level logical representation of content to be indexed and searched.  The
package also provides utilities for working with {@link org.apache.lucene.document.Document}s and {@link org.apache.lucene.document.Fieldable}s.</p>
<h2>Document and Fieldable</h2>
<p>A {@link org.apache.lucene.document.Document} is a collection of {@link org.apache.lucene.document.Fieldable}s.  A
{@link org.apache.lucene.document.Fieldable} is a logical representation of a user's content that needs to be indexed or stored.
{@link org.apache.lucene.document.Fieldable}s have a number of properties that tell Lucene how to treat the content (like indexed, tokenized,
stored, etc.)  See the {@link org.apache.lucene.document.Field} implementation of {@link org.apache.lucene.document.Fieldable}
for specifics on these properties.
</p>
<p>Note: it is common to refer to {@link org.apache.lucene.document.Document}s having {@link org.apache.lucene.document.Field}s, even though technically they have
{@link org.apache.lucene.document.Fieldable}s.</p>
<h2>Working with Documents</h2>
<p>First and foremost, a {@link org.apache.lucene.document.Document} is something created by the user application.  It is your job
to create Documents based on the content of the files you are working with in your application (Word, txt, PDF, Excel or any other format.)
How this is done is completely up to you.  That being said, there are many tools available in other projects that can ease
the process of taking a file and converting it into a Lucene {@link org.apache.lucene.document.Document}.  To see an example of this,
take a look at the Lucene <a href="../../../../../../gettingstarted.html" target="top">demo</a> and the associated source code
for extracting content from HTML.
</p>
<p>The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times searchable
(remember, Lucene only searches text). {@link org.apache.lucene.document.NumericField} is a special helper class
to simplify indexing of numeric values (and also dates) for fast range queries with {@link org.apache.lucene.search.NumericRangeQuery}
(using a special sortable string representation of numeric values).</p>
<p>The {@link org.apache.lucene.document.FieldSelector} class provides a mechanism to tell Lucene how to load Documents from
storage.  If no FieldSelector is used, all Fieldables on a Document will be loaded.  As an example of the FieldSelector usage, consider
the common use case of
displaying search results on a web page and then having users click through to see the full document.  In this scenario, it is often
the case that there are many small fields and one or two large fields (containing the contents of the original file).  Before the FieldSelector,
the full Document had to be loaded, including the large fields, in order to display the results.  Now, using the FieldSelector, one
can {@link org.apache.lucene.document.FieldSelectorResult#LAZY_LOAD} the large fields, thus only loading the large fields
when a user clicks on the actual link to view the original content.</p>
</body>
</html>
@@ -0,0 +1,86 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import org.apache.lucene.util.BitVector;
import java.io.IOException;

class AllTermDocs implements TermDocs {
  protected BitVector deletedDocs;
  protected int maxDoc;
  protected int doc = -1;

  protected AllTermDocs(SegmentReader parent) {
    synchronized (parent) {
      this.deletedDocs = parent.deletedDocs;
    }
    this.maxDoc = parent.maxDoc();
  }

  public void seek(Term term) throws IOException {
    if (term == null) {
      doc = -1;
    } else {
      throw new UnsupportedOperationException();
    }
  }

  public void seek(TermEnum termEnum) throws IOException {
    throw new UnsupportedOperationException();
  }

  public int doc() {
    return doc;
  }

  public int freq() {
    return 1;
  }

  public boolean next() throws IOException {
    return skipTo(doc+1);
  }

  public int read(int[] docs, int[] freqs) throws IOException {
    final int length = docs.length;
    int i = 0;
    while (i < length && doc < maxDoc) {
      if (deletedDocs == null || !deletedDocs.get(doc)) {
        docs[i] = doc;
        freqs[i] = 1;
        ++i;
      }
      doc++;
    }
    return i;
  }

  public boolean skipTo(int target) throws IOException {
    doc = target;
    while (doc < maxDoc) {
      if (deletedDocs == null || !deletedDocs.get(doc)) {
        return true;
      }
      doc++;
    }
    return false;
  }

  public void close() throws IOException {
  }
}
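AllTermDocs is what an IndexReader hands back for a null term: an enumeration of every non-deleted document, with freq() constant at 1. A hedged usage sketch (reader is an open IndexReader):

    import org.apache.lucene.index.TermDocs;

    TermDocs td = reader.termDocs(null); // all non-deleted docs
    try {
      while (td.next()) {
        int docID = td.doc();
        // ... process docID ...
      }
    } finally {
      td.close();
    }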
@@ -0,0 +1,153 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;

import org.apache.lucene.search.Query;

/** Holds buffered deletes, by docID, term or query.  We
 *  hold two instances of this class: one for the deletes
 *  prior to the last flush, the other for deletes after
 *  the last flush.  This is so if we need to abort
 *  (discard all buffered docs) we can also discard the
 *  buffered deletes yet keep the deletes done during
 *  previously flushed segments. */
class BufferedDeletes {
  int numTerms;
  HashMap<Term,Num> terms = new HashMap<Term,Num>();
  HashMap<Query,Integer> queries = new HashMap<Query,Integer>();
  List<Integer> docIDs = new ArrayList<Integer>();
  long bytesUsed;

  // Number of documents a delete term applies to.
  final static class Num {
    private int num;

    Num(int num) {
      this.num = num;
    }

    int getNum() {
      return num;
    }

    void setNum(int num) {
      // Only record the new number if it's greater than the
      // current one.  This is important because if multiple
      // threads are replacing the same doc at nearly the
      // same time, it's possible that one thread that got a
      // higher docID is scheduled before the other
      // threads.
      if (num > this.num)
        this.num = num;
    }
  }

  int size() {
    // We use numTerms not terms.size() intentionally, so
    // that deletes by the same term multiple times "count",
    // ie if you ask to flush every 1000 deletes then even
    // dup'd terms are counted towards that 1000
    return numTerms + queries.size() + docIDs.size();
  }

  void update(BufferedDeletes in) {
    numTerms += in.numTerms;
    bytesUsed += in.bytesUsed;
    terms.putAll(in.terms);
    queries.putAll(in.queries);
    docIDs.addAll(in.docIDs);
    in.clear();
  }

  void clear() {
    terms.clear();
    queries.clear();
    docIDs.clear();
    numTerms = 0;
    bytesUsed = 0;
  }

  void addBytesUsed(long b) {
    bytesUsed += b;
  }

  boolean any() {
    return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
  }

  // Remaps all buffered deletes based on a completed
  // merge
  synchronized void remap(MergeDocIDRemapper mapper,
                          SegmentInfos infos,
                          int[][] docMaps,
                          int[] delCounts,
                          MergePolicy.OneMerge merge,
                          int mergeDocCount) {

    final HashMap<Term,Num> newDeleteTerms;

    // Remap delete-by-term
    if (terms.size() > 0) {
      newDeleteTerms = new HashMap<Term,Num>();
      for(Entry<Term,Num> entry : terms.entrySet()) {
        Num num = entry.getValue();
        newDeleteTerms.put(entry.getKey(),
                           new Num(mapper.remap(num.getNum())));
      }
    } else
      newDeleteTerms = null;

    // Remap delete-by-docID
    final List<Integer> newDeleteDocIDs;

    if (docIDs.size() > 0) {
      newDeleteDocIDs = new ArrayList<Integer>(docIDs.size());
      for (Integer num : docIDs) {
        newDeleteDocIDs.add(Integer.valueOf(mapper.remap(num.intValue())));
      }
    } else
      newDeleteDocIDs = null;

    // Remap delete-by-query
    final HashMap<Query,Integer> newDeleteQueries;

    if (queries.size() > 0) {
      newDeleteQueries = new HashMap<Query,Integer>(queries.size());
      for(Entry<Query,Integer> entry : queries.entrySet()) {
        Integer num = entry.getValue();
        newDeleteQueries.put(entry.getKey(),
                             Integer.valueOf(mapper.remap(num.intValue())));
      }
    } else
      newDeleteQueries = null;

    if (newDeleteTerms != null)
      terms = newDeleteTerms;
    if (newDeleteDocIDs != null)
      docIDs = newDeleteDocIDs;
    if (newDeleteQueries != null)
      queries = newDeleteQueries;
  }
}
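The monotonic guard in Num.setNum is the subtle part: when several threads replace the same term at nearly the same time, a delete's docID limit must only ever grow. A toy illustration (both classes are package-private, so this only compiles from within org.apache.lucene.index):

    BufferedDeletes.Num limit = new BufferedDeletes.Num(17);
    limit.setNum(12); // ignored: a lower docID must not narrow the delete's reach
    limit.setNum(23); // recorded: the delete now applies through docID 23
    assert limit.getNum() == 23;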
@@ -0,0 +1,147 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Class that Posting and PostingVector use to write byte
 * streams into shared fixed-size byte[] arrays.  The idea
 * is to allocate slices of increasing lengths.  For
 * example, the first slice is 5 bytes, the next slice is
 * 14, etc.  We start by writing our bytes into the first
 * 5 bytes.  When we hit the end of the slice, we allocate
 * the next slice and then write the address of the new
 * slice into the last 4 bytes of the previous slice (the
 * "forwarding address").
 *
 * Each slice is filled with 0's initially, and we mark
 * the end with a non-zero byte.  This way the methods
 * that are writing into the slice don't need to record
 * its length and instead allocate a new slice once they
 * hit a non-zero byte. */

import java.util.Arrays;

final class ByteBlockPool {

  abstract static class Allocator {
    abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
    abstract byte[] getByteBlock(boolean trackAllocations);
  }

  public byte[][] buffers = new byte[10][];

  int bufferUpto = -1;                                               // Which buffer we are upto
  public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE;             // Where we are in head buffer

  public byte[] buffer;                                              // Current head buffer
  public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE;          // Current head offset

  private final boolean trackAllocations;
  private final Allocator allocator;

  public ByteBlockPool(Allocator allocator, boolean trackAllocations) {
    this.allocator = allocator;
    this.trackAllocations = trackAllocations;
  }

  public void reset() {
    if (bufferUpto != -1) {
      // We allocated at least one buffer

      for(int i=0;i<bufferUpto;i++)
        // Fully zero fill buffers that we fully used
        Arrays.fill(buffers[i], (byte) 0);

      // Partial zero fill the final buffer
      Arrays.fill(buffers[bufferUpto], 0, byteUpto, (byte) 0);

      if (bufferUpto > 0)
        // Recycle all but the first buffer
        allocator.recycleByteBlocks(buffers, 1, 1+bufferUpto);

      // Re-use the first buffer
      bufferUpto = 0;
      byteUpto = 0;
      byteOffset = 0;
      buffer = buffers[0];
    }
  }

  public void nextBuffer() {
    if (1+bufferUpto == buffers.length) {
      byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][];
      System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
      buffers = newBuffers;
    }
    buffer = buffers[1+bufferUpto] = allocator.getByteBlock(trackAllocations);
    bufferUpto++;

    byteUpto = 0;
    byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE;
  }

  public int newSlice(final int size) {
    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size)
      nextBuffer();
    final int upto = byteUpto;
    byteUpto += size;
    buffer[byteUpto-1] = 16;
    return upto;
  }

  // Size of each slice.  These arrays should be at most 16
  // elements (index is encoded with 4 bits).  First array
  // is just a compact way to encode X+1 with a max.  Second
  // array is the length of each slice, ie first slice is 5
  // bytes, next slice is 14 bytes, etc.
  final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
  final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
  final static int FIRST_LEVEL_SIZE = levelSizeArray[0];

  public int allocSlice(final byte[] slice, final int upto) {

    final int level = slice[upto] & 15;
    final int newLevel = nextLevelArray[level];
    final int newSize = levelSizeArray[newLevel];

    // Maybe allocate another block
    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize)
      nextBuffer();

    final int newUpto = byteUpto;
    final int offset = newUpto + byteOffset;
    byteUpto += newSize;

    // Copy forward the past 3 bytes (which we are about
    // to overwrite with the forwarding address):
    buffer[newUpto] = slice[upto-3];
    buffer[newUpto+1] = slice[upto-2];
    buffer[newUpto+2] = slice[upto-1];

    // Write forwarding address at end of last slice:
    slice[upto-3] = (byte) (offset >>> 24);
    slice[upto-2] = (byte) (offset >>> 16);
    slice[upto-1] = (byte) (offset >>> 8);
    slice[upto] = (byte) offset;

    // Write new level:
    buffer[byteUpto-1] = (byte) (16|newLevel);

    return newUpto+3;
  }
}
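The two level arrays fully determine how a posting's byte stream grows. A standalone sketch that walks the upgrade schedule, using only the constants above:

    int[] nextLevel = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
    int[] levelSize = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};

    int level = 0;
    int total = levelSize[0];
    System.out.println("slice 0: size " + levelSize[0] + ", total " + total);
    for (int slice = 1; slice <= 12; slice++) {
      level = nextLevel[level];
      total += levelSize[level];
      System.out.println("slice " + slice + ": size " + levelSize[level] + ", total " + total);
    }
    // Slices grow 5, 14, 20, ... and cap at 200 bytes, so short postings
    // stay tiny while long ones amortize the 4-byte forwarding addresses.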
@@ -0,0 +1,149 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;

/* IndexInput that knows how to read the byte slices written
 * by Posting and PostingVector.  We read the bytes in
 * each slice until we hit the end of that slice at which
 * point we read the forwarding address of the next slice
 * and then jump to it. */
final class ByteSliceReader extends IndexInput {
  ByteBlockPool pool;
  int bufferUpto;
  byte[] buffer;
  public int upto;
  int limit;
  int level;
  public int bufferOffset;

  public int endIndex;

  public void init(ByteBlockPool pool, int startIndex, int endIndex) {

    assert endIndex-startIndex >= 0;
    assert startIndex >= 0;
    assert endIndex >= 0;

    this.pool = pool;
    this.endIndex = endIndex;

    level = 0;
    bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
    buffer = pool.buffers[bufferUpto];
    upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK;

    final int firstSize = ByteBlockPool.levelSizeArray[0];

    if (startIndex+firstSize >= endIndex) {
      // There is only this one slice to read
      limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK;
    } else
      limit = upto+firstSize-4;
  }

  public boolean eof() {
    assert upto + bufferOffset <= endIndex;
    return upto + bufferOffset == endIndex;
  }

  @Override
  public byte readByte() {
    assert !eof();
    assert upto <= limit;
    if (upto == limit)
      nextSlice();
    return buffer[upto++];
  }

  public long writeTo(IndexOutput out) throws IOException {
    long size = 0;
    while(true) {
      if (limit + bufferOffset == endIndex) {
        assert endIndex - bufferOffset >= upto;
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        break;
      } else {
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        nextSlice();
      }
    }

    return size;
  }

  public void nextSlice() {

    // Skip to our next slice
    final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff);

    level = ByteBlockPool.nextLevelArray[level];
    final int newSize = ByteBlockPool.levelSizeArray[level];

    bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;

    buffer = pool.buffers[bufferUpto];
    upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK;

    if (nextIndex + newSize >= endIndex) {
      // We are advancing to the final slice
      assert endIndex - nextIndex > 0;
      limit = endIndex - bufferOffset;
    } else {
      // This is not the final slice (subtract 4 for the
      // forwarding address at the end of this new slice)
      limit = upto+newSize-4;
    }
  }

  @Override
  public void readBytes(byte[] b, int offset, int len) {
    while(len > 0) {
      final int numLeft = limit-upto;
      if (numLeft < len) {
        // Read entire slice
        System.arraycopy(buffer, upto, b, offset, numLeft);
        offset += numLeft;
        len -= numLeft;
        nextSlice();
      } else {
        // This slice is the last one
        System.arraycopy(buffer, upto, b, offset, len);
        upto += len;
        break;
      }
    }
  }

  @Override
  public long getFilePointer() {throw new RuntimeException("not implemented");}
  @Override
  public long length() {throw new RuntimeException("not implemented");}
  @Override
  public void seek(long pos) {throw new RuntimeException("not implemented");}
  @Override
  public void close() {throw new RuntimeException("not implemented");}
}
@@ -0,0 +1,89 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Class to write byte streams into slices of shared
 * byte[].  This is used by DocumentsWriter to hold the
 * posting list for many terms in RAM.
 */

final class ByteSliceWriter {

  private byte[] slice;
  private int upto;
  private final ByteBlockPool pool;

  int offset0;

  public ByteSliceWriter(ByteBlockPool pool) {
    this.pool = pool;
  }

  /**
   * Set up the writer to write at address.
   */
  public void init(int address) {
    slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert slice != null;
    upto = address & DocumentsWriter.BYTE_BLOCK_MASK;
    offset0 = address;
    assert upto < slice.length;
  }

  /** Write byte into byte slice stream */
  public void writeByte(byte b) {
    assert slice != null;
    if (slice[upto] != 0) {
      upto = pool.allocSlice(slice, upto);
      slice = pool.buffer;
      offset0 = pool.byteOffset;
      assert slice != null;
    }
    slice[upto++] = b;
    assert upto != slice.length;
  }

  public void writeBytes(final byte[] b, int offset, final int len) {
    final int offsetEnd = offset + len;
    while(offset < offsetEnd) {
      if (slice[upto] != 0) {
        // End marker
        upto = pool.allocSlice(slice, upto);
        slice = pool.buffer;
        offset0 = pool.byteOffset;
      }

      slice[upto++] = b[offset++];
      assert upto != slice.length;
    }
  }

  public int getAddress() {
    return upto + (offset0 & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
  }

  public void writeVInt(int i) {
    while ((i & ~0x7F) != 0) {
      writeByte((byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte((byte) i);
  }
}
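writeVInt emits the standard Lucene variable-length integer: seven payload bits per byte, with the high bit flagging continuation. A self-contained encoder/decoder pair mirroring the loop above, for illustration only:

    static byte[] encodeVInt(int i) {
      java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
      while ((i & ~0x7F) != 0) {
        out.write((i & 0x7F) | 0x80); // low 7 bits, continuation bit set
        i >>>= 7;
      }
      out.write(i); // final byte: continuation bit clear
      return out.toByteArray();
    }

    static int decodeVInt(byte[] b) {
      int value = 0, shift = 0, pos = 0;
      byte cur;
      do {
        cur = b[pos++];
        value |= (cur & 0x7F) << shift;
        shift += 7;
      } while ((cur & 0x80) != 0);
      return value;
    }

    // encodeVInt(300) -> { (byte) 0xAC, 0x02 }; decodeVInt round-trips it.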
@@ -0,0 +1,56 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

final class CharBlockPool {

  public char[][] buffers = new char[10][];
  int numBuffer;

  int bufferUpto = -1;                                               // Which buffer we are upto
  public int charUpto = DocumentsWriter.CHAR_BLOCK_SIZE;             // Where we are in head buffer

  public char[] buffer;                                              // Current head buffer
  public int charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE;          // Current head offset
  private final DocumentsWriter docWriter;

  public CharBlockPool(DocumentsWriter docWriter) {
    this.docWriter = docWriter;
  }

  public void reset() {
    docWriter.recycleCharBlocks(buffers, 1+bufferUpto);
    bufferUpto = -1;
    charUpto = DocumentsWriter.CHAR_BLOCK_SIZE;
    charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE;
  }

  public void nextBuffer() {
    if (1+bufferUpto == buffers.length) {
      char[][] newBuffers = new char[(int) (buffers.length*1.5)][];
      System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
      buffers = newBuffers;
    }
    buffer = buffers[1+bufferUpto] = docWriter.getCharBlock();
    bufferUpto++;

    charUpto = 0;
    charOffset += DocumentsWriter.CHAR_BLOCK_SIZE;
  }
}
@ -0,0 +1,911 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.document.AbstractField; // for javadocs
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
import java.text.NumberFormat;
|
||||
import java.io.PrintStream;
|
||||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Basic tool and API to check the health of an index and
|
||||
* write a new segments file that removes reference to
|
||||
* problematic segments.
|
||||
*
|
||||
* <p>As this tool checks every byte in the index, on a large
|
||||
* index it can take quite a long time to run.
|
||||
*
|
||||
* <p><b>WARNING</b>: this tool and API is new and
|
||||
* experimental and is subject to suddenly change in the
|
||||
* next release. Please make a complete backup of your
|
||||
* index before using this to fix your index!
|
||||
*/
|
||||
public class CheckIndex {
|
||||
|
||||
private PrintStream infoStream;
|
||||
private Directory dir;
|
||||
|
||||
/**
|
||||
* Returned from {@link #checkIndex()} detailing the health and status of the index.
|
||||
*
|
||||
* <p><b>WARNING</b>: this API is new and experimental and is
|
||||
* subject to suddenly change in the next release.
|
||||
**/
|
||||
|
||||
public static class Status {
|
||||
|
||||
/** True if no problems were found with the index. */
|
||||
public boolean clean;
|
||||
|
||||
/** True if we were unable to locate and load the segments_N file. */
|
||||
public boolean missingSegments;
|
||||
|
||||
/** True if we were unable to open the segments_N file. */
|
||||
public boolean cantOpenSegments;
|
||||
|
||||
/** True if we were unable to read the version number from segments_N file. */
|
||||
public boolean missingSegmentVersion;
|
||||
|
||||
/** Name of latest segments_N file in the index. */
|
||||
public String segmentsFileName;
|
||||
|
||||
/** Number of segments in the index. */
|
||||
public int numSegments;
|
||||
|
||||
/** String description of the version of the index. */
|
||||
public String segmentFormat;
|
||||
|
||||
/** Empty unless you passed specific segments list to check as optional 3rd argument.
|
||||
* @see CheckIndex#checkIndex(List) */
|
||||
public List<String> segmentsChecked = new ArrayList<String>();
|
||||
|
||||
    /** True if the index was created with a newer version of Lucene than the CheckIndex tool. */
    public boolean toolOutOfDate;

    /** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */
    public List<SegmentInfoStatus> segmentInfos = new ArrayList<SegmentInfoStatus>();

    /** Directory index is in. */
    public Directory dir;

    /**
     * SegmentInfos instance containing only segments that
     * had no problems (this is used with the {@link CheckIndex#fixIndex}
     * method to repair the index).
     */
    SegmentInfos newSegments;

    /** How many documents will be lost to bad segments. */
    public int totLoseDocCount;

    /** How many bad segments were found. */
    public int numBadSegments;

    /** True if we checked only specific segments (i.e.,
     *  {@link #checkIndex(List)} was called with a
     *  non-null argument). */
    public boolean partial;

    /** Holds the userData of the last commit in the index */
    public Map<String, String> userData;

    /** Holds the status of each segment in the index.
     *  See {@link #segmentInfos}.
     *
     *  <p><b>WARNING</b>: this API is new and experimental and is
     *  subject to change suddenly in the next release.
     */
    public static class SegmentInfoStatus {
      /** Name of the segment. */
      public String name;

      /** Document count (does not take deletions into account). */
      public int docCount;

      /** True if segment is compound file format. */
      public boolean compound;

      /** Number of files referenced by this segment. */
      public int numFiles;

      /** Net size (MB) of the files referenced by this
       *  segment. */
      public double sizeMB;

      /** Doc store offset, if this segment shares the doc
       *  store files (stored fields and term vectors) with
       *  other segments.  This is -1 if it does not share. */
      public int docStoreOffset = -1;

      /** Name of the shared doc store segment, or null if
       *  this segment does not share the doc store files. */
      public String docStoreSegment;

      /** True if the shared doc store files are compound file
       *  format. */
      public boolean docStoreCompoundFile;

      /** True if this segment has pending deletions. */
      public boolean hasDeletions;

      /** Name of the current deletions file. */
      public String deletionsFileName;

      /** Number of deleted documents. */
      public int numDeleted;

      /** True if we were able to open a SegmentReader on this
       *  segment. */
      public boolean openReaderPassed;

      /** Number of fields in this segment. */
      int numFields;

      /** True if at least one of the fields in this segment
       *  does not omitTermFreqAndPositions.
       *  @see AbstractField#setOmitTermFreqAndPositions */
      public boolean hasProx;

      /** Map that includes certain
       *  debugging details that IndexWriter records into
       *  each segment it creates */
      public Map<String,String> diagnostics;

      /** Status for testing of field norms (null if field norms could not be tested). */
      public FieldNormStatus fieldNormStatus;

      /** Status for testing of indexed terms (null if indexed terms could not be tested). */
      public TermIndexStatus termIndexStatus;

      /** Status for testing of stored fields (null if stored fields could not be tested). */
      public StoredFieldStatus storedFieldStatus;

      /** Status for testing of term vectors (null if term vectors could not be tested). */
      public TermVectorStatus termVectorStatus;
    }

    /**
     * Status from testing field norms.
     */
    public static final class FieldNormStatus {
      /** Number of fields successfully tested */
      public long totFields = 0L;

      /** Exception thrown during field norm test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing term index.
     */
    public static final class TermIndexStatus {
      /** Total term count */
      public long termCount = 0L;

      /** Total frequency across all terms. */
      public long totFreq = 0L;

      /** Total number of positions. */
      public long totPos = 0L;

      /** Exception thrown during term index test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing stored fields.
     */
    public static final class StoredFieldStatus {

      /** Number of documents tested. */
      public int docCount = 0;

      /** Total number of stored fields tested. */
      public long totFields = 0;

      /** Exception thrown during stored fields test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing term vectors.
     */
    public static final class TermVectorStatus {

      /** Number of documents tested. */
      public int docCount = 0;

      /** Total number of term vectors tested. */
      public long totVectors = 0;

      /** Exception thrown during term vector test (null on success) */
      public Throwable error = null;
    }
  }

  /** Create a new CheckIndex on the directory. */
  public CheckIndex(Directory dir) {
    this.dir = dir;
    infoStream = null;
  }

  /** Set infoStream where messages should go.  If null, no
   *  messages are printed */
  public void setInfoStream(PrintStream out) {
    infoStream = out;
  }

  private void msg(String msg) {
    if (infoStream != null)
      infoStream.println(msg);
  }

  private static class MySegmentTermDocs extends SegmentTermDocs {

    int delCount;

    MySegmentTermDocs(SegmentReader p) {
      super(p);
    }

    @Override
    public void seek(Term term) throws IOException {
      super.seek(term);
      delCount = 0;
    }

    @Override
    protected void skippingDoc() throws IOException {
      delCount++;
    }
  }

  /** Returns a {@link Status} instance detailing
   *  the state of the index.
   *
   *  <p>As this method checks every byte in the index, on a large
   *  index it can take quite a long time to run.
   *
   *  <p><b>WARNING</b>: make sure
   *  you only call this when the index is not opened by any
   *  writer. */
  public Status checkIndex() throws IOException {
    return checkIndex(null);
  }

  /** Returns a {@link Status} instance detailing
   *  the state of the index.
   *
   *  @param onlySegments list of specific segment names to check
   *
   *  <p>As this method checks every byte in the specified
   *  segments, on a large index it can take quite a long
   *  time to run.
   *
   *  <p><b>WARNING</b>: make sure
   *  you only call this when the index is not opened by any
   *  writer. */
  public Status checkIndex(List<String> onlySegments) throws IOException {
    NumberFormat nf = NumberFormat.getInstance();
    SegmentInfos sis = new SegmentInfos();
    Status result = new Status();
    result.dir = dir;
    try {
      sis.read(dir);
    } catch (Throwable t) {
      msg("ERROR: could not read any segments file in directory");
      result.missingSegments = true;
      if (infoStream != null)
        t.printStackTrace(infoStream);
      return result;
    }

    final int numSegments = sis.size();
    final String segmentsFileName = sis.getCurrentSegmentFileName();
    IndexInput input = null;
    try {
      input = dir.openInput(segmentsFileName);
    } catch (Throwable t) {
      msg("ERROR: could not open segments file in directory");
      if (infoStream != null)
        t.printStackTrace(infoStream);
      result.cantOpenSegments = true;
      return result;
    }
    int format = 0;
    try {
      format = input.readInt();
    } catch (Throwable t) {
      msg("ERROR: could not read segment file version in directory");
      if (infoStream != null)
        t.printStackTrace(infoStream);
      result.missingSegmentVersion = true;
      return result;
    } finally {
      if (input != null)
        input.close();
    }

    String sFormat = "";
    boolean skip = false;

    if (format == SegmentInfos.FORMAT)
      sFormat = "FORMAT [Lucene Pre-2.1]";
    else if (format == SegmentInfos.FORMAT_LOCKLESS)
      sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
    else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
      sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
    else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
      sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
    else {
      if (format == SegmentInfos.FORMAT_CHECKSUM)
        sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_DEL_COUNT)
        sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_HAS_PROX)
        sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_USER_DATA)
        sFormat = "FORMAT_USER_DATA [Lucene 2.9]";
      else if (format == SegmentInfos.FORMAT_DIAGNOSTICS)
        sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
      else if (format < SegmentInfos.CURRENT_FORMAT) {
        sFormat = "int=" + format + " [newer version of Lucene than this tool]";
        skip = true;
      } else {
        sFormat = format + " [Lucene 1.3 or prior]";
      }
    }

    result.segmentsFileName = segmentsFileName;
    result.numSegments = numSegments;
    result.segmentFormat = sFormat;
    result.userData = sis.getUserData();
    String userDataString;
    if (sis.getUserData().size() > 0) {
      userDataString = " userData=" + sis.getUserData();
    } else {
      userDataString = "";
    }

    msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat + userDataString);

    if (onlySegments != null) {
      result.partial = true;
      if (infoStream != null)
        infoStream.print("\nChecking only these segments:");
      for (String s : onlySegments) {
        if (infoStream != null)
          infoStream.print(" " + s);
      }
      result.segmentsChecked.addAll(onlySegments);
      msg(":");
    }

    if (skip) {
      msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
      result.toolOutOfDate = true;
      return result;
    }

    result.newSegments = (SegmentInfos) sis.clone();
    result.newSegments.clear();

    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = sis.info(i);
      if (onlySegments != null && !onlySegments.contains(info.name))
        continue;
      Status.SegmentInfoStatus segInfoStat = new Status.SegmentInfoStatus();
      result.segmentInfos.add(segInfoStat);
      msg("  " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
      segInfoStat.name = info.name;
      segInfoStat.docCount = info.docCount;

      int toLoseDocCount = info.docCount;

      SegmentReader reader = null;

      try {
        msg("    compound=" + info.getUseCompoundFile());
        segInfoStat.compound = info.getUseCompoundFile();
        msg("    hasProx=" + info.getHasProx());
        segInfoStat.hasProx = info.getHasProx();
        msg("    numFiles=" + info.files().size());
        segInfoStat.numFiles = info.files().size();
        msg("    size (MB)=" + nf.format(info.sizeInBytes()/(1024.*1024.)));
        segInfoStat.sizeMB = info.sizeInBytes()/(1024.*1024.);
        Map<String,String> diagnostics = info.getDiagnostics();
        segInfoStat.diagnostics = diagnostics;
        if (diagnostics.size() > 0) {
          msg("    diagnostics = " + diagnostics);
        }

        final int docStoreOffset = info.getDocStoreOffset();
        if (docStoreOffset != -1) {
          msg("    docStoreOffset=" + docStoreOffset);
          segInfoStat.docStoreOffset = docStoreOffset;
          msg("    docStoreSegment=" + info.getDocStoreSegment());
          segInfoStat.docStoreSegment = info.getDocStoreSegment();
          msg("    docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
          segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile();
        }
        final String delFileName = info.getDelFileName();
        if (delFileName == null) {
          msg("    no deletions");
          segInfoStat.hasDeletions = false;
        } else {
          msg("    has deletions [delFileName=" + delFileName + "]");
          segInfoStat.hasDeletions = true;
          segInfoStat.deletionsFileName = delFileName;
        }
        if (infoStream != null)
          infoStream.print("    test: open reader.........");
        reader = SegmentReader.get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR);

        segInfoStat.openReaderPassed = true;

        final int numDocs = reader.numDocs();
        toLoseDocCount = numDocs;
        if (reader.hasDeletions()) {
          if (reader.deletedDocs.count() != info.getDelCount()) {
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
          }
          if (reader.deletedDocs.count() > reader.maxDoc()) {
            throw new RuntimeException("too many deleted docs: maxDoc()=" + reader.maxDoc() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
          }
          if (info.docCount - numDocs != info.getDelCount()) {
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
          }
          segInfoStat.numDeleted = info.docCount - numDocs;
          msg("OK [" + (segInfoStat.numDeleted) + " deleted docs]");
        } else {
          if (info.getDelCount() != 0) {
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
          }
          msg("OK");
        }
        if (reader.maxDoc() != info.docCount)
          throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);

        // Test getFieldNames()
        if (infoStream != null) {
          infoStream.print("    test: fields..............");
        }
        Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
        msg("OK [" + fieldNames.size() + " fields]");
        segInfoStat.numFields = fieldNames.size();

        // Test Field Norms
        segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);

        // Test the Term Index
        segInfoStat.termIndexStatus = testTermIndex(info, reader);

        // Test Stored Fields
        segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);

        // Test Term Vectors
        segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);

        // Rethrow the first exception we encountered
        // This will cause stats for failed segments to be incremented properly
        if (segInfoStat.fieldNormStatus.error != null) {
          throw new RuntimeException("Field Norm test failed");
        } else if (segInfoStat.termIndexStatus.error != null) {
          throw new RuntimeException("Term Index test failed");
        } else if (segInfoStat.storedFieldStatus.error != null) {
          throw new RuntimeException("Stored Field test failed");
        } else if (segInfoStat.termVectorStatus.error != null) {
          throw new RuntimeException("Term Vector test failed");
        }

        msg("");

      } catch (Throwable t) {
        msg("FAILED");
        String comment;
        comment = "fixIndex() would remove reference to this segment";
        msg("    WARNING: " + comment + "; full exception:");
        if (infoStream != null)
          t.printStackTrace(infoStream);
        msg("");
        result.totLoseDocCount += toLoseDocCount;
        result.numBadSegments++;
        continue;
      } finally {
        if (reader != null)
          reader.close();
      }

      // Keeper
      result.newSegments.add((SegmentInfo) info.clone());
    }

    if (0 == result.numBadSegments) {
      result.clean = true;
      msg("No problems were detected with this index.\n");
    } else
      msg("WARNING: " + result.numBadSegments + " broken segments (containing " + result.totLoseDocCount + " documents) detected");

    return result;
  }

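The method above is also the programmatic entry point behind the command-line tool. A minimal sketch (not part of this patch; the index path is a hypothetical example) of running a full check from code and inspecting the returned Status:

// CheckIndexExample.java: run a full check and report the result.
import java.io.File;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CheckIndexExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // hypothetical path
    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(System.out);                // print per-segment progress
    CheckIndex.Status status = checker.checkIndex();  // checks every segment
    if (status.clean) {
      System.out.println("index is consistent");
    } else {
      System.out.println(status.numBadSegments + " broken segments; fixIndex() would drop "
          + status.totLoseDocCount + " documents");
    }
    dir.close();
  }
}
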
  /**
   * Test field norms.
   */
  private Status.FieldNormStatus testFieldNorms(Collection<String> fieldNames, SegmentReader reader) {
    final Status.FieldNormStatus status = new Status.FieldNormStatus();

    try {
      // Test Field Norms
      if (infoStream != null) {
        infoStream.print("    test: field norms.........");
      }
      final byte[] b = new byte[reader.maxDoc()];
      for (final String fieldName : fieldNames) {
        reader.norms(fieldName, b, 0);
        ++status.totFields;
      }

      msg("OK [" + status.totFields + " fields]");
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /**
   * Test the term index.
   */
  private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
    final Status.TermIndexStatus status = new Status.TermIndexStatus();

    try {
      if (infoStream != null) {
        infoStream.print("    test: terms, freq, prox...");
      }

      final TermEnum termEnum = reader.terms();
      final TermPositions termPositions = reader.termPositions();

      // Used only to count up # deleted docs for this term
      final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

      final int maxDoc = reader.maxDoc();

      while (termEnum.next()) {
        status.termCount++;
        final Term term = termEnum.term();
        final int docFreq = termEnum.docFreq();
        termPositions.seek(term);
        int lastDoc = -1;
        int freq0 = 0;
        status.totFreq += docFreq;
        while (termPositions.next()) {
          freq0++;
          final int doc = termPositions.doc();
          final int freq = termPositions.freq();
          if (doc <= lastDoc)
            throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
          if (doc >= maxDoc)
            throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);

          lastDoc = doc;
          if (freq <= 0)
            throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");

          int lastPos = -1;
          status.totPos += freq;
          for(int j=0;j<freq;j++) {
            final int pos = termPositions.nextPosition();
            if (pos < -1)
              throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
            if (pos < lastPos)
              throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
            lastPos = pos;
          }
        }

        // Now count how many deleted docs occurred in
        // this term:
        final int delCount;
        if (reader.hasDeletions()) {
          myTermDocs.seek(term);
          while(myTermDocs.next()) { }
          delCount = myTermDocs.delCount;
        } else {
          delCount = 0;
        }

        if (freq0 + delCount != docFreq) {
          throw new RuntimeException("term " + term + " docFreq=" +
              docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
        }
      }

      msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");

    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /**
   * Test stored fields for a segment.
   */
  private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
    final Status.StoredFieldStatus status = new Status.StoredFieldStatus();

    try {
      if (infoStream != null) {
        infoStream.print("    test: stored fields.......");
      }

      // Scan stored fields for all documents
      for (int j = 0; j < info.docCount; ++j) {
        if (!reader.isDeleted(j)) {
          status.docCount++;
          Document doc = reader.document(j);
          status.totFields += doc.getFields().size();
        }
      }

      // Validate docCount
      if (status.docCount != reader.numDocs()) {
        throw new RuntimeException("docCount=" + status.docCount + " but saw " + reader.numDocs() + " undeleted docs");
      }

      msg("OK [" + status.totFields + " total field count; avg " +
          format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /**
   * Test term vectors for a segment.
   */
  private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
    final Status.TermVectorStatus status = new Status.TermVectorStatus();

    try {
      if (infoStream != null) {
        infoStream.print("    test: term vectors........");
      }

      for (int j = 0; j < info.docCount; ++j) {
        if (!reader.isDeleted(j)) {
          status.docCount++;
          TermFreqVector[] tfv = reader.getTermFreqVectors(j);
          if (tfv != null) {
            status.totVectors += tfv.length;
          }
        }
      }

      msg("OK [" + status.totVectors + " total vector count; avg " +
          format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /** Repairs the index using previously returned result
   *  from {@link #checkIndex}.  Note that this does not
   *  remove any of the unreferenced files after it's done;
   *  you must separately open an {@link IndexWriter}, which
   *  deletes unreferenced files when it's created.
   *
   *  <p><b>WARNING</b>: this writes a
   *  new segments file into the index, effectively removing
   *  all documents in broken segments from the index.
   *  BE CAREFUL.
   *
   *  <p><b>WARNING</b>: Make sure you only call this when the
   *  index is not opened by any writer. */
  public void fixIndex(Status result) throws IOException {
    if (result.partial)
      throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)");
    result.newSegments.commit(result.dir);
  }

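Because of the guard above, fixIndex only accepts a Status produced by a full check. A hedged sketch of the check-then-repair flow, mirroring what main() does with -fix:

// FixIndexExample.java: full check, then rewrite segments_N only if needed.
import java.io.IOException;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;

public final class FixIndexExample {
  /** Runs a full check and, only if problems were found, repairs the index. */
  public static void checkAndFix(Directory dir) throws IOException {
    CheckIndex checker = new CheckIndex(dir);
    CheckIndex.Status status = checker.checkIndex(null); // full check: fixIndex rejects partial results
    if (!status.clean) {
      // Writes a new segments_N that drops every broken segment;
      // the documents in those segments are permanently removed.
      checker.fixIndex(status);
    }
  }
}
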
  private static boolean assertsOn;

  private static boolean testAsserts() {
    assertsOn = true;
    return true;
  }

  private static boolean assertsOn() {
    assert testAsserts();
    return assertsOn;
  }

  /** Command-line interface to check and fix an index.

    <p>
    Run it like this:
    <pre>
    java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
    </pre>
    <ul>
    <li><code>-fix</code>: actually write a new segments_N file, removing any problematic segments

    <li><code>-segment X</code>: only check the specified
    segment(s).  This can be specified multiple times,
    to check more than one segment, eg <code>-segment _2
    -segment _a</code>.  You can't use this with the -fix
    option.
    </ul>

    <p><b>WARNING</b>: <code>-fix</code> should only be used on an emergency basis as it will cause
    documents (perhaps many) to be permanently removed from the index.  Always make
    a backup copy of your index before running this!  Do not run this tool on an index
    that is actively being written to.  You have been warned!

    <p>Run without -fix, this tool will open the index, report version information
    and report any exceptions it hits and what action it would take if -fix were
    specified.  With -fix, this tool will remove any segments that have issues and
    write a new segments_N file.  This means all documents contained in the affected
    segments will be removed.

    <p>
    This tool exits with exit code 1 if the index cannot be opened or has any
    corruption, else 0.
   */
  public static void main(String[] args) throws IOException, InterruptedException {

    boolean doFix = false;
    List<String> onlySegments = new ArrayList<String>();
    String indexPath = null;
    int i = 0;
    while(i < args.length) {
      if (args[i].equals("-fix")) {
        doFix = true;
        i++;
      } else if (args[i].equals("-segment")) {
        if (i == args.length-1) {
          System.out.println("ERROR: missing name for -segment option");
          System.exit(1);
        }
        onlySegments.add(args[i+1]);
        i += 2;
      } else {
        if (indexPath != null) {
          System.out.println("ERROR: unexpected extra argument '" + args[i] + "'");
          System.exit(1);
        }
        indexPath = args[i];
        i++;
      }
    }

    if (indexPath == null) {
      System.out.println("\nERROR: index path not specified");
      System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" +
                         "\n" +
                         "  -fix: actually write a new segments_N file, removing any problematic segments\n" +
                         "  -segment X: only check the specified segments.  This can be specified multiple\n" +
                         "              times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
                         "              You can't use this with the -fix option\n" +
                         "\n" +
                         "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
                         "documents (perhaps many) to be permanently removed from the index.  Always make\n" +
                         "a backup copy of your index before running this!  Do not run this tool on an index\n" +
                         "that is actively being written to.  You have been warned!\n" +
                         "\n" +
                         "Run without -fix, this tool will open the index, report version information\n" +
                         "and report any exceptions it hits and what action it would take if -fix were\n" +
                         "specified.  With -fix, this tool will remove any segments that have issues and\n" +
                         "write a new segments_N file.  This means all documents contained in the affected\n" +
                         "segments will be removed.\n" +
                         "\n" +
                         "This tool exits with exit code 1 if the index cannot be opened or has any\n" +
                         "corruption, else 0.\n");
      System.exit(1);
    }

    if (!assertsOn())
      System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");

    if (onlySegments.size() == 0)
      onlySegments = null;
    else if (doFix) {
      System.out.println("ERROR: cannot specify both -fix and -segment");
      System.exit(1);
    }

    System.out.println("\nOpening index @ " + indexPath + "\n");
    Directory dir = null;
    try {
      dir = FSDirectory.open(new File(indexPath));
    } catch (Throwable t) {
      System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting");
      t.printStackTrace(System.out);
      System.exit(1);
    }

    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(System.out);

    Status result = checker.checkIndex(onlySegments);
    if (result.missingSegments) {
      System.exit(1);
    }

    if (!result.clean) {
      if (!doFix) {
        System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n");
      } else {
        System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n");
        System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
        for(int s=0;s<5;s++) {
          Thread.sleep(1000);
          System.out.println("  " + (5-s) + "...");
        }
        System.out.println("Writing...");
        checker.fixIndex(result);
        System.out.println("OK");
        System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\"");
      }
    }
    System.out.println("");

    final int exitCode;
    if (result != null && result.clean == true)
      exitCode = 0;
    else
      exitCode = 1;
    System.exit(exitCode);
  }
}
@@ -0,0 +1,281 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;

import java.util.HashMap;
import java.io.IOException;

/**
 * Class for accessing a compound stream.
 * This class implements a directory, but is limited to only read operations.
 * Directory methods that would normally modify data throw an exception.
 */
class CompoundFileReader extends Directory {

  private int readBufferSize;

  private static final class FileEntry {
    long offset;
    long length;
  }

  // Base info
  private Directory directory;
  private String fileName;

  private IndexInput stream;
  private HashMap<String,FileEntry> entries = new HashMap<String,FileEntry>();

  public CompoundFileReader(Directory dir, String name) throws IOException {
    this(dir, name, BufferedIndexInput.BUFFER_SIZE);
  }

  public CompoundFileReader(Directory dir, String name, int readBufferSize) throws IOException {
    directory = dir;
    fileName = name;
    this.readBufferSize = readBufferSize;

    boolean success = false;

    try {
      stream = dir.openInput(name, readBufferSize);

      // read the directory and init files
      int count = stream.readVInt();
      FileEntry entry = null;
      for (int i=0; i<count; i++) {
        long offset = stream.readLong();
        String id = stream.readString();

        if (entry != null) {
          // set length of the previous entry
          entry.length = offset - entry.offset;
        }

        entry = new FileEntry();
        entry.offset = offset;
        entries.put(id, entry);
      }

      // set the length of the final entry
      if (entry != null) {
        entry.length = stream.length() - entry.offset;
      }

      success = true;

    } finally {
      if (! success && (stream != null)) {
        try {
          stream.close();
        } catch (IOException e) { }
      }
    }
  }

  public Directory getDirectory() {
    return directory;
  }

  public String getName() {
    return fileName;
  }

  @Override
  public synchronized void close() throws IOException {
    if (stream == null)
      throw new IOException("Already closed");

    entries.clear();
    stream.close();
    stream = null;
  }

  @Override
  public synchronized IndexInput openInput(String id) throws IOException {
    // Default to readBufferSize passed in when we were opened
    return openInput(id, readBufferSize);
  }

  @Override
  public synchronized IndexInput openInput(String id, int readBufferSize) throws IOException {
    if (stream == null)
      throw new IOException("Stream closed");

    FileEntry entry = entries.get(id);
    if (entry == null)
      throw new IOException("No sub-file with id " + id + " found");

    return new CSIndexInput(stream, entry.offset, entry.length, readBufferSize);
  }

  /** Returns an array of strings, one for each file in the directory. */
  @Override
  public String[] listAll() {
    String res[] = new String[entries.size()];
    return entries.keySet().toArray(res);
  }

  /** Returns true iff a file with the given name exists. */
  @Override
  public boolean fileExists(String name) {
    return entries.containsKey(name);
  }

  /** Returns the time the compound file was last modified. */
  @Override
  public long fileModified(String name) throws IOException {
    return directory.fileModified(fileName);
  }

  /** Set the modified time of the compound file to now. */
  @Override
  public void touchFile(String name) throws IOException {
    directory.touchFile(fileName);
  }

  /** Not implemented
   * @throws UnsupportedOperationException */
  @Override
  public void deleteFile(String name) {
    throw new UnsupportedOperationException();
  }

  /** Not implemented
   * @throws UnsupportedOperationException */
  public void renameFile(String from, String to) {
    throw new UnsupportedOperationException();
  }

  /** Returns the length of a file in the directory.
   * @throws IOException if the file does not exist */
  @Override
  public long fileLength(String name) throws IOException {
    FileEntry e = entries.get(name);
    if (e == null)
      throw new IOException("File " + name + " does not exist");
    return e.length;
  }

  /** Not implemented
   * @throws UnsupportedOperationException */
  @Override
  public IndexOutput createOutput(String name) {
    throw new UnsupportedOperationException();
  }

  /** Not implemented
   * @throws UnsupportedOperationException */
  @Override
  public Lock makeLock(String name) {
    throw new UnsupportedOperationException();
  }

  /** Implementation of an IndexInput that reads from a portion of the
   *  compound file. The visibility is left as "package" *only* because
   *  this helps with testing since JUnit test cases in a different class
   *  can then access package fields of this class.
   */
  static final class CSIndexInput extends BufferedIndexInput {

    IndexInput base;
    long fileOffset;
    long length;

    CSIndexInput(final IndexInput base, final long fileOffset, final long length) {
      this(base, fileOffset, length, BufferedIndexInput.BUFFER_SIZE);
    }

    CSIndexInput(final IndexInput base, final long fileOffset, final long length, int readBufferSize) {
      super(readBufferSize);
      this.base = (IndexInput)base.clone();
      this.fileOffset = fileOffset;
      this.length = length;
    }

    @Override
    public Object clone() {
      CSIndexInput clone = (CSIndexInput)super.clone();
      clone.base = (IndexInput)base.clone();
      clone.fileOffset = fileOffset;
      clone.length = length;
      return clone;
    }

    /** Expert: implements buffer refill.  Reads bytes from the current
     *  position in the input.
     * @param b the array to read bytes into
     * @param offset the offset in the array to start storing bytes
     * @param len the number of bytes to read
     */
    @Override
    protected void readInternal(byte[] b, int offset, int len) throws IOException {
      long start = getFilePointer();
      if (start + len > length)
        throw new IOException("read past EOF");
      base.seek(fileOffset + start);
      base.readBytes(b, offset, len, false);
    }

    /** Expert: implements seek.  Sets current position in this file, where
     *  the next {@link #readInternal(byte[],int,int)} will occur.
     * @see #readInternal(byte[],int,int)
     */
    @Override
    protected void seekInternal(long pos) {}

    /** Closes the stream to further operations. */
    @Override
    public void close() throws IOException {
      base.close();
    }

    @Override
    public long length() {
      return length;
    }
  }
}
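Since CompoundFileReader is package-private, any direct caller has to live in org.apache.lucene.index. A hedged sketch (not part of this patch; the .cfs name and sub-file id are hypothetical) of listing a compound file's entries and reading one sub-file slice:

// CompoundFileReaderExample.java: enumerate and read sub-files of a .cfs.
package org.apache.lucene.index; // required: CompoundFileReader is package-private

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

public final class CompoundFileReaderExample {
  public static void dump(Directory dir) throws Exception {
    CompoundFileReader csr = new CompoundFileReader(dir, "_1.cfs");
    try {
      for (String sub : csr.listAll())
        System.out.println(sub + " (" + csr.fileLength(sub) + " bytes)");
      IndexInput in = csr.openInput("_1.fnm"); // a CSIndexInput over just that slice
      byte[] head = new byte[(int) Math.min(16, in.length())];
      in.readBytes(head, 0, head.length);      // reads within the sub-file's bounds
      System.out.println("read " + head.length + " leading bytes");
      in.close();
    } finally {
      csr.close();
    }
  }
}
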
@@ -0,0 +1,247 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import java.util.LinkedList;
import java.util.HashSet;

import java.io.IOException;

/**
 * Combines multiple files into a single compound file.
 * The file format:<br>
 * <ul>
 *   <li>VInt fileCount</li>
 *   <li>{Directory}
 *       fileCount entries with the following structure:</li>
 *     <ul>
 *       <li>long dataOffset</li>
 *       <li>String fileName</li>
 *     </ul>
 *   <li>{File Data}
 *       fileCount entries with the raw data of the corresponding file</li>
 * </ul>
 *
 * The fileCount integer indicates how many files are contained in this compound
 * file. The {directory} that follows has that many entries. Each directory entry
 * contains a long pointer to the start of this file's data section, and a String
 * with that file's name.
 */
final class CompoundFileWriter {

  private static final class FileEntry {
    /** source file */
    String file;

    /** temporary holder for the start of directory entry for this file */
    long directoryOffset;

    /** temporary holder for the start of this file's data section */
    long dataOffset;
  }

  private Directory directory;
  private String fileName;
  private HashSet<String> ids;
  private LinkedList<FileEntry> entries;
  private boolean merged = false;
  private SegmentMerger.CheckAbort checkAbort;

  /** Create the compound stream in the specified file. The file name is the
   *  entire name (no extensions are added).
   *  @throws NullPointerException if <code>dir</code> or <code>name</code> is null
   */
  public CompoundFileWriter(Directory dir, String name) {
    this(dir, name, null);
  }

  CompoundFileWriter(Directory dir, String name, SegmentMerger.CheckAbort checkAbort) {
    if (dir == null)
      throw new NullPointerException("directory cannot be null");
    if (name == null)
      throw new NullPointerException("name cannot be null");
    this.checkAbort = checkAbort;
    directory = dir;
    fileName = name;
    ids = new HashSet<String>();
    entries = new LinkedList<FileEntry>();
  }

  /** Returns the directory of the compound file. */
  public Directory getDirectory() {
    return directory;
  }

  /** Returns the name of the compound file. */
  public String getName() {
    return fileName;
  }

  /** Add a source stream. <code>file</code> is the string by which the
   *  sub-stream will be known in the compound stream.
   *
   *  @throws IllegalStateException if this writer is closed
   *  @throws NullPointerException if <code>file</code> is null
   *  @throws IllegalArgumentException if a file with the same name
   *          has been added already
   */
  public void addFile(String file) {
    if (merged)
      throw new IllegalStateException(
          "Can't add extensions after merge has been called");

    if (file == null)
      throw new NullPointerException(
          "file cannot be null");

    if (! ids.add(file))
      throw new IllegalArgumentException(
          "File " + file + " already added");

    FileEntry entry = new FileEntry();
    entry.file = file;
    entries.add(entry);
  }

  /** Merge files with the extensions added up to now.
   *  All files with these extensions are combined sequentially into the
   *  compound stream. After successful merge, the source files
   *  are deleted.
   *  @throws IllegalStateException if close() had been called before or
   *          if no file has been added to this object
   */
  public void close() throws IOException {
    if (merged)
      throw new IllegalStateException(
          "Merge already performed");

    if (entries.isEmpty())
      throw new IllegalStateException(
          "No entries to merge have been defined");

    merged = true;

    // open the compound stream
    IndexOutput os = null;
    try {
      os = directory.createOutput(fileName);

      // Write the number of entries
      os.writeVInt(entries.size());

      // Write the directory with all offsets at 0.
      // Remember the positions of directory entries so that we can
      // adjust the offsets later
      long totalSize = 0;
      for (FileEntry fe : entries) {
        fe.directoryOffset = os.getFilePointer();
        os.writeLong(0);    // for now
        os.writeString(fe.file);
        totalSize += directory.fileLength(fe.file);
      }

      // Pre-allocate size of file as optimization --
      // this can potentially help IO performance as
      // we write the file and also later during
      // searching.  It also uncovers a disk-full
      // situation earlier and hopefully without
      // actually filling disk to 100%:
      final long finalLength = totalSize+os.getFilePointer();
      os.setLength(finalLength);

      // Open the files and copy their data into the stream.
      // Remember the locations of each file's data section.
      byte buffer[] = new byte[16384];
      for (FileEntry fe : entries) {
        fe.dataOffset = os.getFilePointer();
        copyFile(fe, os, buffer);
      }

      // Write the data offsets into the directory of the compound stream
      for (FileEntry fe : entries) {
        os.seek(fe.directoryOffset);
        os.writeLong(fe.dataOffset);
      }

      assert finalLength == os.length();

      // Close the output stream. Set the os to null before trying to
      // close so that if an exception occurs during the close, the
      // finally clause below will not attempt to close the stream
      // the second time.
      IndexOutput tmp = os;
      os = null;
      tmp.close();

    } finally {
      if (os != null) try { os.close(); } catch (IOException e) { }
    }
  }

  /** Copy the contents of the file with specified extension into the
   *  provided output stream. Use the provided buffer for moving data
   *  to reduce memory allocation.
   */
  private void copyFile(FileEntry source, IndexOutput os, byte buffer[]) throws IOException {
    IndexInput is = null;
    try {
      long startPtr = os.getFilePointer();

      is = directory.openInput(source.file);
      long length = is.length();
      long remainder = length;
      int chunk = buffer.length;

      while(remainder > 0) {
        int len = (int) Math.min(chunk, remainder);
        is.readBytes(buffer, 0, len, false);
        os.writeBytes(buffer, len);
        remainder -= len;
        if (checkAbort != null)
          // Roughly every 2 MB we will check if
          // it's time to abort
          checkAbort.work(80);
      }

      // Verify that remainder is 0
      if (remainder != 0)
        throw new IOException(
            "Non-zero remainder length after copying: " + remainder
            + " (id: " + source.file + ", length: " + length
            + ", buffer size: " + chunk + ")");

      // Verify that the output length diff is equal to original file
      long endPtr = os.getFilePointer();
      long diff = endPtr - startPtr;
      if (diff != length)
        throw new IOException(
            "Difference in the output file offsets " + diff
            + " does not match the original file length " + length);

    } finally {
      if (is != null) is.close();
    }
  }
}
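A hedged usage sketch matching the format described in the class javadoc: register the source files, then close() writes the entry count, the directory of offsets, and the raw file data in one pass. The file names are hypothetical and assumed to already exist in the directory; like the reader, this class is package-private, so the sketch sits in org.apache.lucene.index:

// CompoundFileWriterExample.java: bundle two existing files into one .cfs.
package org.apache.lucene.index; // required: CompoundFileWriter is package-private

import org.apache.lucene.store.Directory;

public final class CompoundFileWriterExample {
  public static void bundle(Directory dir) throws Exception {
    CompoundFileWriter cfw = new CompoundFileWriter(dir, "_1.cfs");
    cfw.addFile("_1.fnm"); // assumed to exist in dir
    cfw.addFile("_1.fdt"); // assumed to exist in dir
    cfw.close();           // writes VInt count, directory entries, then file data
  }
}
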
@@ -0,0 +1,409 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ThreadInterruptedException;

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;

/** A {@link MergeScheduler} that runs each merge using a
 *  separate thread, up until a maximum number of threads
 *  ({@link #setMaxThreadCount}); beyond that point, when a merge
 *  is needed, the thread(s) that are updating the index will
 *  pause until one or more merges completes.  This is a
 *  simple way to use concurrency in the indexing process
 *  without having to create and manage application level
 *  threads. */

public class ConcurrentMergeScheduler extends MergeScheduler {

  private int mergeThreadPriority = -1;

  protected List<MergeThread> mergeThreads = new ArrayList<MergeThread>();

  // Max number of threads allowed to be merging at once
  private int maxThreadCount = 1;

  protected Directory dir;

  private boolean closed;
  protected IndexWriter writer;
  protected int mergeThreadCount;

  public ConcurrentMergeScheduler() {
    if (allInstances != null) {
      // Only for testing
      addMyself();
    }
  }

  /** Sets the max # simultaneous threads that may be
   *  running.  If a merge is necessary yet we already have
   *  this many threads running, the incoming thread (that
   *  is calling add/updateDocument) will block until
   *  a merge thread has completed. */
  public void setMaxThreadCount(int count) {
    if (count < 1)
      throw new IllegalArgumentException("count should be at least 1");
    maxThreadCount = count;
  }

  /** Get the max # simultaneous threads that may be
   *  running.
   *  @see #setMaxThreadCount */
  public int getMaxThreadCount() {
    return maxThreadCount;
  }

  /** Return the priority that merge threads run at.  By
   *  default the priority is 1 plus the priority of (ie,
   *  slightly higher priority than) the first thread that
   *  calls merge. */
  public synchronized int getMergeThreadPriority() {
    initMergeThreadPriority();
    return mergeThreadPriority;
  }

  /** Set the priority that merge threads run at. */
  public synchronized void setMergeThreadPriority(int pri) {
    if (pri > Thread.MAX_PRIORITY || pri < Thread.MIN_PRIORITY)
      throw new IllegalArgumentException("priority must be in range " + Thread.MIN_PRIORITY + " .. " + Thread.MAX_PRIORITY + " inclusive");
    mergeThreadPriority = pri;

    final int numThreads = mergeThreadCount();
    for(int i=0;i<numThreads;i++) {
      MergeThread merge = mergeThreads.get(i);
      merge.setThreadPriority(pri);
    }
  }

  private boolean verbose() {
    return writer != null && writer.verbose();
  }

  private void message(String message) {
    if (verbose())
      writer.message("CMS: " + message);
  }

  private synchronized void initMergeThreadPriority() {
    if (mergeThreadPriority == -1) {
      // Default to slightly higher priority than our
      // calling thread
      mergeThreadPriority = 1+Thread.currentThread().getPriority();
      if (mergeThreadPriority > Thread.MAX_PRIORITY)
        mergeThreadPriority = Thread.MAX_PRIORITY;
    }
  }

  @Override
  public void close() {
    closed = true;
  }

  public synchronized void sync() {
    while(mergeThreadCount() > 0) {
      if (verbose())
        message("now wait for threads; currently " + mergeThreads.size() + " still running");
      final int count = mergeThreads.size();
      if (verbose()) {
        for(int i=0;i<count;i++)
          message("    " + i + ": " + mergeThreads.get(i));
      }

      try {
        wait();
      } catch (InterruptedException ie) {
        throw new ThreadInterruptedException(ie);
      }
    }
  }

  private synchronized int mergeThreadCount() {
    int count = 0;
    final int numThreads = mergeThreads.size();
    for(int i=0;i<numThreads;i++)
      if (mergeThreads.get(i).isAlive())
        count++;
    return count;
  }

  @Override
  public void merge(IndexWriter writer) throws CorruptIndexException, IOException {

    assert !Thread.holdsLock(writer);

    this.writer = writer;

    initMergeThreadPriority();

    dir = writer.getDirectory();

    // First, quickly run through the newly proposed merges
    // and add any orthogonal merges (ie a merge not
    // involving segments already pending to be merged) to
    // the queue.  If we are way behind on merging, many of
    // these newly proposed merges will likely already be
    // registered.

    if (verbose()) {
      message("now merge");
      message("  index: " + writer.segString());
    }

    // Iterate, pulling from the IndexWriter's queue of
    // pending merges, until it's empty:
    while(true) {

      // TODO: we could be careful about which merges to do in
      // the BG (eg maybe the "biggest" ones) vs FG, which
      // merges to do first (the easiest ones?), etc.

      MergePolicy.OneMerge merge = writer.getNextMerge();
      if (merge == null) {
        if (verbose())
          message("  no more merges pending; now return");
        return;
      }

      // We do this w/ the primary thread to keep
      // deterministic assignment of segment names
      writer.mergeInit(merge);

      boolean success = false;
      try {
        synchronized(this) {
          final MergeThread merger;
          while (mergeThreadCount() >= maxThreadCount) {
            if (verbose())
              message("    too many merge threads running; stalling...");
            try {
              wait();
            } catch (InterruptedException ie) {
              throw new ThreadInterruptedException(ie);
            }
          }

          if (verbose())
            message("  consider merge " + merge.segString(dir));

          assert mergeThreadCount() < maxThreadCount;

          // OK to spawn a new merge thread to handle this
          // merge:
          merger = getMergeThread(writer, merge);
          mergeThreads.add(merger);
          if (verbose())
            message("    launch new thread [" + merger.getName() + "]");

          merger.start();
          success = true;
        }
      } finally {
        if (!success) {
          writer.mergeFinish(merge);
        }
      }
    }
  }

  /** Does the actual merge, by calling {@link IndexWriter#merge} */
  protected void doMerge(MergePolicy.OneMerge merge) throws IOException {
    writer.merge(merge);
  }

  /** Create and return a new MergeThread */
  protected synchronized MergeThread getMergeThread(IndexWriter writer, MergePolicy.OneMerge merge) throws IOException {
    final MergeThread thread = new MergeThread(writer, merge);
    thread.setThreadPriority(mergeThreadPriority);
    thread.setDaemon(true);
    thread.setName("Lucene Merge Thread #" + mergeThreadCount++);
    return thread;
  }

  protected class MergeThread extends Thread {

    IndexWriter writer;
    MergePolicy.OneMerge startMerge;
    MergePolicy.OneMerge runningMerge;

    public MergeThread(IndexWriter writer, MergePolicy.OneMerge startMerge) throws IOException {
      this.writer = writer;
      this.startMerge = startMerge;
    }

    public synchronized void setRunningMerge(MergePolicy.OneMerge merge) {
      runningMerge = merge;
    }

    public synchronized MergePolicy.OneMerge getRunningMerge() {
      return runningMerge;
    }

    public void setThreadPriority(int pri) {
      try {
        setPriority(pri);
      } catch (NullPointerException npe) {
        // Strangely, Sun's JDK 1.5 on Linux sometimes
        // throws NPE out of here...
      } catch (SecurityException se) {
        // Ignore this because we will still run fine with
        // normal thread priority
      }
    }

    @Override
    public void run() {

      // First time through the while loop we do the merge
      // that we were started with:
      MergePolicy.OneMerge merge = this.startMerge;

      try {

        if (verbose())
          message("  merge thread: start");

        while(true) {
          setRunningMerge(merge);
          doMerge(merge);

          // Subsequent times through the loop we do any new
          // merge that writer says is necessary:
          merge = writer.getNextMerge();
          if (merge != null) {
            writer.mergeInit(merge);
            if (verbose())
              message("  merge thread: do another merge " + merge.segString(dir));
          } else
            break;
        }

        if (verbose())
          message("  merge thread: done");

      } catch (Throwable exc) {

        // Ignore the exception if it was due to abort:
        if (!(exc instanceof MergePolicy.MergeAbortedException)) {
          if (!suppressExceptions) {
            // suppressExceptions is normally only set during
            // testing.
            anyExceptions = true;
            handleMergeException(exc);
          }
        }
      } finally {
        synchronized(ConcurrentMergeScheduler.this) {
          ConcurrentMergeScheduler.this.notifyAll();
          boolean removed = mergeThreads.remove(this);
          assert removed;
        }
      }
    }

    @Override
    public String toString() {
      MergePolicy.OneMerge merge = getRunningMerge();
      if (merge == null)
        merge = startMerge;
      return "merge thread: " + merge.segString(dir);
    }
  }

  /** Called when an exception is hit in a background merge
   *  thread */
  protected void handleMergeException(Throwable exc) {
    try {
      // When an exception is hit during merge, IndexWriter
      // removes any partial files and then allows another
      // merge to run.  If whatever caused the error is not
      // transient then the exception will keep happening,
      // so, we sleep here to avoid saturating CPU in such
      // cases:
      Thread.sleep(250);
    } catch (InterruptedException ie) {
      throw new ThreadInterruptedException(ie);
    }
    throw new MergePolicy.MergeException(exc, dir);
  }

  static boolean anyExceptions = false;

  /** Used for testing */
  public static boolean anyUnhandledExceptions() {
    if (allInstances == null) {
      throw new RuntimeException("setTestMode() was not called; often this is because your test case's setUp method fails to call super.setUp in LuceneTestCase");
    }
    synchronized(allInstances) {
      final int count = allInstances.size();
      // Make sure all outstanding threads are done so we see
      // any exceptions they may produce:
      for(int i=0;i<count;i++)
        allInstances.get(i).sync();
      boolean v = anyExceptions;
      anyExceptions = false;
      return v;
    }
  }

  public static void clearUnhandledExceptions() {
    synchronized(allInstances) {
      anyExceptions = false;
    }
  }

  /** Used for testing */
  private void addMyself() {
    synchronized(allInstances) {
      final int size = allInstances.size();
      int upto = 0;
      for(int i=0;i<size;i++) {
        final ConcurrentMergeScheduler other = allInstances.get(i);
        if (!(other.closed && 0 == other.mergeThreadCount()))
          // Keep this one for now: it still has threads or
          // may spawn new threads
          allInstances.set(upto++, other);
      }
      allInstances.subList(upto, allInstances.size()).clear();
      allInstances.add(this);
    }
  }

  private boolean suppressExceptions;

  /** Used for testing */
  void setSuppressExceptions() {
    suppressExceptions = true;
  }

  /** Used for testing */
  void clearSuppressExceptions() {
    suppressExceptions = false;
  }

  /** Used for testing */
  private static List<ConcurrentMergeScheduler> allInstances;
  public static void setTestMode() {
    allInstances = new ArrayList<ConcurrentMergeScheduler>();
  }
}
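A hedged sketch of tuning this scheduler on an IndexWriter of the same era (ConcurrentMergeScheduler was already the default scheduler at this point, so constructing one explicitly is only needed to change its settings; the analyzer and thread count below are illustrative assumptions, not taken from this patch):

// CmsExample.java: allow up to 3 background merge threads.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public final class CmsExample {
  public static IndexWriter open(Directory dir) throws Exception {
    IndexWriter w = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30),
        IndexWriter.MaxFieldLength.UNLIMITED);
    ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
    cms.setMaxThreadCount(3); // add/updateDocument stalls once 3 merges are running
    w.setMergeScheduler(cms);
    return w;                 // merges now run on up to 3 daemon threads
  }
}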