LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards branch is now included in the svn repository using "svn copy" after release.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@924207 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2010-03-17 10:24:07 +00:00
parent 5023a08ace
commit 675597141b
753 changed files with 158526 additions and 100 deletions


@@ -238,9 +238,13 @@ Optimizations
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
into core, and moved the ICU-based collation support into contrib/icu.
(Robert Muir)
* LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards branch
is now included in the svn repository using "svn copy" after release.
(Uwe Schindler)
Test Cases


@@ -0,0 +1,13 @@
This folder contains the src/ folder of the previous Lucene major version.
The test-backwards ANT task compiles the previous version's core classes and then compiles
its tests against those classes. The compiled test classes are then run against the new
lucene-core.jar file.
After branching a new Lucene major version (branch name "lucene_X_Y") do the following:
* svn rm backwards/src/
* svn cp https://svn.apache.org/repos/asf/lucene/java/branches/lucene_X_Y/src/ backwards/src/
* Check that everything is correct: The backwards folder should contain a src/ folder
that now contains java, test, demo,.... The files should be the ones from the branch.
* Run "ant test-backwards"


@@ -0,0 +1,253 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="lucene-demo" default="compile-demo" basedir=".">
<dirname file="${ant.file.common}" property="common.dir"/>
<property name="version" value="@PLACEHOLDER_version@"/>
<property name="javac.source" value="@PLACEHOLDER_javac.source@"/>
<property name="javac.target" value="@PLACEHOLDER_javac.target@"/>
<property name="build.dir" location="build"/>
<property name="core.name" value="lucene-core-${version}"/>
<property name="demo.name" value="lucene-demos-${version}"/>
<property name="demo.war.name" value="luceneweb"/>
<property name="manifest.file" location="${build.dir}/MANIFEST.MF"/>
<!-- Build classpath -->
<path id="classpath">
<pathelement location="${common.dir}/${core.name}.jar"/>
</path>
<path id="demo.classpath">
<path refid="classpath"/>
<pathelement location="${build.dir}/classes/demo"/>
</path>
<available
property="jar.core.present"
type="file"
file="${common.dir}/${core.name}.jar"
/>
<target name="jar.core-check">
<fail unless="jar.core.present">
##################################################################
${common.dir}/${core.name}.jar not found.
##################################################################
</fail>
</target>
<!-- ================================================================== -->
<!-- J A R -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="jar-demo" depends="compile-demo"
description="Build demo jar file">
<sequential>
<build-manifest/>
<jar
destfile="${demo.name}.jar"
basedir="${build.dir}/classes/demo"
excludes="**/*.java"
manifest="${manifest.file}">
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
</metainf>
</jar>
</sequential>
</target>
<target name="war-demo" depends="jar-demo"
description="Build demo war file">
<sequential>
<build-manifest/>
<war destfile="${demo.war.name}.war"
webxml="src/jsp/WEB-INF/web.xml"
manifest="${manifest.file}">
<fileset dir="src/jsp" excludes="WEB-INF/web.xml"/>
<lib dir="." includes="${demo.name}.jar"/>
<lib dir="." includes="${core.name}.jar"/>
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
</metainf>
</war>
</sequential>
</target>
<!-- ================================================================== -->
<!-- B U I L D D E M O -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="compile-demo" depends="jar.core-check"
description="Compile demo classes">
<mkdir dir="${build.dir}/classes/demo"/>
<compile
srcdir="src/demo"
destdir="${build.dir}/classes/demo">
<classpath refid="demo.classpath"/>
</compile>
</target>
<target name="clean"
description="Removes contents of build directory">
<delete dir="${build.dir}"/>
<delete dir="${common.dir}/demo-text-dir"/>
<delete dir="${common.dir}/demo-html-dir"/>
</target>
<!-- ================================================================== -->
<!-- R U N T E X T I N D E X I N G D E M O -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="demo-index-text" depends="jar-demo"
description="Run text indexing demo (index the sources of the demo).">
<echo>----- (1) Prepare dir ----- </echo>
<echo>cd ${common.dir} </echo>
<echo>rmdir demo-text-dir </echo>
<delete dir="${common.dir}/demo-text-dir"/>
<echo>mkdir demo-text-dir </echo>
<mkdir dir="${common.dir}/demo-text-dir"/>
<echo>cd demo-text-dir </echo>
<echo>----- (2) Index the files located under ${common.dir}/src ----- </echo>
<invoke-java class="IndexFiles" params="${common.dir}/src/demo" paramsDisplay="../src/demo" type="text"/>
</target>
<!-- ================================================================== -->
<!-- R U N T E X T S E A R C H D E M O -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="demo-search-text" depends="jar-demo"
description="Run interactive search demo.">
<echo>----- Interactive search ----- </echo>
<echo>cd demo-text-dir </echo>
<invoke-java class="SearchFiles" params="-index index" paramsDisplay="-index index" type="text"/>
</target>
<!-- ================================================================== -->
<!-- R U N H T M L I N D E X I N G D E M O -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="demo-index-html" depends="jar-demo"
description="Run html indexing demo (index the javadocs).">
<echo>----- (1) Prepare dir ----- </echo>
<echo>cd ${common.dir} </echo>
<echo>rmdir demo-html-dir </echo>
<delete dir="${common.dir}/demo-html-dir"/>
<echo>mkdir demo-html-dir </echo>
<mkdir dir="${common.dir}/demo-html-dir"/>
<echo>cd demo-html-dir </echo>
<echo>----- (2) Index the files located under ${common.dir}/docs/api ----- </echo>
<invoke-java class="IndexFiles" params="${common.dir}/docs/api" paramsDisplay="../docs/api" type="html"/>
</target>
<!-- ================================================================== -->
<!-- R U N H T M L S E A R C H D E M O -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="demo-search-html" depends="jar-demo"
description="Run interactive search demo.">
<echo>----- Interactive search ----- </echo>
<echo>cd demo-html-dir </echo>
<invoke-java class="SearchFiles" params="-index index" paramsDisplay="-index index" type="html"/>
</target>
<!--+
| M A C R O S
+-->
<macrodef name="build-manifest" description="Builds a manifest file">
<sequential>
<manifest file="${manifest.file}">
<attribute name="Specification-Title" value="Lucene Search Engine: demos"/>
<!-- spec version must match "digit+{.digit+}*" -->
<attribute name="Specification-Version" value="${version}"/>
<attribute name="Specification-Vendor"
value="The Apache Software Foundation"/>
<attribute name="Implementation-Title" value="org.apache.lucene"/>
<!-- impl version can be any string -->
<attribute name="Implementation-Version"
value="${version}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
<attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
<attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
</manifest>
</sequential>
</macrodef>
<macrodef name="compile">
<attribute name="srcdir"/>
<attribute name="destdir"/>
<element name="nested" implicit="yes" optional="yes"/>
<sequential>
<mkdir dir="@{destdir}"/>
<javac
srcdir="@{srcdir}"
destdir="@{destdir}"
deprecation="off"
debug="on"
source="${javac.source}"
target="${javac.target}">
<nested/>
</javac>
</sequential>
</macrodef>
<macrodef name="invoke-java">
<attribute name="class"/>
<attribute name="params"/>
<attribute name="paramsDisplay"/>
<attribute name="type"/>
<sequential>
<echo>java -classpath "../${core.name}.jar;../${demo.name}.jar" org.apache.lucene.demo.@{class} @{paramsDisplay} </echo>
<java classname="org.apache.lucene.demo.@{class}"
dir="${common.dir}/demo-@{type}-dir"
fork="true"
failonerror="true"
maxmemory="128m"
>
<arg value="@{params}"/>
<classpath>
<pathelement location="${common.dir}/${core.name}.jar"/>
<pathelement location="${common.dir}/${demo.name}.jar"/>
</classpath>
</java>
</sequential>
</macrodef>
</project>


@@ -0,0 +1,66 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
//import org.apache.lucene.index.Term;
/** Deletes documents from an index that contain a given term. */
public class DeleteFiles {
private DeleteFiles() {} // no instances; static entry point only
/** Deletes documents from an index that contain a given term. */
public static void main(String[] args) {
String usage = "java org.apache.lucene.demo.DeleteFiles <unique_term>";
if (args.length == 0) {
System.err.println("Usage: " + usage);
System.exit(1);
}
try {
Directory directory = FSDirectory.open(new File("index"));
IndexReader reader = IndexReader.open(directory, false); // we don't want read-only because we are about to delete
Term term = new Term("path", args[0]);
int deleted = reader.deleteDocuments(term);
System.out.println("deleted " + deleted +
" documents containing " + term);
// one can also delete documents by their internal id:
/*
for (int i = 0; i < reader.maxDoc(); i++) {
System.out.println("Deleting document with id " + i);
reader.delete(i);
}*/
reader.close();
directory.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
}
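
For illustration only (not part of this commit): a minimal sketch of the same deletion done through IndexWriter rather than a writable IndexReader. The "index" directory and the "path" field follow the demo's conventions; the class name is invented for this note.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

class DeleteByTermSketch {
  public static void main(String[] args) throws Exception {
    Directory directory = FSDirectory.open(new File("index"));
    // IndexWriter buffers deletes and applies them when the writer is closed.
    IndexWriter writer = new IndexWriter(directory,
        new StandardAnalyzer(Version.LUCENE_CURRENT),
        false, IndexWriter.MaxFieldLength.LIMITED);
    writer.deleteDocuments(new Term("path", args[0])); // delete documents whose "path" matches
    writer.close();
    directory.close();
  }
}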


@@ -0,0 +1,71 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/** A utility for making Lucene Documents from a File. */
public class FileDocument {
/** Makes a document for a File.
<p>
The document has three fields:
<ul>
<li><code>path</code>--containing the pathname of the file, as a stored,
untokenized field;
<li><code>modified</code>--containing the last modified date of the file as
a field created by <a
href="lucene.document.DateTools.html">DateTools</a>; and
<li><code>contents</code>--containing the full contents of the file, as a
Reader field.
</ul>
*/
public static Document Document(File f)
throws java.io.FileNotFoundException {
// make a new, empty document
Document doc = new Document();
// Add the path of the file as a field named "path". Use a field that is
// indexed (i.e. searchable), but don't tokenize the field into words.
doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the last modified date of the file as a field named "modified". Use
// a field that is indexed (i.e. searchable), but don't tokenize the field
// into words.
doc.add(new Field("modified",
DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the contents of the file to a field named "contents". Specify a Reader,
// so that the text of the file is tokenized and indexed, but not stored.
// Note that FileReader expects the file to be in the system's default encoding.
// If that's not the case, searching for special characters will fail.
doc.add(new Field("contents", new FileReader(f)));
// return the document
return doc;
}
private FileDocument() {}
}
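
A hedged follow-up sketch, not present in the sources above: converting the stored "modified" value back into a Date with the same DateTools class (Resolution.MINUTE is what FileDocument stores). The helper name is made up for this note.

import java.text.ParseException;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;

class ModifiedFieldSketch {
  // Reads the "modified" field written by FileDocument and parses it back to a Date.
  static Date lastModified(Document doc) throws ParseException {
    String stored = doc.get("modified"); // e.g. "201003171024" at MINUTE resolution
    return DateTools.stringToDate(stored);
  }
}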


@@ -0,0 +1,86 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import org.apache.lucene.document.*;
import org.apache.lucene.demo.html.HTMLParser;
/** A utility for making Lucene Documents for HTML documents. */
public class HTMLDocument {
static char dirSep = System.getProperty("file.separator").charAt(0);
public static String uid(File f) {
// Append path and date into a string in such a way that lexicographic
// sorting gives the same results as a walk of the file hierarchy. Thus
// null (\u0000) is used both to separate directory components and to
// separate the path from the date.
return f.getPath().replace(dirSep, '\u0000') +
"\u0000" +
DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
}
public static String uid2url(String uid) {
String url = uid.replace('\u0000', '/'); // replace nulls with slashes
return url.substring(0, url.lastIndexOf('/')); // remove date from end
}
public static Document Document(File f)
throws IOException, InterruptedException {
// make a new, empty document
Document doc = new Document();
// Add the url as a field named "path". Use a field that is
// indexed (i.e. searchable), but don't tokenize the field into words.
doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES,
Field.Index.NOT_ANALYZED));
// Add the last modified date of the file as a field named "modified".
// Use a field that is indexed (i.e. searchable), but don't tokenize
// the field into words.
doc.add(new Field("modified",
DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the uid as a field, so that the index can be incrementally maintained.
// This field is not stored with the document; it is indexed, but it is not
// tokenized prior to indexing.
doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
FileInputStream fis = new FileInputStream(f);
HTMLParser parser = new HTMLParser(fis);
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.
doc.add(new Field("contents", parser.getReader()));
// Add the summary as a field that is stored and returned with
// hit documents for display.
doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));
// Add the title as a field so that it can be searched and is stored.
doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
// return the document
return doc;
}
private HTMLDocument() {}
}
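
Purely illustrative (assumed, not in this commit): the uid round trip described in the comments above. uid() joins path components and the modification time with '\u0000' so lexicographic order matches a file-system walk, and uid2url() strips the date and restores '/' separators.

import java.io.File;
import org.apache.lucene.demo.HTMLDocument;

class UidSketch {
  public static void main(String[] args) {
    File f = new File("docs/api/index.html");
    String uid = HTMLDocument.uid(f);       // path + '\u0000' + last-modified time
    String url = HTMLDocument.uid2url(uid); // "docs/api/index.html" again
    System.out.println(uid.replace('\u0000', '|') + " -> " + url);
  }
}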


@@ -0,0 +1,100 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
/** Index all text files under a directory. */
public class IndexFiles {
private IndexFiles() {}
static final File INDEX_DIR = new File("index");
/** Index all text files under a directory. */
public static void main(String[] args) {
String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";
if (args.length == 0) {
System.err.println("Usage: " + usage);
System.exit(1);
}
if (INDEX_DIR.exists()) {
System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
System.exit(1);
}
final File docDir = new File(args[0]);
if (!docDir.exists() || !docDir.canRead()) {
System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
System.out.println("Indexing to directory '" +INDEX_DIR+ "'...");
indexDocs(writer, docDir);
System.out.println("Optimizing...");
writer.optimize();
writer.close();
Date end = new Date();
System.out.println(end.getTime() - start.getTime() + " total milliseconds");
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
static void indexDocs(IndexWriter writer, File file)
throws IOException {
// do not try to index files that cannot be read
if (file.canRead()) {
if (file.isDirectory()) {
String[] files = file.list();
// an IO error could occur
if (files != null) {
for (int i = 0; i < files.length; i++) {
indexDocs(writer, new File(file, files[i]));
}
}
} else {
System.out.println("adding " + file);
try {
writer.addDocument(FileDocument.Document(file));
}
// At least on Windows, some temporary files raise this exception with an "access denied" message;
// checking if the file can be read doesn't help.
catch (FileNotFoundException fnfe) {
;
}
}
}
}
}
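
A small usage sketch (an assumption, not part of the commit): after running IndexFiles, the resulting index can be opened read-only to check how many documents were written.

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

class IndexStatsSketch {
  public static void main(String[] args) throws Exception {
    // Open the "index" directory created by IndexFiles in read-only mode.
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("index")), true);
    System.out.println("documents indexed: " + reader.numDocs());
    reader.close();
  }
}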


@@ -0,0 +1,168 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;
import java.util.Date;
import java.util.Arrays;
/** Indexer for HTML files. */
public class IndexHTML {
private IndexHTML() {}
private static boolean deleting = false; // true during deletion pass
private static IndexReader reader; // existing index
private static IndexWriter writer; // new index being built
private static TermEnum uidIter; // document id iterator
/** Indexer for HTML files.*/
public static void main(String[] argv) {
try {
File index = new File("index");
boolean create = false;
File root = null;
String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
if (argv.length == 0) {
System.err.println("Usage: " + usage);
return;
}
for (int i = 0; i < argv.length; i++) {
if (argv[i].equals("-index")) { // parse -index option
index = new File(argv[++i]);
} else if (argv[i].equals("-create")) { // parse -create option
create = true;
} else if (i != argv.length-1) {
System.err.println("Usage: " + usage);
return;
} else
root = new File(argv[i]);
}
if(root == null) {
System.err.println("Specify directory to index");
System.err.println("Usage: " + usage);
return;
}
Date start = new Date();
if (!create) { // delete stale docs
deleting = true;
indexDocs(root, index, create);
}
writer = new IndexWriter(FSDirectory.open(index), new StandardAnalyzer(Version.LUCENE_CURRENT), create,
new IndexWriter.MaxFieldLength(1000000));
indexDocs(root, index, create); // add new docs
System.out.println("Optimizing index...");
writer.optimize();
writer.close();
Date end = new Date();
System.out.print(end.getTime() - start.getTime());
System.out.println(" total milliseconds");
} catch (Exception e) {
e.printStackTrace();
}
}
/* Walk directory hierarchy in uid order, while keeping uid iterator from
 * existing index in sync. Mismatches indicate one of: (a) old documents to
 * be deleted; (b) unchanged documents, to be left alone; or (c) new
 * documents, to be indexed.
 */
private static void indexDocs(File file, File index, boolean create)
throws Exception {
if (!create) { // incrementally update
reader = IndexReader.open(FSDirectory.open(index), false); // open existing index
uidIter = reader.terms(new Term("uid", "")); // init uid iterator
indexDocs(file);
if (deleting) { // delete rest of stale docs
while (uidIter.term() != null && uidIter.term().field() == "uid") {
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.deleteDocuments(uidIter.term());
uidIter.next();
}
deleting = false;
}
uidIter.close(); // close uid iterator
reader.close(); // close existing index
} else // don't have an existing index
indexDocs(file);
}
private static void indexDocs(File file) throws Exception {
if (file.isDirectory()) { // if a directory
String[] files = file.list(); // list its files
Arrays.sort(files); // sort the files
for (int i = 0; i < files.length; i++) // recursively index them
indexDocs(new File(file, files[i]));
} else if (file.getPath().endsWith(".html") || // index .html files
file.getPath().endsWith(".htm") || // index .htm files
file.getPath().endsWith(".txt")) { // index .txt files
if (uidIter != null) {
String uid = HTMLDocument.uid(file); // construct uid for doc
while (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) < 0) {
if (deleting) { // delete stale docs
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.deleteDocuments(uidIter.term());
}
uidIter.next();
}
if (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) == 0) {
uidIter.next(); // keep matching docs
} else if (!deleting) { // add new docs
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("path"));
writer.addDocument(doc);
}
} else { // creating a new index
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("path"));
writer.addDocument(doc); // add docs unconditionally
}
}
}
}


@@ -0,0 +1,313 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/** Simple command-line based search demo. */
public class SearchFiles {
/** Use the norms from one field for all fields. Norms are read into memory,
* using a byte of memory per document per searched field. This can cause
* search of large collections with a large number of fields to run out of
* memory. If all of the fields contain only a single token, the norms
* are all identical and a single norm vector may be shared. */
private static class OneNormsReader extends FilterIndexReader {
private String field;
public OneNormsReader(IndexReader in, String field) {
super(in);
this.field = field;
}
@Override
public byte[] norms(String field) throws IOException {
return in.norms(this.field);
}
}
private SearchFiles() {}
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
String usage =
"Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]";
usage += "\n\tSpecify 'false' for hitsPerPage to use streaming instead of paging search.";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
}
String index = "index";
String field = "contents";
String queries = null;
int repeat = 0;
boolean raw = false;
String normsField = null;
boolean paging = true;
int hitsPerPage = 10;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i+1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i+1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i+1];
i++;
} else if ("-repeat".equals(args[i])) {
repeat = Integer.parseInt(args[i+1]);
i++;
} else if ("-raw".equals(args[i])) {
raw = true;
} else if ("-norms".equals(args[i])) {
normsField = args[i+1];
i++;
} else if ("-paging".equals(args[i])) {
if (args[i+1].equals("false")) {
paging = false;
} else {
hitsPerPage = Integer.parseInt(args[i+1]);
if (hitsPerPage == 0) {
paging = false;
}
}
i++;
}
}
IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); // only searching, so read-only=true
if (normsField != null)
reader = new OneNormsReader(reader, normsField);
Searcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new FileReader(queries));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, analyzer);
while (true) {
if (queries == null) // prompt the user
System.out.println("Enter query: ");
String line = in.readLine();
if (line == null || line.length() == -1)
break;
line = line.trim();
if (line.length() == 0)
break;
Query query = parser.parse(line);
System.out.println("Searching for: " + query.toString(field));
if (repeat > 0) { // repeat & time as benchmark
Date start = new Date();
for (int i = 0; i < repeat; i++) {
searcher.search(query, null, 100);
}
Date end = new Date();
System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
}
if (paging) {
doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null);
} else {
doStreamingSearch(searcher, query);
}
}
reader.close();
}
/**
* This method uses a custom Collector implementation which simply prints out
* the docId and score of every matching document.
*
* This simulates the streaming search use case, where all hits are supposed to
* be processed, regardless of their relevance.
*/
public static void doStreamingSearch(final Searcher searcher, Query query) throws IOException {
Collector streamingHitCollector = new Collector() {
private Scorer scorer;
private int docBase;
// simply print docId and score of every matching document
@Override
public void collect(int doc) throws IOException {
System.out.println("doc=" + doc + docBase + " score=" + scorer.score());
}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
@Override
public void setNextReader(IndexReader reader, int docBase)
throws IOException {
this.docBase = docBase;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
};
searcher.search(query, streamingHitCollector);
}
/**
* This demonstrates a typical paging search scenario, where the search engine presents
* pages of size n to the user. The user can then go to the next page if interested in
* the next hits.
*
* When the query is executed for the first time, only enough results are collected
* to fill 5 result pages. If the user wants to page beyond this limit, the query
* is executed another time and all hits are collected.
*
*/
public static void doPagingSearch(BufferedReader in, Searcher searcher, Query query,
int hitsPerPage, boolean raw, boolean interactive) throws IOException {
// Collect enough docs to show 5 pages
TopScoreDocCollector collector = TopScoreDocCollector.create(
5 * hitsPerPage, false);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
int numTotalHits = collector.getTotalHits();
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
while (true) {
if (end > hits.length) {
System.out.println("Only results 1 - " + hits.length +" of " + numTotalHits + " total matching documents collected.");
System.out.println("Collect more (y/n) ?");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n') {
break;
}
collector = TopScoreDocCollector.create(numTotalHits, false);
searcher.search(query, collector);
hits = collector.topDocs().scoreDocs;
}
end = Math.min(hits.length, start + hitsPerPage);
for (int i = start; i < end; i++) {
if (raw) { // output raw format
System.out.println("doc="+hits[i].doc+" score="+hits[i].score);
continue;
}
Document doc = searcher.doc(hits[i].doc);
String path = doc.get("path");
if (path != null) {
System.out.println((i+1) + ". " + path);
String title = doc.get("title");
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i+1) + ". " + "No path for this document");
}
}
if (!interactive) {
break;
}
if (numTotalHits >= end) {
boolean quit = false;
while (true) {
System.out.print("Press ");
if (start - hitsPerPage >= 0) {
System.out.print("(p)revious page, ");
}
if (start + hitsPerPage < numTotalHits) {
System.out.print("(n)ext page, ");
}
System.out.println("(q)uit or enter number to jump to a page.");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0)=='q') {
quit = true;
break;
}
if (line.charAt(0) == 'p') {
start = Math.max(0, start - hitsPerPage);
break;
} else if (line.charAt(0) == 'n') {
if (start + hitsPerPage < numTotalHits) {
start+=hitsPerPage;
}
break;
} else {
int page = Integer.parseInt(line);
if ((page - 1) * hitsPerPage < numTotalHits) {
start = (page - 1) * hitsPerPage;
break;
} else {
System.out.println("No such page");
}
}
}
if (quit) break;
end = Math.min(numTotalHits, start + hitsPerPage);
}
}
}
}


@@ -0,0 +1,329 @@
package org.apache.lucene.demo.html;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
public class Entities {
static final Map<String,String> decoder = new HashMap<String,String>(300);
static final String[] encoder = new String[0x100];
static final String decode(String entity) {
if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon
entity = entity.substring(0, entity.length()-1);
if (entity.charAt(1) == '#') {
int start = 2;
int radix = 10;
if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
start++;
radix = 16;
}
Character c =
new Character((char)Integer.parseInt(entity.substring(start), radix));
return c.toString();
} else {
String s = decoder.get(entity);
if (s != null)
return s;
else return "";
}
}
public static final String encode(String s) {
int length = s.length();
StringBuffer buffer = new StringBuffer(length * 2);
for (int i = 0; i < length; i++) {
char c = s.charAt(i);
int j = (int)c;
if (j < 0x100 && encoder[j] != null) {
buffer.append(encoder[j]); // have a named encoding
buffer.append(';');
} else if (j < 0x80) {
buffer.append(c); // use ASCII value
} else {
buffer.append("&#"); // use numeric encoding
buffer.append((int)c);
buffer.append(';');
}
}
return buffer.toString();
}
static final void add(String entity, int value) {
decoder.put(entity, (new Character((char)value)).toString());
if (value < 0x100)
encoder[value] = entity;
}
static {
add("&nbsp", 160);
add("&iexcl", 161);
add("&cent", 162);
add("&pound", 163);
add("&curren", 164);
add("&yen", 165);
add("&brvbar", 166);
add("&sect", 167);
add("&uml", 168);
add("&copy", 169);
add("&ordf", 170);
add("&laquo", 171);
add("&not", 172);
add("&shy", 173);
add("&reg", 174);
add("&macr", 175);
add("&deg", 176);
add("&plusmn", 177);
add("&sup2", 178);
add("&sup3", 179);
add("&acute", 180);
add("&micro", 181);
add("&para", 182);
add("&middot", 183);
add("&cedil", 184);
add("&sup1", 185);
add("&ordm", 186);
add("&raquo", 187);
add("&frac14", 188);
add("&frac12", 189);
add("&frac34", 190);
add("&iquest", 191);
add("&Agrave", 192);
add("&Aacute", 193);
add("&Acirc", 194);
add("&Atilde", 195);
add("&Auml", 196);
add("&Aring", 197);
add("&AElig", 198);
add("&Ccedil", 199);
add("&Egrave", 200);
add("&Eacute", 201);
add("&Ecirc", 202);
add("&Euml", 203);
add("&Igrave", 204);
add("&Iacute", 205);
add("&Icirc", 206);
add("&Iuml", 207);
add("&ETH", 208);
add("&Ntilde", 209);
add("&Ograve", 210);
add("&Oacute", 211);
add("&Ocirc", 212);
add("&Otilde", 213);
add("&Ouml", 214);
add("&times", 215);
add("&Oslash", 216);
add("&Ugrave", 217);
add("&Uacute", 218);
add("&Ucirc", 219);
add("&Uuml", 220);
add("&Yacute", 221);
add("&THORN", 222);
add("&szlig", 223);
add("&agrave", 224);
add("&aacute", 225);
add("&acirc", 226);
add("&atilde", 227);
add("&auml", 228);
add("&aring", 229);
add("&aelig", 230);
add("&ccedil", 231);
add("&egrave", 232);
add("&eacute", 233);
add("&ecirc", 234);
add("&euml", 235);
add("&igrave", 236);
add("&iacute", 237);
add("&icirc", 238);
add("&iuml", 239);
add("&eth", 240);
add("&ntilde", 241);
add("&ograve", 242);
add("&oacute", 243);
add("&ocirc", 244);
add("&otilde", 245);
add("&ouml", 246);
add("&divide", 247);
add("&oslash", 248);
add("&ugrave", 249);
add("&uacute", 250);
add("&ucirc", 251);
add("&uuml", 252);
add("&yacute", 253);
add("&thorn", 254);
add("&yuml", 255);
add("&fnof", 402);
add("&Alpha", 913);
add("&Beta", 914);
add("&Gamma", 915);
add("&Delta", 916);
add("&Epsilon",917);
add("&Zeta", 918);
add("&Eta", 919);
add("&Theta", 920);
add("&Iota", 921);
add("&Kappa", 922);
add("&Lambda", 923);
add("&Mu", 924);
add("&Nu", 925);
add("&Xi", 926);
add("&Omicron",927);
add("&Pi", 928);
add("&Rho", 929);
add("&Sigma", 931);
add("&Tau", 932);
add("&Upsilon",933);
add("&Phi", 934);
add("&Chi", 935);
add("&Psi", 936);
add("&Omega", 937);
add("&alpha", 945);
add("&beta", 946);
add("&gamma", 947);
add("&delta", 948);
add("&epsilon",949);
add("&zeta", 950);
add("&eta", 951);
add("&theta", 952);
add("&iota", 953);
add("&kappa", 954);
add("&lambda", 955);
add("&mu", 956);
add("&nu", 957);
add("&xi", 958);
add("&omicron",959);
add("&pi", 960);
add("&rho", 961);
add("&sigmaf", 962);
add("&sigma", 963);
add("&tau", 964);
add("&upsilon",965);
add("&phi", 966);
add("&chi", 967);
add("&psi", 968);
add("&omega", 969);
add("&thetasym",977);
add("&upsih", 978);
add("&piv", 982);
add("&bull", 8226);
add("&hellip", 8230);
add("&prime", 8242);
add("&Prime", 8243);
add("&oline", 8254);
add("&frasl", 8260);
add("&weierp", 8472);
add("&image", 8465);
add("&real", 8476);
add("&trade", 8482);
add("&alefsym",8501);
add("&larr", 8592);
add("&uarr", 8593);
add("&rarr", 8594);
add("&darr", 8595);
add("&harr", 8596);
add("&crarr", 8629);
add("&lArr", 8656);
add("&uArr", 8657);
add("&rArr", 8658);
add("&dArr", 8659);
add("&hArr", 8660);
add("&forall", 8704);
add("&part", 8706);
add("&exist", 8707);
add("&empty", 8709);
add("&nabla", 8711);
add("&isin", 8712);
add("&notin", 8713);
add("&ni", 8715);
add("&prod", 8719);
add("&sum", 8721);
add("&minus", 8722);
add("&lowast", 8727);
add("&radic", 8730);
add("&prop", 8733);
add("&infin", 8734);
add("&ang", 8736);
add("&and", 8743);
add("&or", 8744);
add("&cap", 8745);
add("&cup", 8746);
add("&int", 8747);
add("&there4", 8756);
add("&sim", 8764);
add("&cong", 8773);
add("&asymp", 8776);
add("&ne", 8800);
add("&equiv", 8801);
add("&le", 8804);
add("&ge", 8805);
add("&sub", 8834);
add("&sup", 8835);
add("&nsub", 8836);
add("&sube", 8838);
add("&supe", 8839);
add("&oplus", 8853);
add("&otimes", 8855);
add("&perp", 8869);
add("&sdot", 8901);
add("&lceil", 8968);
add("&rceil", 8969);
add("&lfloor", 8970);
add("&rfloor", 8971);
add("&lang", 9001);
add("&rang", 9002);
add("&loz", 9674);
add("&spades", 9824);
add("&clubs", 9827);
add("&hearts", 9829);
add("&diams", 9830);
add("&quot", 34);
add("&amp", 38);
add("&lt", 60);
add("&gt", 62);
add("&OElig", 338);
add("&oelig", 339);
add("&Scaron", 352);
add("&scaron", 353);
add("&Yuml", 376);
add("&circ", 710);
add("&tilde", 732);
add("&ensp", 8194);
add("&emsp", 8195);
add("&thinsp", 8201);
add("&zwnj", 8204);
add("&zwj", 8205);
add("&lrm", 8206);
add("&rlm", 8207);
add("&ndash", 8211);
add("&mdash", 8212);
add("&lsquo", 8216);
add("&rsquo", 8217);
add("&sbquo", 8218);
add("&ldquo", 8220);
add("&rdquo", 8221);
add("&bdquo", 8222);
add("&dagger", 8224);
add("&Dagger", 8225);
add("&permil", 8240);
add("&lsaquo", 8249);
add("&rsaquo", 8250);
add("&euro", 8364);
}
}
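
For illustration only: encode() maps non-ASCII and markup characters to the named or numeric entities registered above, and decode() reverses a single entity. decode() is package-private, so this sketch assumes it sits in the same org.apache.lucene.demo.html package; the class name is invented.

package org.apache.lucene.demo.html;

class EntitiesSketch {
  public static void main(String[] args) {
    System.out.println(Entities.encode("Björk & 25° < 30°")); // Bj&ouml;rk &amp; 25&deg; &lt; 30&deg;
    System.out.println(Entities.decode("&eacute;"));          // prints "é"
  }
}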


@@ -0,0 +1,754 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.demo.html;
import java.io.*;
import java.util.Properties;
public class HTMLParser implements HTMLParserConstants {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
Properties metaTags=new Properties();
String currentMetaTag=null;
String currentMetaContent=null;
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
Reader pipeIn = null;
Writer pipeOut;
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
private class MyPipedInputStream extends PipedInputStream{
public MyPipedInputStream(){
super();
}
public MyPipedInputStream(PipedOutputStream src) throws IOException{
super(src);
}
public boolean full() throws IOException{
return this.available() >= PipedInputStream.PIPE_SIZE;
}
}
/**
* @deprecated Use HTMLParser(FileInputStream) instead
*/
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return title.toString().trim();
}
public Properties getMetaTags() throws IOException,
InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return metaTags;
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit) || sum.equals(""))
return tit;
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeInStream = new MyPipedInputStream();
pipeOutStream = new PipedOutputStream(pipeInStream);
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inStyle)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !(title.length() == 0)) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addMetaTag() {
metaTags.setProperty(currentMetaTag, currentMetaContent);
currentMetaTag = null;
currentMetaContent = null;
return;
}
void addSpace() throws IOException {
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
final public void HTMLDocument() throws ParseException, IOException {
Token t;
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ScriptStart:
case TagName:
case DeclName:
case Comment1:
case Comment2:
case Word:
case Entity:
case Space:
case Punct:
;
break;
default:
jj_la1[0] = jj_gen;
break label_1;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case TagName:
Tag();
afterTag = true;
break;
case DeclName:
t = Decl();
afterTag = true;
break;
case Comment1:
case Comment2:
CommentTag();
afterTag = true;
break;
case ScriptStart:
ScriptTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image); afterTag = false;
break;
case Entity:
t = jj_consume_token(Entity);
addText(Entities.decode(t.image)); afterTag = false;
break;
case Punct:
t = jj_consume_token(Punct);
addText(t.image); afterTag = false;
break;
case Space:
jj_consume_token(Space);
addSpace(); afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(0);
}
final public void Tag() throws ParseException, IOException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
String tagName = t1.image.toLowerCase();
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
;
break;
default:
jj_la1[2] = jj_gen;
break label_2;
}
t1 = jj_consume_token(ArgName);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgEquals:
jj_consume_token(ArgEquals);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
case ArgQuote1:
case ArgQuote2:
t2 = ArgValue();
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
if(inMetaTag &&
( t1.image.equalsIgnoreCase("name") ||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase();
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase();
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
break;
default:
jj_la1[3] = jj_gen;
;
}
break;
default:
jj_la1[4] = jj_gen;
;
}
}
jj_consume_token(TagEnd);
}
final public Token ArgValue() throws ParseException {
Token t = null;
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
t = jj_consume_token(ArgValue);
{if (true) return t;}
break;
default:
jj_la1[5] = jj_gen;
if (jj_2_1(2)) {
jj_consume_token(ArgQuote1);
jj_consume_token(CloseQuote1);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote1:
jj_consume_token(ArgQuote1);
t = jj_consume_token(Quote1Text);
jj_consume_token(CloseQuote1);
{if (true) return t;}
break;
default:
jj_la1[6] = jj_gen;
if (jj_2_2(2)) {
jj_consume_token(ArgQuote2);
jj_consume_token(CloseQuote2);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote2:
jj_consume_token(ArgQuote2);
t = jj_consume_token(Quote2Text);
jj_consume_token(CloseQuote2);
{if (true) return t;}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
}
}
}
throw new Error("Missing return statement in function");
}
final public Token Decl() throws ParseException {
Token t;
t = jj_consume_token(DeclName);
label_3:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
case ArgEquals:
case ArgValue:
case ArgQuote1:
case ArgQuote2:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
jj_consume_token(ArgName);
break;
case ArgValue:
case ArgQuote1:
case ArgQuote2:
ArgValue();
break;
case ArgEquals:
jj_consume_token(ArgEquals);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(TagEnd);
{if (true) return t;}
throw new Error("Missing return statement in function");
}
final public void CommentTag() throws ParseException {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case Comment1:
jj_consume_token(Comment1);
label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case CommentText1:
;
break;
default:
jj_la1[10] = jj_gen;
break label_4;
}
jj_consume_token(CommentText1);
}
jj_consume_token(CommentEnd1);
break;
case Comment2:
jj_consume_token(Comment2);
label_5:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case CommentText2:
;
break;
default:
jj_la1[11] = jj_gen;
break label_5;
}
jj_consume_token(CommentText2);
}
jj_consume_token(CommentEnd2);
break;
default:
jj_la1[12] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
final public void ScriptTag() throws ParseException {
jj_consume_token(ScriptStart);
label_6:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ScriptText:
;
break;
default:
jj_la1[13] = jj_gen;
break label_6;
}
jj_consume_token(ScriptText);
}
jj_consume_token(ScriptEnd);
}
private boolean jj_2_1(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_1(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(0, xla); }
}
private boolean jj_2_2(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_2(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(1, xla); }
}
private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
/** Generated Token Manager. */
public HTMLParserTokenManager token_source;
SimpleCharStream jj_input_stream;
/** Current token. */
public Token token;
/** Next token. */
public Token jj_nt;
private int jj_ntk;
private Token jj_scanpos, jj_lastpos;
private int jj_la;
private int jj_gen;
final private int[] jj_la1 = new int[14];
static private int[] jj_la1_0;
static {
jj_la1_init_0();
}
private static void jj_la1_init_0() {
jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[2];
private boolean jj_rescan = false;
private int jj_gc = 0;
/** Constructor with InputStream. */
public HTMLParser(java.io.InputStream stream) {
this(stream, null);
}
/** Constructor with InputStream and supplied encoding */
public HTMLParser(java.io.InputStream stream, String encoding) {
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source = new HTMLParserTokenManager(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Reinitialise. */
public void ReInit(java.io.InputStream stream) {
ReInit(stream, null);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream stream, String encoding) {
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source.ReInit(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Constructor. */
public HTMLParser(java.io.Reader stream) {
jj_input_stream = new SimpleCharStream(stream, 1, 1);
token_source = new HTMLParserTokenManager(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Reinitialise. */
public void ReInit(java.io.Reader stream) {
jj_input_stream.ReInit(stream, 1, 1);
token_source.ReInit(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Constructor with generated Token Manager. */
public HTMLParser(HTMLParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Reinitialise. */
public void ReInit(HTMLParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
private Token jj_consume_token(int kind) throws ParseException {
Token oldToken;
if ((oldToken = token).next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
if (token.kind == kind) {
jj_gen++;
if (++jj_gc > 100) {
jj_gc = 0;
for (int i = 0; i < jj_2_rtns.length; i++) {
JJCalls c = jj_2_rtns[i];
while (c != null) {
if (c.gen < jj_gen) c.first = null;
c = c.next;
}
}
}
return token;
}
token = oldToken;
jj_kind = kind;
throw generateParseException();
}
static private final class LookaheadSuccess extends java.lang.Error { }
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
private boolean jj_scan_token(int kind) {
if (jj_scanpos == jj_lastpos) {
jj_la--;
if (jj_scanpos.next == null) {
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
} else {
jj_lastpos = jj_scanpos = jj_scanpos.next;
}
} else {
jj_scanpos = jj_scanpos.next;
}
if (jj_rescan) {
int i = 0; Token tok = token;
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
if (tok != null) jj_add_error_token(kind, i);
}
if (jj_scanpos.kind != kind) return true;
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
return false;
}
/** Get the next Token. */
final public Token getNextToken() {
if (token.next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
jj_gen++;
return token;
}
/** Get the specific Token. */
final public Token getToken(int index) {
Token t = token;
for (int i = 0; i < index; i++) {
if (t.next != null) t = t.next;
else t = t.next = token_source.getNextToken();
}
return t;
}
private int jj_ntk() {
if ((jj_nt=token.next) == null)
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
else
return (jj_ntk = jj_nt.kind);
}
private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
private int[] jj_expentry;
private int jj_kind = -1;
private int[] jj_lasttokens = new int[100];
private int jj_endpos;
private void jj_add_error_token(int kind, int pos) {
if (pos >= 100) return;
if (pos == jj_endpos + 1) {
jj_lasttokens[jj_endpos++] = kind;
} else if (jj_endpos != 0) {
jj_expentry = new int[jj_endpos];
for (int i = 0; i < jj_endpos; i++) {
jj_expentry[i] = jj_lasttokens[i];
}
jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
int[] oldentry = (int[])(it.next());
if (oldentry.length == jj_expentry.length) {
for (int i = 0; i < jj_expentry.length; i++) {
if (oldentry[i] != jj_expentry[i]) {
continue jj_entries_loop;
}
}
jj_expentries.add(jj_expentry);
break jj_entries_loop;
}
}
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
}
}
/** Generate ParseException. */
public ParseException generateParseException() {
jj_expentries.clear();
boolean[] la1tokens = new boolean[31];
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
}
for (int i = 0; i < 14; i++) {
if (jj_la1[i] == jj_gen) {
for (int j = 0; j < 32; j++) {
if ((jj_la1_0[i] & (1<<j)) != 0) {
la1tokens[j] = true;
}
}
}
}
for (int i = 0; i < 31; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
jj_expentries.add(jj_expentry);
}
}
jj_endpos = 0;
jj_rescan_token();
jj_add_error_token(0, 0);
int[][] exptokseq = new int[jj_expentries.size()][];
for (int i = 0; i < jj_expentries.size(); i++) {
exptokseq[i] = jj_expentries.get(i);
}
return new ParseException(token, exptokseq, tokenImage);
}
/** Enable tracing. */
final public void enable_tracing() {
}
/** Disable tracing. */
final public void disable_tracing() {
}
private void jj_rescan_token() {
jj_rescan = true;
for (int i = 0; i < 2; i++) {
try {
JJCalls p = jj_2_rtns[i];
do {
if (p.gen > jj_gen) {
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
switch (i) {
case 0: jj_3_1(); break;
case 1: jj_3_2(); break;
}
}
p = p.next;
} while (p != null);
} catch(LookaheadSuccess ls) { }
}
jj_rescan = false;
}
private void jj_save(int index, int xla) {
JJCalls p = jj_2_rtns[index];
while (p.gen > jj_gen) {
if (p.next == null) { p = p.next = new JJCalls(); break; }
p = p.next;
}
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
}
static final class JJCalls {
int gen;
Token first;
int arg;
JJCalls next;
}
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
// Token t;
// do {
// t = getNextToken();
// } while (t.kind != TagEnd);
// }
}

View File

@ -0,0 +1,392 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// HTMLParser.jj
options {
STATIC = false;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_LOOKAHEAD = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(HTMLParser)
package org.apache.lucene.demo.html;
import java.io.*;
import java.util.Properties;
public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
Properties metaTags=new Properties();
String currentMetaTag=null;
String currentMetaContent=null;
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
Reader pipeIn = null;
Writer pipeOut;
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
private class MyPipedInputStream extends PipedInputStream{
public MyPipedInputStream(){
super();
}
public MyPipedInputStream(PipedOutputStream src) throws IOException{
super(src);
}
public boolean full() throws IOException{
return this.available() >= PipedInputStream.PIPE_SIZE;
}
}
/**
* @deprecated Use HTMLParser(FileInputStream) instead
*/
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return title.toString().trim();
}
public Properties getMetaTags() throws IOException,
InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return metaTags;
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit) || sum.equals(""))
return tit;
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeInStream = new MyPipedInputStream();
pipeOutStream = new PipedOutputStream(pipeInStream);
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inStyle)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !(title.length() == 0)) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addMetaTag() {
metaTags.setProperty(currentMetaTag, currentMetaContent);
currentMetaTag = null;
currentMetaContent = null;
return;
}
void addSpace() throws IOException {
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
// Token t;
// do {
// t = getNextToken();
// } while (t.kind != TagEnd);
// }
}
PARSER_END(HTMLParser)
void HTMLDocument() throws IOException :
{
Token t;
}
{
// try {
( Tag() { afterTag = true; }
| t=Decl() { afterTag = true; }
| CommentTag() { afterTag = true; }
| ScriptTag() { afterTag = true; }
| t=<Word> { addText(t.image); afterTag = false; }
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
| <Space> { addSpace(); afterTag = false; }
)* <EOF>
// } catch (ParseException e) {
// handleException(e);
// }
}
void Tag() throws IOException :
{
Token t1, t2;
boolean inImg = false;
}
{
t1=<TagName> {
String tagName = t1.image.toLowerCase();
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
}
(t1=<ArgName>
(<ArgEquals>
(t2=ArgValue() // save ALT text in IMG tag
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
if(inMetaTag &&
( t1.image.equalsIgnoreCase("name") ||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase();
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase();
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
}
)?
)?
)*
<TagEnd>
}
Token ArgValue() :
{
Token t = null;
}
{
t=<ArgValue> { return t; }
| LOOKAHEAD(2)
<ArgQuote1> <CloseQuote1> { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
| LOOKAHEAD(2)
<ArgQuote2> <CloseQuote2> { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
}
Token Decl() :
{
Token t;
}
{
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
{ return t; }
}
void CommentTag() :
{}
{
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
void ScriptTag() :
{}
{
<ScriptStart> ( <ScriptText> )* <ScriptEnd>
}
TOKEN :
{
< ScriptStart: "<script" > : WithinScript
| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < Comment1: "<!--" > : WithinComment1
| < Comment2: "<!" > : WithinComment2
| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM: ["0"-"9"] >
| < #HEX: ["0"-"9","A"-"F","a"-"f"] >
| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
| < Space: (<SP>)+ >
| < #SP: [" ","\t","\r","\n"] >
| < Punct: ~[] > // Keep this last. It is a catch-all.
}
<WithinScript> TOKEN:
{
< ScriptText: (~["<",">"])+ | "<" | ">" >
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
}
<WithinTag> TOKEN:
{
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" > : AfterEquals
| < TagEnd: ">" | "=>" > : DEFAULT
}
<AfterEquals> TOKEN:
{
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n",">"])* > : WithinTag
}
<WithinTag, AfterEquals> TOKEN:
{
< ArgQuote1: "'" > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}
<WithinTag, AfterEquals> SKIP:
{
< <Space> >
}
<WithinQuote1> TOKEN:
{
< Quote1Text: (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}
<WithinQuote2> TOKEN:
{
< Quote2Text: (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}
<WithinComment1> TOKEN :
{
< CommentText1: (~["-"])+ | "-" >
| < CommentEnd1: "-->" > : DEFAULT
}
<WithinComment2> TOKEN :
{
< CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}

View File

@ -0,0 +1,124 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */
package org.apache.lucene.demo.html;
/**
* Token literal values and constants.
* Generated by org.javacc.parser.OtherFilesGen#start()
*/
public interface HTMLParserConstants {
/** End of File. */
int EOF = 0;
/** RegularExpression Id. */
int ScriptStart = 1;
/** RegularExpression Id. */
int TagName = 2;
/** RegularExpression Id. */
int DeclName = 3;
/** RegularExpression Id. */
int Comment1 = 4;
/** RegularExpression Id. */
int Comment2 = 5;
/** RegularExpression Id. */
int Word = 6;
/** RegularExpression Id. */
int LET = 7;
/** RegularExpression Id. */
int NUM = 8;
/** RegularExpression Id. */
int HEX = 9;
/** RegularExpression Id. */
int Entity = 10;
/** RegularExpression Id. */
int Space = 11;
/** RegularExpression Id. */
int SP = 12;
/** RegularExpression Id. */
int Punct = 13;
/** RegularExpression Id. */
int ScriptText = 14;
/** RegularExpression Id. */
int ScriptEnd = 15;
/** RegularExpression Id. */
int ArgName = 16;
/** RegularExpression Id. */
int ArgEquals = 17;
/** RegularExpression Id. */
int TagEnd = 18;
/** RegularExpression Id. */
int ArgValue = 19;
/** RegularExpression Id. */
int ArgQuote1 = 20;
/** RegularExpression Id. */
int ArgQuote2 = 21;
/** RegularExpression Id. */
int Quote1Text = 23;
/** RegularExpression Id. */
int CloseQuote1 = 24;
/** RegularExpression Id. */
int Quote2Text = 25;
/** RegularExpression Id. */
int CloseQuote2 = 26;
/** RegularExpression Id. */
int CommentText1 = 27;
/** RegularExpression Id. */
int CommentEnd1 = 28;
/** RegularExpression Id. */
int CommentText2 = 29;
/** RegularExpression Id. */
int CommentEnd2 = 30;
/** Lexical state. */
int DEFAULT = 0;
/** Lexical state. */
int WithinScript = 1;
/** Lexical state. */
int WithinTag = 2;
/** Lexical state. */
int AfterEquals = 3;
/** Lexical state. */
int WithinQuote1 = 4;
/** Lexical state. */
int WithinQuote2 = 5;
/** Lexical state. */
int WithinComment1 = 6;
/** Lexical state. */
int WithinComment2 = 7;
/** Literal token values. */
String[] tokenImage = {
"<EOF>",
"\"<script\"",
"<TagName>",
"<DeclName>",
"\"<!--\"",
"\"<!\"",
"<Word>",
"<LET>",
"<NUM>",
"<HEX>",
"<Entity>",
"<Space>",
"<SP>",
"<Punct>",
"<ScriptText>",
"<ScriptEnd>",
"<ArgName>",
"\"=\"",
"<TagEnd>",
"<ArgValue>",
"\"\\\'\"",
"\"\\\"\"",
"<token of kind 22>",
"<Quote1Text>",
"<CloseQuote1>",
"<Quote2Text>",
"<CloseQuote2>",
"<CommentText1>",
"\"-->\"",
"<CommentText2>",
"\">\"",
};
}

File diff suppressed because it is too large.

View File

@ -0,0 +1,198 @@
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
/* JavaCCOptions:KEEP_LINE_COL=null */
package org.apache.lucene.demo.html;
/**
* This exception is thrown when parse errors are encountered.
* You can explicitly create objects of this exception type by
* calling the method generateParseException in the generated
* parser.
*
* You can modify this class to customize your error reporting
* mechanisms so long as you retain the public fields.
*/
public class ParseException extends Exception {
/**
* This constructor is used by the method "generateParseException"
* in the generated parser. Calling this constructor generates
* a new object of this type with the fields "currentToken",
* "expectedTokenSequences", and "tokenImage" set. The boolean
* flag "specialConstructor" is also set to true to indicate that
* this constructor was used to create this object.
* This constructor calls its super class with the empty string
* to force the "toString" method of parent class "Throwable" to
* print the error message in the form:
* ParseException: <result of getMessage>
*/
public ParseException(Token currentTokenVal,
int[][] expectedTokenSequencesVal,
String[] tokenImageVal
)
{
super("");
specialConstructor = true;
currentToken = currentTokenVal;
expectedTokenSequences = expectedTokenSequencesVal;
tokenImage = tokenImageVal;
}
/**
* The following constructors are for use by you for whatever
* purpose you can think of. Constructing the exception in this
* manner makes the exception behave in the normal way - i.e., as
* documented in the class "Throwable". The fields "errorToken",
* "expectedTokenSequences", and "tokenImage" do not contain
* relevant information. The JavaCC generated code does not use
* these constructors.
*/
public ParseException() {
super();
specialConstructor = false;
}
/** Constructor with message. */
public ParseException(String message) {
super(message);
specialConstructor = false;
}
/**
* This variable determines which constructor was used to create
* this object and thereby affects the semantics of the
* "getMessage" method (see below).
*/
protected boolean specialConstructor;
/**
* This is the last token that has been consumed successfully. If
* this object has been created due to a parse error, the token
 * following this token will (therefore) be the first error token.
*/
public Token currentToken;
/**
* Each entry in this array is an array of integers. Each array
* of integers represents a sequence of tokens (by their ordinal
* values) that is expected at this point of the parse.
*/
public int[][] expectedTokenSequences;
/**
* This is a reference to the "tokenImage" array of the generated
* parser within which the parse error occurred. This array is
* defined in the generated ...Constants interface.
*/
public String[] tokenImage;
/**
* This method has the standard behavior when this object has been
* created using the standard constructors. Otherwise, it uses
* "currentToken" and "expectedTokenSequences" to generate a parse
* error message and returns it. If this object has been created
* due to a parse error, and you do not catch it (it gets thrown
* from the parser), then this method is called during the printing
* of the final stack trace, and hence the correct error message
* gets displayed.
*/
public String getMessage() {
if (!specialConstructor) {
return super.getMessage();
}
StringBuffer expected = new StringBuffer();
int maxSize = 0;
for (int i = 0; i < expectedTokenSequences.length; i++) {
if (maxSize < expectedTokenSequences[i].length) {
maxSize = expectedTokenSequences[i].length;
}
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
}
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
expected.append("...");
}
expected.append(eol).append(" ");
}
String retval = "Encountered \"";
Token tok = currentToken.next;
for (int i = 0; i < maxSize; i++) {
if (i != 0) retval += " ";
if (tok.kind == 0) {
retval += tokenImage[0];
break;
}
retval += " " + tokenImage[tok.kind];
retval += " \"";
retval += add_escapes(tok.image);
retval += " \"";
tok = tok.next;
}
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
retval += "." + eol;
if (expectedTokenSequences.length == 1) {
retval += "Was expecting:" + eol + " ";
} else {
retval += "Was expecting one of:" + eol + " ";
}
retval += expected.toString();
return retval;
}
/**
* The end of line string for this machine.
*/
protected String eol = System.getProperty("line.separator", "\n");
/**
 * Used to convert raw characters to their escaped versions
 * when the raw versions cannot be used as part of an ASCII
* string literal.
*/
protected String add_escapes(String str) {
StringBuffer retval = new StringBuffer();
char ch;
for (int i = 0; i < str.length(); i++) {
switch (str.charAt(i))
{
case 0 :
continue;
case '\b':
retval.append("\\b");
continue;
case '\t':
retval.append("\\t");
continue;
case '\n':
retval.append("\\n");
continue;
case '\f':
retval.append("\\f");
continue;
case '\r':
retval.append("\\r");
continue;
case '\"':
retval.append("\\\"");
continue;
case '\'':
retval.append("\\\'");
continue;
case '\\':
retval.append("\\\\");
continue;
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}
continue;
}
}
return retval.toString();
}
}
/* JavaCC - OriginalChecksum=63b2008c66e199b79536447c26bee2ab (do not edit this line) */

View File

@ -0,0 +1,50 @@
package org.apache.lucene.demo.html;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
class ParserThread extends Thread {
HTMLParser parser;
ParserThread(HTMLParser p) {
parser = p;
}
@Override
public void run() { // convert pipeOut to pipeIn
try {
try { // parse document to pipeOut
parser.HTMLDocument();
} catch (ParseException e) {
System.out.println("Parse Aborted: " + e.getMessage());
} catch (TokenMgrError e) {
System.out.println("Parse Aborted: " + e.getMessage());
} finally {
parser.pipeOut.close();
synchronized (parser) {
parser.summary.setLength(HTMLParser.SUMMARY_LENGTH);
parser.titleComplete = true;
parser.notifyAll();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}

View File

@ -0,0 +1,472 @@
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.1 */
/* JavaCCOptions:STATIC=false */
package org.apache.lucene.demo.html;
/**
* An implementation of interface CharStream, where the stream is assumed to
* contain only ASCII characters (without unicode processing).
*/
public class SimpleCharStream
{
/** Whether parser is static. */
public static final boolean staticFlag = false;
int bufsize;
int available;
int tokenBegin;
/** Position in buffer. */
public int bufpos = -1;
protected int bufline[];
protected int bufcolumn[];
protected int column = 0;
protected int line = 1;
protected boolean prevCharIsCR = false;
protected boolean prevCharIsLF = false;
protected java.io.Reader inputStream;
protected char[] buffer;
protected int maxNextCharInd = 0;
protected int inBuf = 0;
protected int tabSize = 8;
protected void setTabSize(int i) { tabSize = i; }
protected int getTabSize(int i) { return tabSize; }
protected void ExpandBuff(boolean wrapAround)
{
char[] newbuffer = new char[bufsize + 2048];
int newbufline[] = new int[bufsize + 2048];
int newbufcolumn[] = new int[bufsize + 2048];
try
{
if (wrapAround)
{
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
System.arraycopy(buffer, 0, newbuffer,
bufsize - tokenBegin, bufpos);
buffer = newbuffer;
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
System.arraycopy(bufline, 0, newbufline, bufsize - tokenBegin, bufpos);
bufline = newbufline;
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
System.arraycopy(bufcolumn, 0, newbufcolumn, bufsize - tokenBegin, bufpos);
bufcolumn = newbufcolumn;
maxNextCharInd = (bufpos += (bufsize - tokenBegin));
}
else
{
System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
buffer = newbuffer;
System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
bufline = newbufline;
System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
bufcolumn = newbufcolumn;
maxNextCharInd = (bufpos -= tokenBegin);
}
}
catch (Throwable t)
{
throw new Error(t.getMessage());
}
bufsize += 2048;
available = bufsize;
tokenBegin = 0;
}
protected void FillBuff() throws java.io.IOException
{
if (maxNextCharInd == available)
{
if (available == bufsize)
{
if (tokenBegin > 2048)
{
bufpos = maxNextCharInd = 0;
available = tokenBegin;
}
else if (tokenBegin < 0)
bufpos = maxNextCharInd = 0;
else
ExpandBuff(false);
}
else if (available > tokenBegin)
available = bufsize;
else if ((tokenBegin - available) < 2048)
ExpandBuff(true);
else
available = tokenBegin;
}
int i;
try {
if ((i = inputStream.read(buffer, maxNextCharInd,
available - maxNextCharInd)) == -1)
{
inputStream.close();
throw new java.io.IOException();
}
else
maxNextCharInd += i;
return;
}
catch(java.io.IOException e) {
--bufpos;
backup(0);
if (tokenBegin == -1)
tokenBegin = bufpos;
throw e;
}
}
/** Start. */
public char BeginToken() throws java.io.IOException
{
tokenBegin = -1;
char c = readChar();
tokenBegin = bufpos;
return c;
}
protected void UpdateLineColumn(char c)
{
column++;
if (prevCharIsLF)
{
prevCharIsLF = false;
line += (column = 1);
}
else if (prevCharIsCR)
{
prevCharIsCR = false;
if (c == '\n')
{
prevCharIsLF = true;
}
else
line += (column = 1);
}
switch (c)
{
case '\r' :
prevCharIsCR = true;
break;
case '\n' :
prevCharIsLF = true;
break;
case '\t' :
column--;
column += (tabSize - (column % tabSize));
break;
default :
break;
}
bufline[bufpos] = line;
bufcolumn[bufpos] = column;
}
/** Read a character. */
public char readChar() throws java.io.IOException
{
if (inBuf > 0)
{
--inBuf;
if (++bufpos == bufsize)
bufpos = 0;
return buffer[bufpos];
}
if (++bufpos >= maxNextCharInd)
FillBuff();
char c = buffer[bufpos];
UpdateLineColumn(c);
return c;
}
/**
* @deprecated
* @see #getEndColumn
*/
public int getColumn() {
return bufcolumn[bufpos];
}
/**
* @deprecated
* @see #getEndLine
*/
public int getLine() {
return bufline[bufpos];
}
/** Get token end column number. */
public int getEndColumn() {
return bufcolumn[bufpos];
}
/** Get token end line number. */
public int getEndLine() {
return bufline[bufpos];
}
/** Get token beginning column number. */
public int getBeginColumn() {
return bufcolumn[tokenBegin];
}
/** Get token beginning line number. */
public int getBeginLine() {
return bufline[tokenBegin];
}
/** Backup a number of characters. */
public void backup(int amount) {
inBuf += amount;
if ((bufpos -= amount) < 0)
bufpos += bufsize;
}
/** Constructor. */
public SimpleCharStream(java.io.Reader dstream, int startline,
int startcolumn, int buffersize)
{
inputStream = dstream;
line = startline;
column = startcolumn - 1;
available = bufsize = buffersize;
buffer = new char[buffersize];
bufline = new int[buffersize];
bufcolumn = new int[buffersize];
}
/** Constructor. */
public SimpleCharStream(java.io.Reader dstream, int startline,
int startcolumn)
{
this(dstream, startline, startcolumn, 4096);
}
/** Constructor. */
public SimpleCharStream(java.io.Reader dstream)
{
this(dstream, 1, 1, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.Reader dstream, int startline,
int startcolumn, int buffersize)
{
inputStream = dstream;
line = startline;
column = startcolumn - 1;
if (buffer == null || buffersize != buffer.length)
{
available = bufsize = buffersize;
buffer = new char[buffersize];
bufline = new int[buffersize];
bufcolumn = new int[buffersize];
}
prevCharIsLF = prevCharIsCR = false;
tokenBegin = inBuf = maxNextCharInd = 0;
bufpos = -1;
}
/** Reinitialise. */
public void ReInit(java.io.Reader dstream, int startline,
int startcolumn)
{
ReInit(dstream, startline, startcolumn, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.Reader dstream)
{
ReInit(dstream, 1, 1, 4096);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
{
this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream, int startline,
int startcolumn, int buffersize)
{
this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
int startcolumn) throws java.io.UnsupportedEncodingException
{
this(dstream, encoding, startline, startcolumn, 4096);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream, int startline,
int startcolumn)
{
this(dstream, startline, startcolumn, 4096);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
{
this(dstream, encoding, 1, 1, 4096);
}
/** Constructor. */
public SimpleCharStream(java.io.InputStream dstream)
{
this(dstream, 1, 1, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
{
ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream, int startline,
int startcolumn, int buffersize)
{
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
{
ReInit(dstream, encoding, 1, 1, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream)
{
ReInit(dstream, 1, 1, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
int startcolumn) throws java.io.UnsupportedEncodingException
{
ReInit(dstream, encoding, startline, startcolumn, 4096);
}
/** Reinitialise. */
public void ReInit(java.io.InputStream dstream, int startline,
int startcolumn)
{
ReInit(dstream, startline, startcolumn, 4096);
}
/** Get token literal value. */
public String GetImage()
{
if (bufpos >= tokenBegin)
return new String(buffer, tokenBegin, bufpos - tokenBegin + 1);
else
return new String(buffer, tokenBegin, bufsize - tokenBegin) +
new String(buffer, 0, bufpos + 1);
}
/** Get the suffix. */
public char[] GetSuffix(int len)
{
char[] ret = new char[len];
if ((bufpos + 1) >= len)
System.arraycopy(buffer, bufpos - len + 1, ret, 0, len);
else
{
System.arraycopy(buffer, bufsize - (len - bufpos - 1), ret, 0,
len - bufpos - 1);
System.arraycopy(buffer, 0, ret, len - bufpos - 1, bufpos + 1);
}
return ret;
}
/** Reset buffer when finished. */
public void Done()
{
buffer = null;
bufline = null;
bufcolumn = null;
}
/**
* Method to adjust line and column numbers for the start of a token.
*/
public void adjustBeginLineColumn(int newLine, int newCol)
{
int start = tokenBegin;
int len;
if (bufpos >= tokenBegin)
{
len = bufpos - tokenBegin + inBuf + 1;
}
else
{
len = bufsize - tokenBegin + bufpos + 1 + inBuf;
}
int i = 0, j = 0, k = 0;
int nextColDiff = 0, columnDiff = 0;
while (i < len &&
bufline[j = start % bufsize] == bufline[k = ++start % bufsize])
{
bufline[j] = newLine;
nextColDiff = columnDiff + bufcolumn[k] - bufcolumn[j];
bufcolumn[j] = newCol + columnDiff;
columnDiff = nextColDiff;
i++;
}
if (i < len)
{
bufline[j] = newLine++;
bufcolumn[j] = newCol + columnDiff;
while (i++ < len)
{
if (bufline[j = start % bufsize] != bufline[++start % bufsize])
bufline[j] = newLine++;
else
bufline[j] = newLine;
}
}
line = bufline[j];
column = bufcolumn[j];
}
}
/* JavaCC - OriginalChecksum=7393ed4ac2709e2de22d164f9db78b65 (do not edit this line) */

View File

@ -0,0 +1,64 @@
package org.apache.lucene.demo.html;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class Tags {
/**
 * Contains all tags for which whitespace has to be inserted for proper tokenization.
*/
public static final Set<String> WS_ELEMS = Collections.synchronizedSet(new HashSet<String>());
static{
WS_ELEMS.add("<hr");
WS_ELEMS.add("<hr/"); // note that "<hr />" does not need to be listed explicitly
WS_ELEMS.add("<br");
WS_ELEMS.add("<br/");
WS_ELEMS.add("<p");
WS_ELEMS.add("</p");
WS_ELEMS.add("<div");
WS_ELEMS.add("</div");
WS_ELEMS.add("<td");
WS_ELEMS.add("</td");
WS_ELEMS.add("<li");
WS_ELEMS.add("</li");
WS_ELEMS.add("<q");
WS_ELEMS.add("</q");
WS_ELEMS.add("<blockquote");
WS_ELEMS.add("</blockquote");
WS_ELEMS.add("<dt");
WS_ELEMS.add("</dt");
WS_ELEMS.add("<h1");
WS_ELEMS.add("</h1");
WS_ELEMS.add("<h2");
WS_ELEMS.add("</h2");
WS_ELEMS.add("<h3");
WS_ELEMS.add("</h3");
WS_ELEMS.add("<h4");
WS_ELEMS.add("</h4");
WS_ELEMS.add("<h5");
WS_ELEMS.add("</h5");
WS_ELEMS.add("<h6");
WS_ELEMS.add("</h6");
}
}
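
The WS_ELEMS set above drives whitespace insertion in the grammar's Tag() production, which looks up the lowercased tag name. A minimal sketch (not part of this commit) of that membership test:

import org.apache.lucene.demo.html.Tags;

public class TagsCheck {
  public static void main(String[] args) {
    // "<br" is what t1.image.toLowerCase() yields for "<BR>"; the TagName token
    // never includes the closing '>'.
    String tagName = "<br";
    // prints true: the parser then calls addSpace(), so "a<BR>b" tokenizes as two words
    System.out.println(Tags.WS_ELEMS.contains(tagName));
  }
}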

View File

@ -0,0 +1,51 @@
package org.apache.lucene.demo.html;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
class Test {
public static void main(String[] argv) throws IOException, InterruptedException {
if ("-dir".equals(argv[0])) {
String[] files = new File(argv[1]).list();
java.util.Arrays.sort(files);
for (int i = 0; i < files.length; i++) {
System.err.println(files[i]);
File file = new File(argv[1], files[i]);
parse(file);
}
} else
parse(new File(argv[0]));
}
public static void parse(File file) throws IOException, InterruptedException {
FileInputStream fis = null;
try {
fis = new FileInputStream(file);
HTMLParser parser = new HTMLParser(fis);
System.out.println("Title: " + Entities.encode(parser.getTitle()));
System.out.println("Summary: " + Entities.encode(parser.getSummary()));
System.out.println("Content:");
LineNumberReader reader = new LineNumberReader(parser.getReader());
for (String l = reader.readLine(); l != null; l = reader.readLine())
System.out.println(l);
} finally {
if (fis != null) fis.close();
}
}
}

View File

@ -0,0 +1,124 @@
/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
/* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null */
package org.apache.lucene.demo.html;
/**
* Describes the input token stream.
*/
public class Token {
/**
* An integer that describes the kind of this token. This numbering
* system is determined by JavaCCParser, and a table of these numbers is
* stored in the file ...Constants.java.
*/
public int kind;
/** The line number of the first character of this Token. */
public int beginLine;
/** The column number of the first character of this Token. */
public int beginColumn;
/** The line number of the last character of this Token. */
public int endLine;
/** The column number of the last character of this Token. */
public int endColumn;
/**
* The string image of the token.
*/
public String image;
/**
* A reference to the next regular (non-special) token from the input
* stream. If this is the last token from the input stream, or if the
* token manager has not read tokens beyond this one, this field is
* set to null. This is true only if this token is also a regular
* token. Otherwise, see below for a description of the contents of
* this field.
*/
public Token next;
/**
* This field is used to access special tokens that occur prior to this
* token, but after the immediately preceding regular (non-special) token.
* If there are no such special tokens, this field is set to null.
* When there are more than one such special token, this field refers
* to the last of these special tokens, which in turn refers to the next
* previous special token through its specialToken field, and so on
* until the first special token (whose specialToken field is null).
* The next fields of special tokens refer to other special tokens that
* immediately follow it (without an intervening regular token). If there
* is no such token, this field is null.
*/
public Token specialToken;
/**
* An optional attribute value of the Token.
* Tokens which are not used as syntactic sugar will often contain
* meaningful values that will be used later on by the compiler or
* interpreter. This attribute value is often different from the image.
* Any subclass of Token that actually wants to return a non-null value can
* override this method as appropriate.
*/
public Object getValue() {
return null;
}
/**
* No-argument constructor
*/
public Token() {}
/**
* Constructs a new token for the specified Image.
*/
public Token(int kind)
{
this(kind, null);
}
/**
* Constructs a new token for the specified Image and Kind.
*/
public Token(int kind, String image)
{
this.kind = kind;
this.image = image;
}
/**
* Returns the image.
*/
public String toString()
{
return image;
}
/**
* Returns a new Token object, by default. However, if you want, you
* can create and return subclass objects based on the value of ofKind.
* Simply add the cases to the switch for all those special cases.
* For example, if you have a subclass of Token called IDToken that
* you want to create if ofKind is ID, simply add something like :
*
* case MyParserConstants.ID : return new IDToken(ofKind, image);
*
 * to the following switch statement. Then you can cast the matchedToken
 * variable to the appropriate type and use it in your lexical actions.
*/
public static Token newToken(int ofKind, String image)
{
switch(ofKind)
{
default : return new Token(ofKind, image);
}
}
public static Token newToken(int ofKind)
{
return newToken(ofKind, null);
}
}
/* JavaCC - OriginalChecksum=7bf8bdbb1c45bccd8162cdd48316d5e0 (do not edit this line) */

View File

@ -0,0 +1,141 @@
/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 4.1 */
/* JavaCCOptions: */
package org.apache.lucene.demo.html;
/** Token Manager Error. */
@SuppressWarnings("serial")
public class TokenMgrError extends Error
{
/*
* Ordinals for various reasons why an Error of this type can be thrown.
*/
/**
* Lexical error occurred.
*/
static final int LEXICAL_ERROR = 0;
/**
* An attempt was made to create a second instance of a static token manager.
*/
static final int STATIC_LEXER_ERROR = 1;
/**
* Tried to change to an invalid lexical state.
*/
static final int INVALID_LEXICAL_STATE = 2;
/**
* Detected (and bailed out of) an infinite loop in the token manager.
*/
static final int LOOP_DETECTED = 3;
/**
* Indicates the reason why the exception is thrown. It will have
* one of the above 4 values.
*/
int errorCode;
/**
* Replaces unprintable characters by their escaped (or unicode escaped)
* equivalents in the given string
*/
protected static final String addEscapes(String str) {
StringBuffer retval = new StringBuffer();
char ch;
for (int i = 0; i < str.length(); i++) {
switch (str.charAt(i))
{
case 0 :
continue;
case '\b':
retval.append("\\b");
continue;
case '\t':
retval.append("\\t");
continue;
case '\n':
retval.append("\\n");
continue;
case '\f':
retval.append("\\f");
continue;
case '\r':
retval.append("\\r");
continue;
case '\"':
retval.append("\\\"");
continue;
case '\'':
retval.append("\\\'");
continue;
case '\\':
retval.append("\\\\");
continue;
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}
continue;
}
}
return retval.toString();
}
/**
* Returns a detailed message for the Error when it is thrown by the
* token manager to indicate a lexical error.
* Parameters :
* EOFSeen : indicates if EOF caused the lexical error
* curLexState : lexical state in which this error occurred
* errorLine : line number when the error occurred
* errorColumn : column number when the error occurred
* errorAfter : prefix that was seen before this error occurred
* curchar : the offending character
* Note: You can customize the lexical error message by modifying this method.
*/
protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
return("Lexical error at line " +
errorLine + ", column " +
errorColumn + ". Encountered: " +
(EOFSeen ? "<EOF> " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") +
"after : \"" + addEscapes(errorAfter) + "\"");
}
/**
* You can also modify the body of this method to customize your error messages.
* For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
 * of end-users' concern, so you can return something like:
*
* "Internal Error : Please file a bug report .... "
*
* from this method for such cases in the release version of your parser.
*/
public String getMessage() {
return super.getMessage();
}
/*
* Constructors of various flavors follow.
*/
/** No arg constructor. */
public TokenMgrError() {
}
/** Constructor with message and reason. */
public TokenMgrError(String message, int reason) {
super(message);
errorCode = reason;
}
/** Full Constructor. */
public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
}
}
/* JavaCC - OriginalChecksum=5ffb7e46d5ae93d8d59e6f4ae7eb36d1 (do not edit this line) */

View File

@ -0,0 +1,29 @@
package org.apache.lucene;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Lucene's package information, including version. **/
public final class LucenePackage {
private LucenePackage() {} // can't construct
/** Return Lucene's package, including version information. */
public static Package get() {
return LucenePackage.class.getPackage();
}
}
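
The Package returned by get() exposes whatever version metadata the jar manifest recorded. A small sketch (not part of this commit), using only the standard java.lang.Package API:

public class ShowLuceneVersion {
  public static void main(String[] args) {
    // getImplementationVersion() reads the Implementation-Version manifest entry;
    // it returns null when the classes are not loaded from a packaged jar.
    System.out.println(org.apache.lucene.LucenePackage.get().getImplementationVersion());
  }
}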

File diff suppressed because it is too large.

View File

@ -0,0 +1,144 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Method;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.document.Fieldable;
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* Typical implementations first build a Tokenizer, which breaks the stream of
* characters from the Reader into raw Tokens. One or more TokenFilters may
* then be applied to the output of the Tokenizer.
*/
public abstract class Analyzer implements Closeable {
/** Creates a TokenStream which tokenizes all the text in the provided
* Reader. Must be able to handle null field name for
* backward compatibility.
*/
public abstract TokenStream tokenStream(String fieldName, Reader reader);
/** Creates a TokenStream that is allowed to be re-used
* from the previous time that the same thread called
* this method. Callers that do not need to use more
* than one TokenStream at the same time from this
* analyzer should use this method for better
* performance.
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
return tokenStream(fieldName, reader);
}
private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
/** Used by Analyzers that implement reusableTokenStream
* to retrieve previously saved TokenStreams for re-use
* by the same thread. */
protected Object getPreviousTokenStream() {
try {
return tokenStreams.get();
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/** Used by Analyzers that implement reusableTokenStream
* to save a TokenStream for later re-use by the same
* thread. */
protected void setPreviousTokenStream(Object obj) {
try {
tokenStreams.set(obj);
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/** @deprecated */
protected boolean overridesTokenStreamMethod = false;
/** @deprecated This is only present to preserve
* back-compat of classes that subclass a core analyzer
* and override tokenStream but not reusableTokenStream */
protected void setOverridesTokenStreamMethod(Class<? extends Analyzer> baseClass) {
try {
Method m = this.getClass().getMethod("tokenStream", String.class, Reader.class);
overridesTokenStreamMethod = m.getDeclaringClass() != baseClass;
} catch (NoSuchMethodException nsme) {
// cannot happen, as baseClass is subclass of Analyzer through generics
overridesTokenStreamMethod = false;
}
}
/**
* Invoked before indexing a Fieldable instance if
* terms have already been added to that field. This allows custom
* analyzers to place an automatic position increment gap between
* Fieldable instances using the same field name. The default value
* position increment gap is 0. With a 0 position increment gap and
* the typical default token position increment of 1, all terms in a field,
* including across Fieldable instances, are in successive positions, allowing
* exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
*
* @param fieldName Fieldable name being indexed.
* @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getPositionIncrementGap(String fieldName) {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for
* Token offsets instead. By default this returns 1 for
 * tokenized fields, as if the fields were joined
 * with an extra space character, and 0 for un-tokenized
* fields. This method is only called if the field
* produced at least one token for indexing.
*
* @param field the field just indexed
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getOffsetGap(Fieldable field) {
if (field.isTokenized())
return 1;
else
return 0;
}
/** Frees persistent resources used by this Analyzer */
public void close() {
tokenStreams.close();
tokenStreams = null;
}
}
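
The class javadoc above describes the typical composition: a Tokenizer producing raw tokens, wrapped by one or more TokenFilters. As an illustration only (not part of this commit), a minimal Analyzer following that pattern could look like the sketch below; it assumes the core WhitespaceTokenizer and LowerCaseFilter classes from this same package.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class LowercaseWhitespaceAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // break the character stream into whitespace-delimited tokens, then lowercase them
    return new LowerCaseFilter(new WhitespaceTokenizer(reader));
  }
}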

View File

@ -0,0 +1,93 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.util.ArrayList;
import java.util.List;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
* {@link #addOffCorrectMap}, and then invoke the correct
* method to correct an offset.
*
* <p><b>NOTE</b>: This class is not particularly efficient.
 * For example, a new OffCorrectMap instance is created for every
 * call to {@link #addOffCorrectMap} and appended
 * to a private list.
*/
public abstract class BaseCharFilter extends CharFilter {
private List<OffCorrectMap> pcmList;
public BaseCharFilter(CharStream in) {
super(in);
}
/** Retrieve the corrected offset. Note that this method
 * is slow if you correct positions far before the most
* recently added position, as it's a simple linear
* search backwards through all offset corrections added
* by {@link #addOffCorrectMap}. */
@Override
protected int correct(int currentOff) {
if (pcmList == null || pcmList.isEmpty()) {
return currentOff;
}
for (int i = pcmList.size() - 1; i >= 0; i--) {
if (currentOff >= pcmList.get(i).off) {
return currentOff + pcmList.get(i).cumulativeDiff;
}
}
return currentOff;
}
protected int getLastCumulativeDiff() {
return pcmList == null || pcmList.isEmpty() ?
0 : pcmList.get(pcmList.size() - 1).cumulativeDiff;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (pcmList == null) {
pcmList = new ArrayList<OffCorrectMap>();
}
pcmList.add(new OffCorrectMap(off, cumulativeDiff));
}
static class OffCorrectMap {
int off;
int cumulativeDiff;
OffCorrectMap(int off, int cumulativeDiff) {
this.off = off;
this.cumulativeDiff = cumulativeDiff;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('(');
sb.append(off);
sb.append(',');
sb.append(cumulativeDiff);
sb.append(')');
return sb.toString();
}
}
}
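
As a sketch only (not part of this commit): a subclass records cumulative offset shifts with addOffCorrectMap, and correct() maps an offset in the filtered text back toward the original input. The subclass below assumes the CharStream/CharFilter API from this package handles the actual character reading; treat that as an assumption, not a definitive usage.

import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;

// Hypothetical filter that (conceptually) dropped 3 characters at offset 10 of the
// original text and records the shift so later offsets can be corrected.
public class DroppedSpanCharFilter extends BaseCharFilter {
  public DroppedSpanCharFilter(CharStream in) {
    super(in);
    // offsets >= 10 in the filtered text are 3 characters later in the original
    addOffCorrectMap(10, 3);
  }
}

Given the correct() implementation above, correct(12) would return 15, so a token starting at filtered offset 12 is reported at original offset 15.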

View File

@ -0,0 +1,86 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.util.AttributeSource;
/**
* This class can be used if the token attributes of a TokenStream
* are intended to be consumed more than once. It caches
* all token attribute states locally in a List.
*
* <P>CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
*/
public final class CachingTokenFilter extends TokenFilter {
private List<AttributeSource.State> cache = null;
private Iterator<AttributeSource.State> iterator = null;
private AttributeSource.State finalState;
public CachingTokenFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
if (cache == null) {
// fill cache lazily
cache = new LinkedList<AttributeSource.State>();
fillCache();
iterator = cache.iterator();
}
if (!iterator.hasNext()) {
// the cache is exhausted, return false
return false;
}
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
restoreState(iterator.next());
return true;
}
@Override
public final void end() throws IOException {
if (finalState != null) {
restoreState(finalState);
}
}
@Override
public void reset() throws IOException {
if(cache != null) {
iterator = cache.iterator();
}
}
private void fillCache() throws IOException {
while(input.incrementToken()) {
cache.add(captureState());
}
// capture final state
input.end();
finalState = captureState();
}
}
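
A minimal usage sketch (not part of this commit) of consuming the same token stream twice; it assumes WhitespaceTokenizer and TermAttribute from the core analysis packages.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CachingTokenFilterDemo {
  public static void main(String[] args) throws IOException {
    CachingTokenFilter cached =
        new CachingTokenFilter(new WhitespaceTokenizer(new StringReader("foo bar")));
    TermAttribute term = cached.addAttribute(TermAttribute.class);
    while (cached.incrementToken()) { }   // first pass fills the cache
    cached.reset();                       // reposition to the first cached state
    while (cached.incrementToken()) {
      System.out.println(term.term());    // second pass replays "foo", then "bar"
    }
  }
}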

View File

@ -0,0 +1,390 @@
package org.apache.lucene.analysis;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A simple class that stores Strings as char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove items from the
* set, nor does it resize its hash table to be smaller,
* etc. It is designed to be quick to test if a char[]
* is in the set without the necessity of converting it
* to a String first.
* <P>
* <em>Please note:</em> This class implements {@link java.util.Set Set} but
 * does not behave as it should in all cases. The generic type is
 * {@code Set<Object>}, because you can add any object that has a
 * string representation. The add methods will use
 * {@link Object#toString} and store the result using a {@code char[]}
 * buffer. The {@code contains()} methods behave the same way.
 * {@link #iterator()} returns an {@code Iterator<String>}.
 * For type safety, {@link #stringIterator()} is also provided.
*/
public class CharArraySet extends AbstractSet<Object> {
private final static int INIT_SIZE = 8;
private char[][] entries;
private int count;
private final boolean ignoreCase;
public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
/** Create set with enough capacity to hold startSize
* terms */
public CharArraySet(int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
while(startSize + (startSize>>2) > size)
size <<= 1;
entries = new char[size][];
}
/** Create set from a Collection of char[] or String */
public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
this(c.size(), ignoreCase);
addAll(c);
}
/** Create set from entries */
private CharArraySet(char[][] entries, boolean ignoreCase, int count){
this.entries = entries;
this.ignoreCase = ignoreCase;
this.count = count;
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
public boolean contains(char[] text, int off, int len) {
return entries[getSlot(text, off, len)] != null;
}
/** true if the <code>CharSequence</code> is in the set */
public boolean contains(CharSequence cs) {
return entries[getSlot(cs)] != null;
}
private int getSlot(char[] text, int off, int len) {
int code = getHashCode(text, off, len);
int pos = code & (entries.length-1);
char[] text2 = entries[pos];
if (text2 != null && !equals(text, off, len, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (entries.length-1);
text2 = entries[pos];
} while (text2 != null && !equals(text, off, len, text2));
}
return pos;
}
  /** Returns the hash slot for the given CharSequence */
private int getSlot(CharSequence text) {
int code = getHashCode(text);
int pos = code & (entries.length-1);
char[] text2 = entries[pos];
if (text2 != null && !equals(text, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (entries.length-1);
text2 = entries[pos];
} while (text2 != null && !equals(text, text2));
}
return pos;
}
/** Add this CharSequence into the set */
public boolean add(CharSequence text) {
return add(text.toString()); // could be more efficient
}
/** Add this String into the set */
public boolean add(String text) {
return add(text.toCharArray());
}
/** Add this char[] directly to the set.
* If ignoreCase is true for this Set, the text array will be directly modified.
* The user should never modify this text array after calling this method.
*/
public boolean add(char[] text) {
if (ignoreCase)
for(int i=0;i<text.length;i++)
text[i] = Character.toLowerCase(text[i]);
int slot = getSlot(text, 0, text.length);
if (entries[slot] != null) return false;
entries[slot] = text;
count++;
if (count + (count>>2) > entries.length) {
rehash();
}
return true;
}
private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1[off+i] != text2[i])
return false;
}
}
return true;
}
private boolean equals(CharSequence text1, char[] text2) {
int len = text1.length();
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1.charAt(i) != text2[i])
return false;
}
}
return true;
}
private void rehash() {
final int newSize = 2*entries.length;
char[][] oldEntries = entries;
entries = new char[newSize][];
for(int i=0;i<oldEntries.length;i++) {
char[] text = oldEntries[i];
if (text != null) {
// todo: could be faster... no need to compare strings on collision
entries[getSlot(text,0,text.length)] = text;
}
}
}
private int getHashCode(char[] text, int offset, int len) {
int code = 0;
final int stop = offset + len;
if (ignoreCase) {
for (int i=offset; i<stop; i++) {
code = code*31 + Character.toLowerCase(text[i]);
}
} else {
for (int i=offset; i<stop; i++) {
code = code*31 + text[i];
}
}
return code;
}
private int getHashCode(CharSequence text) {
int code = 0;
int len = text.length();
if (ignoreCase) {
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
}
} else {
for (int i=0; i<len; i++) {
code = code*31 + text.charAt(i);
}
}
return code;
}
@Override
public int size() {
return count;
}
@Override
public boolean isEmpty() {
return count==0;
}
@Override
public boolean contains(Object o) {
if (o instanceof char[]) {
final char[] text = (char[])o;
return contains(text, 0, text.length);
}
return contains(o.toString());
}
@Override
public boolean add(Object o) {
if (o instanceof char[]) {
return add((char[])o);
}
return add(o.toString());
}
/**
 * Returns an unmodifiable {@link CharArraySet}. This allows providing
 * unmodifiable views of internal sets for "read-only" use.
*
* @param set
* a set for which the unmodifiable set is returned.
 * @return a new unmodifiable {@link CharArraySet}.
* @throws NullPointerException
* if the given set is <code>null</code>.
*/
public static CharArraySet unmodifiableSet(CharArraySet set) {
if (set == null)
throw new NullPointerException("Given set is null");
if (set == EMPTY_SET)
return EMPTY_SET;
if (set instanceof UnmodifiableCharArraySet)
return set;
/*
 * Instead of delegating calls to the given set, copy the low-level values to
 * the unmodifiable subclass.
*/
return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
}
/**
* Returns a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be preserved.
*
* @param set
* a set to copy
* @return a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be
* preserved.
*/
public static CharArraySet copy(Set<?> set) {
if (set == null)
throw new NullPointerException("Given set is null");
if(set == EMPTY_SET)
return EMPTY_SET;
final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
: false;
return new CharArraySet(set, ignoreCase);
}
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
public class CharArraySetIterator implements Iterator<String> {
int pos=-1;
char[] next;
CharArraySetIterator() {
goNext();
}
private void goNext() {
next = null;
pos++;
while (pos < entries.length && (next=entries[pos]) == null) pos++;
}
public boolean hasNext() {
return next != null;
}
/** do not modify the returned char[] */
public char[] nextCharArray() {
char[] ret = next;
goNext();
return ret;
}
/** Returns the next String, as a Set<String> would...
* use nextCharArray() for better efficiency. */
public String next() {
return new String(nextCharArray());
}
public void remove() {
throw new UnsupportedOperationException();
}
}
  /** returns an iterator of newly allocated Strings */
public Iterator<String> stringIterator() {
return new CharArraySetIterator();
}
  /** returns an iterator of newly allocated Strings; this method violates the Set interface */
@Override
@SuppressWarnings("unchecked")
public Iterator<Object> iterator() {
return (Iterator) stringIterator();
}
/**
* Efficient unmodifiable {@link CharArraySet}. This implementation does not
 * delegate calls to a given {@link CharArraySet} like
 * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead it passes
* the internal representation of a {@link CharArraySet} to a super
* constructor and overrides all mutators.
*/
private static final class UnmodifiableCharArraySet extends CharArraySet {
private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
int count) {
super(entries, ignoreCase, count);
}
@Override
public boolean add(Object o){
throw new UnsupportedOperationException();
}
@Override
public boolean addAll(Collection<? extends Object> coll) {
throw new UnsupportedOperationException();
}
@Override
public boolean add(char[] text) {
throw new UnsupportedOperationException();
}
@Override
public boolean add(CharSequence text) {
throw new UnsupportedOperationException();
}
@Override
public boolean add(String text) {
throw new UnsupportedOperationException();
}
}
}
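// Illustrative usage sketch, not part of the original Lucene file: it shows how a
// CharArraySet can be queried against a char[] region without allocating Strings.
// The class name and sample values are hypothetical.
class CharArraySetUsageExample {
  public static void main(String[] args) {
    CharArraySet stopWords = new CharArraySet(16, /* ignoreCase */ true);
    stopWords.add("The");                 // stored lowercased because ignoreCase is true
    stopWords.add("and".toCharArray());

    char[] buffer = "And then".toCharArray();
    // test the first three chars ("And") directly against the set, no String needed
    System.out.println(stopWords.contains(buffer, 0, 3));  // true
    System.out.println(stopWords.contains("then"));        // false
  }
}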

View File

@ -0,0 +1,82 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
/**
 * Subclasses of CharFilter can be chained to filter a CharStream.
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter/CharStream subclass is used.
*
* @version $Id$
*
*/
public abstract class CharFilter extends CharStream {
protected CharStream input;
protected CharFilter(CharStream in) {
input = in;
}
/**
 * Subclasses may want to override this method to correct the current offset.
*
* @param currentOff current offset
* @return corrected offset
*/
protected int correct(int currentOff) {
return currentOff;
}
/**
* Chains the corrected offset through the input
* CharFilter.
*/
@Override
public final int correctOffset(int currentOff) {
return input.correctOffset(correct(currentOff));
}
@Override
public void close() throws IOException {
input.close();
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len);
}
@Override
public boolean markSupported(){
return input.markSupported();
}
@Override
public void mark( int readAheadLimit ) throws IOException {
input.mark(readAheadLimit);
}
@Override
public void reset() throws IOException {
input.reset();
}
}
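// Illustrative sketch, not part of the original Lucene file: a minimal CharFilter
// subclass that upper-cases every character. It inserts and removes nothing, so the
// inherited identity correct() keeps offsets accurate. The class name is hypothetical.
class UpperCaseCharFilterExample extends CharFilter {
  public UpperCaseCharFilterExample(CharStream in) {
    super(in);
  }
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int n = input.read(cbuf, off, len);
    for (int i = off; i < off + n; i++) {   // n == -1 at EOF, so the loop is skipped
      cbuf[i] = Character.toUpperCase(cbuf[i]);
    }
    return n;
  }
}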

View File

@ -0,0 +1,71 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* CharReader is a Reader wrapper. It reads chars from
 * a Reader and outputs a {@link CharStream}, defining an
 * identity {@link #correctOffset} method that
* simply returns the provided offset.
*/
public final class CharReader extends CharStream {
protected Reader input;
public static CharStream get(Reader input) {
return input instanceof CharStream ?
(CharStream)input : new CharReader(input);
}
private CharReader(Reader in) {
input = in;
}
@Override
public int correctOffset(int currentOff) {
return currentOff;
}
@Override
public void close() throws IOException {
input.close();
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len);
}
@Override
public boolean markSupported(){
return input.markSupported();
}
@Override
public void mark( int readAheadLimit ) throws IOException {
input.mark(readAheadLimit);
}
@Override
public void reset() throws IOException {
input.reset();
}
}
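// Illustrative sketch, not part of the original Lucene file: CharReader.get wraps a
// plain Reader once and returns an existing CharStream unchanged. The class name is
// hypothetical.
class CharReaderUsageExample {
  public static void main(String[] args) {
    CharStream cs = CharReader.get(new java.io.StringReader("some text"));
    // passing a CharStream back in returns the same instance, so nothing is double-wrapped
    System.out.println(CharReader.get(cs) == cs);  // true
  }
}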

View File

@ -0,0 +1,41 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.Reader;
/**
* CharStream adds {@link #correctOffset}
* functionality over {@link Reader}. All Tokenizers accept a
* CharStream instead of {@link Reader} as input, which enables
* arbitrary character based filtering before tokenization.
 * The {@link #correctOffset} method fixes offsets to account for
* removal or insertion of characters, so that the offsets
* reported in the tokens match the character offsets of the
* original Reader.
*/
public abstract class CharStream extends Reader {
/**
* Called by CharFilter(s) and Tokenizer to correct token offset.
*
* @param currentOff offset as seen in the output
* @return corrected offset based on the input
*/
public abstract int correctOffset(int currentOff);
}

View File

@ -0,0 +1,126 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
super(input);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeSource source, Reader input) {
super(source, input);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
* define token boundaries and are not included in tokens. */
protected abstract boolean isTokenChar(char c);
/** Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this
* to, e.g., lowercase tokens. */
protected char normalize(char c) {
return c;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
int length = 0;
int start = bufferIndex;
char[] buffer = termAtt.termBuffer();
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0)
break;
else
return false;
}
bufferIndex = 0;
}
final char c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
start = offset + bufferIndex - 1;
else if (length == buffer.length)
buffer = termAtt.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
termAtt.setTermLength(length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
bufferIndex = 0;
offset = 0;
dataLen = 0;
}
}
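// Illustrative sketch, not part of the original Lucene file: a minimal CharTokenizer
// subclass that keeps only digits, showing how isTokenChar defines token boundaries
// (normalize could additionally be overridden to rewrite each kept character).
// The class name is hypothetical.
class DigitTokenizerExample extends CharTokenizer {
  public DigitTokenizerExample(Reader input) {
    super(input);
  }
  @Override
  protected boolean isTokenChar(char c) {
    return Character.isDigit(c);   // digits form tokens, everything else splits them
  }
}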

View File

@ -0,0 +1,260 @@
package org.apache.lucene.analysis;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A filter that replaces accented characters in the ISO Latin 1 character set
 * (ISO-8859-1) with their unaccented equivalents. The case will not be altered.
* <p>
* For instance, '&agrave;' will be replaced by 'a'.
* <p>
*
* @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
* which covers a superset of Latin 1.
* This class is included for use with existing
* indexes and will be removed in a future release (possibly Lucene 4.0).
*/
public final class ISOLatin1AccentFilter extends TokenFilter {
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
termAtt = addAttribute(TermAttribute.class);
}
private char[] output = new char[256];
private int outputPos;
private TermAttribute termAtt;
@Override
public final boolean incrementToken() throws java.io.IOException {
if (input.incrementToken()) {
final char[] buffer = termAtt.termBuffer();
final int length = termAtt.termLength();
// If no characters actually require rewriting then we
// just return token as-is:
for(int i=0;i<length;i++) {
final char c = buffer[i];
if (c >= '\u00c0' && c <= '\uFB06') {
removeAccents(buffer, length);
termAtt.setTermBuffer(output, 0, outputPos);
break;
}
}
return true;
} else
return false;
}
/**
 * Replaces accented characters in the given char[] with their unaccented equivalents.
*/
public final void removeAccents(char[] input, int length) {
// Worst-case length required:
final int maxSizeNeeded = 2*length;
int size = output.length;
while (size < maxSizeNeeded)
size *= 2;
if (size != output.length)
output = new char[size];
outputPos = 0;
int pos = 0;
for (int i=0; i<length; i++, pos++) {
final char c = input[pos];
// Quick test: if it's not in range then just keep
// current character
if (c < '\u00c0' || c > '\uFB06')
output[outputPos++] = c;
else {
switch (c) {
case '\u00C0' : // À
case '\u00C1' : // Á
case '\u00C2' : // Â
case '\u00C3' : // Ã
case '\u00C4' : // Ä
case '\u00C5' : // Å
output[outputPos++] = 'A';
break;
case '\u00C6' : // Æ
output[outputPos++] = 'A';
output[outputPos++] = 'E';
break;
case '\u00C7' : // Ç
output[outputPos++] = 'C';
break;
case '\u00C8' : // È
case '\u00C9' : // É
case '\u00CA' : // Ê
case '\u00CB' : // Ë
output[outputPos++] = 'E';
break;
case '\u00CC' : // Ì
case '\u00CD' : // Í
case '\u00CE' : // Î
case '\u00CF' : // Ï
output[outputPos++] = 'I';
break;
case '\u0132' : // IJ
output[outputPos++] = 'I';
output[outputPos++] = 'J';
break;
case '\u00D0' : // Ð
output[outputPos++] = 'D';
break;
case '\u00D1' : // Ñ
output[outputPos++] = 'N';
break;
case '\u00D2' : // Ò
case '\u00D3' : // Ó
case '\u00D4' : // Ô
case '\u00D5' : // Õ
case '\u00D6' : // Ö
case '\u00D8' : // Ø
output[outputPos++] = 'O';
break;
case '\u0152' : // Œ
output[outputPos++] = 'O';
output[outputPos++] = 'E';
break;
case '\u00DE' : // Þ
output[outputPos++] = 'T';
output[outputPos++] = 'H';
break;
case '\u00D9' : // Ù
case '\u00DA' : // Ú
case '\u00DB' : // Û
case '\u00DC' : // Ü
output[outputPos++] = 'U';
break;
case '\u00DD' : // Ý
case '\u0178' : // Ÿ
output[outputPos++] = 'Y';
break;
case '\u00E0' : // à
case '\u00E1' : // á
case '\u00E2' : // â
case '\u00E3' : // ã
case '\u00E4' : // ä
case '\u00E5' : // å
output[outputPos++] = 'a';
break;
case '\u00E6' : // æ
output[outputPos++] = 'a';
output[outputPos++] = 'e';
break;
case '\u00E7' : // ç
output[outputPos++] = 'c';
break;
case '\u00E8' : // è
case '\u00E9' : // é
case '\u00EA' : // ê
case '\u00EB' : // ë
output[outputPos++] = 'e';
break;
case '\u00EC' : // ì
case '\u00ED' : // í
case '\u00EE' : // î
case '\u00EF' : // ï
output[outputPos++] = 'i';
break;
case '\u0133' : // ij
output[outputPos++] = 'i';
output[outputPos++] = 'j';
break;
case '\u00F0' : // ð
output[outputPos++] = 'd';
break;
case '\u00F1' : // ñ
output[outputPos++] = 'n';
break;
case '\u00F2' : // ò
case '\u00F3' : // ó
case '\u00F4' : // ô
case '\u00F5' : // õ
case '\u00F6' : // ö
case '\u00F8' : // ø
output[outputPos++] = 'o';
break;
case '\u0153' : // œ
output[outputPos++] = 'o';
output[outputPos++] = 'e';
break;
case '\u00DF' : // ß
output[outputPos++] = 's';
output[outputPos++] = 's';
break;
case '\u00FE' : // þ
output[outputPos++] = 't';
output[outputPos++] = 'h';
break;
case '\u00F9' : // ù
case '\u00FA' : // ú
case '\u00FB' : // û
case '\u00FC' : // ü
output[outputPos++] = 'u';
break;
case '\u00FD' : // ý
case '\u00FF' : // ÿ
output[outputPos++] = 'y';
break;
case '\uFB00': //
output[outputPos++] = 'f';
output[outputPos++] = 'f';
break;
case '\uFB01': //
output[outputPos++] = 'f';
output[outputPos++] = 'i';
break;
case '\uFB02': //
output[outputPos++] = 'f';
output[outputPos++] = 'l';
break;
// following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
// case '\uFB03': //
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'i';
// break;
// case '\uFB04': //
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'l';
// break;
case '\uFB05': //
output[outputPos++] = 'f';
output[outputPos++] = 't';
break;
case '\uFB06': //
output[outputPos++] = 's';
output[outputPos++] = 't';
break;
default :
output[outputPos++] = c;
break;
}
}
}
}
}
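// Illustrative sketch, not part of the original Lucene file: removing Latin-1 accents
// from a letter-tokenized stream. It assumes the package's LetterTokenizer and the
// TermAttribute API; the sample text and class name are hypothetical.
class ISOLatin1AccentFilterExample {
  public static void main(String[] args) throws java.io.IOException {
    TokenStream ts = new ISOLatin1AccentFilter(
        new LetterTokenizer(new java.io.StringReader("caf\u00E9 d\u00E9j\u00E0 vu")));
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // cafe, deja, vu
    }
    ts.close();
  }
}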

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
/**
* "Tokenizes" the entire stream as a single token. This is useful
* for data like zip codes, ids, and some product names.
*/
public class KeywordAnalyzer extends Analyzer {
public KeywordAnalyzer() {
setOverridesTokenStreamMethod(KeywordAnalyzer.class);
}
@Override
public TokenStream tokenStream(String fieldName,
final Reader reader) {
return new KeywordTokenizer(reader);
}
@Override
public TokenStream reusableTokenStream(String fieldName,
final Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new KeywordTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
}
}
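// Illustrative sketch, not part of the original Lucene file: KeywordAnalyzer emits the
// whole input as one token, which suits identifier-like fields. The field name, value,
// and class name are hypothetical; it assumes the TermAttribute API.
class KeywordAnalyzerExample {
  public static void main(String[] args) throws java.io.IOException {
    Analyzer analyzer = new KeywordAnalyzer();
    TokenStream ts = analyzer.tokenStream("orderId",
        new java.io.StringReader("AB-1234/56"));
    org.apache.lucene.analysis.tokenattributes.TermAttribute term =
        ts.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // prints the single token "AB-1234/56"
    }
    ts.close();
  }
}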

View File

@ -0,0 +1,98 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
private int finalOffset;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
}
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
init(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
init(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
init(bufferSize);
}
private void init(int bufferSize) {
this.done = false;
termAtt = addAttribute(TermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt.resizeTermBuffer(bufferSize);
}
@Override
public final boolean incrementToken() throws IOException {
if (!done) {
clearAttributes();
done = true;
int upto = 0;
char[] buffer = termAtt.termBuffer();
while (true) {
final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
upto += length;
if (upto == buffer.length)
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
finalOffset = correctOffset(upto);
offsetAtt.setOffset(correctOffset(0), finalOffset);
return true;
}
return false;
}
@Override
public final void end() {
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.done = false;
}
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Removes words that are too long or too short from the stream.
*/
public final class LengthFilter extends TokenFilter {
final int min;
final int max;
private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
* short from the text.
*/
public LengthFilter(TokenStream in, int min, int max)
{
super(in);
this.min = min;
this.max = max;
termAtt = addAttribute(TermAttribute.class);
}
/**
 * Returns the next input token whose term() has an acceptable length
*/
@Override
public final boolean incrementToken() throws IOException {
    // return the first token whose length is within [min, max]
while (input.incrementToken()) {
int len = termAtt.termLength();
if (len >= min && len <= max) {
return true;
}
// note: else we ignore it but should we index each part of it?
}
// reached EOS -- return false
return false;
}
}
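// Illustrative sketch, not part of the original Lucene file: dropping tokens shorter
// than 3 or longer than 10 characters from a letter-tokenized stream. It assumes the
// TermAttribute API; the sample text and class name are hypothetical.
class LengthFilterExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new LengthFilter(
        new LetterTokenizer(
            new java.io.StringReader("a an antidisestablishmentarianism word")), 3, 10);
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // only "word" passes the length check
    }
    ts.close();
  }
}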

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That is
 to say, it defines tokens as maximal strings of adjacent letters, as defined
 by the java.lang.Character.isLetter() predicate.
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public class LetterTokenizer extends CharTokenizer {
/** Construct a new LetterTokenizer. */
public LetterTokenizer(Reader in) {
super(in);
}
/** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
public LetterTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
public LetterTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which satisfy
* {@link Character#isLetter(char)}.*/
@Override
protected boolean isTokenChar(char c) {
return Character.isLetter(c);
}
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case.
*/
public final class LowerCaseFilter extends TokenFilter {
public LowerCaseFilter(TokenStream in) {
super(in);
termAtt = addAttribute(TermAttribute.class);
}
private TermAttribute termAtt;
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final char[] buffer = termAtt.termBuffer();
final int length = termAtt.termLength();
for(int i=0;i<length;i++)
buffer[i] = Character.toLowerCase(buffer[i]);
return true;
} else
return false;
}
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
 * and LowerCaseFilter together. It divides text at non-letters and converts
 * the letters to lower case. While it is functionally equivalent to the combination
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
* to doing the two tasks at once, hence this (redundant) implementation.
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
*/
public final class LowerCaseTokenizer extends LetterTokenizer {
/** Construct a new LowerCaseTokenizer. */
public LowerCaseTokenizer(Reader in) {
super(in);
}
/** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
public LowerCaseTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Converts char to lower case
* {@link Character#toLowerCase(char)}.*/
@Override
protected char normalize(char c) {
return Character.toLowerCase(c);
}
}
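// Illustrative sketch, not part of the original Lucene file: LowerCaseTokenizer emits
// the same tokens as LetterTokenizer followed by LowerCaseFilter, in one pass. It
// assumes the TermAttribute API; the sample text and class name are hypothetical.
class LowerCaseTokenizerExample {
  public static void main(String[] args) throws java.io.IOException {
    TokenStream ts = new LowerCaseTokenizer(new java.io.StringReader("Foo BAR"));
    org.apache.lucene.analysis.tokenattributes.TermAttribute term =
        ts.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // foo, bar
    }
    ts.close();
  }
}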

View File

@ -0,0 +1,137 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
/**
* Simplistic {@link CharFilter} that applies the mappings
* contained in a {@link NormalizeCharMap} to the character
 * stream, correcting the offsets to account for the
 * resulting changes.
*/
public class MappingCharFilter extends BaseCharFilter {
private final NormalizeCharMap normMap;
private LinkedList<Character> buffer;
private String replacement;
private int charPointer;
private int nextCharCounter;
  /** Constructor that takes a {@link CharStream}. */
public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
super(in);
this.normMap = normMap;
}
/** Easy-use constructor that takes a {@link Reader}. */
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
super(CharReader.get(in));
this.normMap = normMap;
}
@Override
public int read() throws IOException {
while(true) {
if (replacement != null && charPointer < replacement.length()) {
return replacement.charAt(charPointer++);
}
int firstChar = nextChar();
if (firstChar == -1) return -1;
NormalizeCharMap nm = normMap.submap != null ?
normMap.submap.get(Character.valueOf((char) firstChar)) : null;
if (nm == null) return firstChar;
NormalizeCharMap result = match(nm);
if (result == null) return firstChar;
replacement = result.normStr;
charPointer = 0;
if (result.diff != 0) {
int prevCumulativeDiff = getLastCumulativeDiff();
if (result.diff < 0) {
for(int i = 0; i < -result.diff ; i++)
addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
} else {
addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
}
}
}
}
private int nextChar() throws IOException {
nextCharCounter++;
if (buffer != null && !buffer.isEmpty()) {
return buffer.removeFirst().charValue();
}
return input.read();
}
private void pushChar(int c) {
nextCharCounter--;
if(buffer == null)
buffer = new LinkedList<Character>();
buffer.addFirst(Character.valueOf((char) c));
}
private void pushLastChar(int c) {
if (buffer == null) {
buffer = new LinkedList<Character>();
}
buffer.addLast(Character.valueOf((char) c));
}
private NormalizeCharMap match(NormalizeCharMap map) throws IOException {
NormalizeCharMap result = null;
if (map.submap != null) {
int chr = nextChar();
if (chr != -1) {
NormalizeCharMap subMap = map.submap.get(Character.valueOf((char) chr));
if (subMap != null) {
result = match(subMap);
}
if (result == null) {
pushChar(chr);
}
}
}
if (result == null && map.normStr != null) {
result = map;
}
return result;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
char[] tmp = new char[len];
int l = input.read(tmp, 0, len);
if (l != -1) {
for(int i = 0; i < l; i++)
pushLastChar(tmp[i]);
}
l = 0;
for(int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
l++;
}
return l == 0 ? -1 : l;
}
}

View File

@ -0,0 +1,61 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.util.HashMap;
import java.util.Map;
/**
* Holds a map of String input to String output, to be used
* with {@link MappingCharFilter}.
*/
public class NormalizeCharMap {
Map<Character, NormalizeCharMap> submap;
String normStr;
int diff;
  /** Records a replacement to be applied to the input
* stream. Whenever <code>singleMatch</code> occurs in
* the input, it will be replaced with
* <code>replacement</code>.
*
* @param singleMatch input String to be replaced
* @param replacement output String
*/
public void add(String singleMatch, String replacement) {
NormalizeCharMap currMap = this;
for(int i = 0; i < singleMatch.length(); i++) {
char c = singleMatch.charAt(i);
if (currMap.submap == null) {
currMap.submap = new HashMap<Character, NormalizeCharMap>(1);
}
NormalizeCharMap map = currMap.submap.get(Character.valueOf(c));
if (map == null) {
map = new NormalizeCharMap();
currMap.submap.put(Character.valueOf(c), map);
}
currMap = map;
}
if (currMap.normStr != null) {
throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch);
}
currMap.normStr = replacement;
currMap.diff = singleMatch.length() - replacement.length();
}
}
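// Illustrative sketch, not part of the original Lucene file: mapping "\u00DF" to "ss"
// before tokenization. MappingCharFilter applies the NormalizeCharMap and keeps offsets
// consistent with the original text. The sample value and class name are hypothetical.
class NormalizeCharMapExample {
  public static void main(String[] args) throws java.io.IOException {
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("\u00DF", "ss");                                  // one input char becomes two
    java.io.Reader filtered =
        new MappingCharFilter(map, new java.io.StringReader("stra\u00DFe"));
    char[] buf = new char[16];
    int n = filtered.read(buf, 0, buf.length);
    System.out.println(new String(buf, 0, n));                // "strasse"
  }
}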

View File

@ -0,0 +1,252 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.search.NumericRangeFilter; // for javadocs
import org.apache.lucene.search.SortField; // for javadocs
import org.apache.lucene.search.FieldCache; // javadocs
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* <b>Expert:</b> This class provides a {@link TokenStream}
* for indexing numeric values that can be used by {@link
* NumericRangeQuery} or {@link NumericRangeFilter}.
*
* <p>Note that for simple usage, {@link NumericField} is
* recommended. {@link NumericField} disables norms and
* term freqs, as they are not usually needed during
* searching. If you need to change these settings, you
* should use this class.
*
* <p>See {@link NumericField} for capabilities of fields
* indexed numerically.</p>
*
* <p>Here's an example usage, for an <code>int</code> field:
*
* <pre>
* Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value));
* field.setOmitNorms(true);
* field.setOmitTermFreqAndPositions(true);
* document.add(field);
* </pre>
*
* <p>For optimal performance, re-use the TokenStream and Field instance
* for more than one document:
*
* <pre>
* NumericTokenStream stream = new NumericTokenStream(precisionStep);
* Field field = new Field(name, stream);
* field.setOmitNorms(true);
* field.setOmitTermFreqAndPositions(true);
* Document document = new Document();
* document.add(field);
*
* for(all documents) {
* stream.setIntValue(value)
* writer.addDocument(document);
* }
* </pre>
*
* <p>This stream is not intended to be used in analyzers;
* it's more for iterating the different precisions during
* indexing a specific numeric value.</p>
* <p><b>NOTE</b>: as token streams are only consumed once
* the document is added to the index, if you index more
* than one numeric field, use a separate <code>NumericTokenStream</code>
* instance for each.</p>
*
* <p>See {@link NumericRangeQuery} for more details on the
* <a
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
* parameter as well as how numeric fields work under the hood.</p>
*
* <p><font color="red"><b>NOTE:</b> This API is experimental and
* might change in incompatible ways in the next release.</font>
*
* @since 2.9
*/
public final class NumericTokenStream extends TokenStream {
/** The full precision token gets this token type assigned. */
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric";
/** The lower precision tokens gets this token type assigned. */
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric";
/**
* Creates a token stream for numeric values using the default <code>precisionStep</code>
 * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized;
 * before using it, set a value using one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream() {
this(NumericUtils.PRECISION_STEP_DEFAULT);
}
/**
* Creates a token stream for numeric values with the specified
 * <code>precisionStep</code>. The stream is not yet initialized;
 * before using it, set a value using one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream(final int precisionStep) {
super();
this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
}
/**
* Expert: Creates a token stream for numeric values with the specified
* <code>precisionStep</code> using the given {@link AttributeSource}.
 * The stream is not yet initialized;
 * before using it, set a value using one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream(AttributeSource source, final int precisionStep) {
super(source);
this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
}
/**
* Expert: Creates a token stream for numeric values with the specified
* <code>precisionStep</code> using the given
* {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
 * The stream is not yet initialized;
 * before using it, set a value using one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
super(factory);
this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
}
/**
* Initializes the token stream with the supplied <code>long</code> value.
 * @param value the value for which this TokenStream should enumerate tokens.
 * @return this instance, so it can be used like this:
* <code>new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))</code>
*/
public NumericTokenStream setLongValue(final long value) {
this.value = value;
valSize = 64;
shift = 0;
return this;
}
/**
* Initializes the token stream with the supplied <code>int</code> value.
 * @param value the value for which this TokenStream should enumerate tokens.
 * @return this instance, so it can be used like this:
* <code>new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))</code>
*/
public NumericTokenStream setIntValue(final int value) {
this.value = (long) value;
valSize = 32;
shift = 0;
return this;
}
/**
* Initializes the token stream with the supplied <code>double</code> value.
 * @param value the value for which this TokenStream should enumerate tokens.
 * @return this instance, so it can be used like this:
* <code>new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))</code>
*/
public NumericTokenStream setDoubleValue(final double value) {
this.value = NumericUtils.doubleToSortableLong(value);
valSize = 64;
shift = 0;
return this;
}
/**
* Initializes the token stream with the supplied <code>float</code> value.
 * @param value the value for which this TokenStream should enumerate tokens.
 * @return this instance, so it can be used like this:
* <code>new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))</code>
*/
public NumericTokenStream setFloatValue(final float value) {
this.value = (long) NumericUtils.floatToSortableInt(value);
valSize = 32;
shift = 0;
return this;
}
@Override
public void reset() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
shift = 0;
}
@Override
public boolean incrementToken() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
if (shift >= valSize)
return false;
clearAttributes();
final char[] buffer;
switch (valSize) {
case 64:
buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
break;
case 32:
buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
break;
default:
// should not happen
throw new IllegalArgumentException("valSize must be 32 or 64");
}
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0);
shift += precisionStep;
return true;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder("(numeric,valSize=").append(valSize);
sb.append(",precisionStep=").append(precisionStep).append(')');
return sb.toString();
}
// members
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int shift = 0, valSize = 0; // valSize==0 means not initialized
private final int precisionStep;
private long value = 0L;
}

View File

@ -0,0 +1,127 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
/**
* This analyzer is used to facilitate scenarios where different
* fields require different analysis techniques. Use {@link #addAnalyzer}
* to add a non-default analyzer on a field name basis.
*
* <p>Example usage:
*
* <pre>
* PerFieldAnalyzerWrapper aWrapper =
* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
* </pre>
*
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
* and "lastname", for which KeywordAnalyzer will be used.
*
* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
* and query parsing.
*/
public class PerFieldAnalyzerWrapper extends Analyzer {
private Analyzer defaultAnalyzer;
private Map<String,Analyzer> analyzerMap = new HashMap<String,Analyzer>();
/**
* Constructs with default analyzer.
*
* @param defaultAnalyzer Any fields not specifically
* defined to use a different analyzer will use the one provided here.
*/
public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer) {
this(defaultAnalyzer, null);
}
/**
* Constructs with default analyzer and a map of analyzers to use for
* specific fields.
*
* @param defaultAnalyzer Any fields not specifically
* defined to use a different analyzer will use the one provided here.
* @param fieldAnalyzers a Map (String field name to the Analyzer) to be
* used for those fields
*/
public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer,
Map<String,Analyzer> fieldAnalyzers) {
this.defaultAnalyzer = defaultAnalyzer;
if (fieldAnalyzers != null) {
analyzerMap.putAll(fieldAnalyzers);
}
setOverridesTokenStreamMethod(PerFieldAnalyzerWrapper.class);
}
/**
* Defines an analyzer to use for the specified field.
*
* @param fieldName field name requiring a non-default analyzer
* @param analyzer non-default analyzer to use for field
*/
public void addAnalyzer(String fieldName, Analyzer analyzer) {
analyzerMap.put(fieldName, analyzer);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
Analyzer analyzer = analyzerMap.get(fieldName);
if (analyzer == null) {
analyzer = defaultAnalyzer;
}
return analyzer.tokenStream(fieldName, reader);
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
Analyzer analyzer = analyzerMap.get(fieldName);
if (analyzer == null)
analyzer = defaultAnalyzer;
return analyzer.reusableTokenStream(fieldName, reader);
}
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
@Override
public int getPositionIncrementGap(String fieldName) {
Analyzer analyzer = analyzerMap.get(fieldName);
if (analyzer == null)
analyzer = defaultAnalyzer;
return analyzer.getPositionIncrementGap(fieldName);
}
@Override
public String toString() {
return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")";
}
}

View File

@ -0,0 +1,61 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** Transforms the token stream as per the Porter stemming algorithm.
Note: the input to the stemming filter must already be in lower case,
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
down the Tokenizer chain in order for this to work properly!
<P>
To use this filter with other analyzers, you'll want to write an
Analyzer class that sets up the TokenStream chain as you want it.
To use this with LowerCaseTokenizer, for example, you'd write an
analyzer like this:
<P>
<PRE>
class MyAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new PorterStemFilter(new LowerCaseTokenizer(reader));
}
}
</PRE>
*/
public final class PorterStemFilter extends TokenFilter {
private PorterStemmer stemmer;
private TermAttribute termAtt;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
termAtt = addAttribute(TermAttribute.class);
}
@Override
public final boolean incrementToken() throws IOException {
if (!input.incrementToken())
return false;
if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return true;
}
}
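// Illustrative sketch, not part of the original Lucene file: stemming a lowercased
// token stream with the Porter algorithm. It assumes the TermAttribute API; the sample
// text and class name are hypothetical.
class PorterStemFilterExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new PorterStemFilter(
        new LowerCaseTokenizer(new java.io.StringReader("Running quickly")));
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // run, quickli
    }
    ts.close();
  }
}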

View File

@ -0,0 +1,546 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
Porter stemmer in Java. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
See also http://www.tartarus.org/~martin/PorterStemmer/index.html
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
 is then outside the bounds of b.
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.
Release 3.
[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations. ]
*/
import java.io.*;
/**
*
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
 * word can be provided a character at a time (by calling add()), or at once
* by calling one of the various stem(something) methods.
*/
class PorterStemmer
{
private char[] b;
private int i, /* offset into b */
j, k, k0;
private boolean dirty = false;
private static final int INC = 50; /* unit of size whereby b is increased */
private static final int EXTRA = 1;
public PorterStemmer() {
b = new char[INC];
i = 0;
}
/**
* reset() resets the stemmer so it can stem another word. If you invoke
* the stemmer by calling add(char) and then stem(), you must call reset()
* before starting another word.
*/
public void reset() { i = 0; dirty = false; }
/**
* Add a character to the word being stemmed. When you are finished
* adding characters, you can call stem(void) to process the word.
*/
public void add(char ch) {
if (b.length <= i + EXTRA) {
char[] new_b = new char[b.length+INC];
System.arraycopy(b, 0, new_b, 0, b.length);
b = new_b;
}
b[i++] = ch;
}
/**
* After a word has been stemmed, it can be retrieved by toString(),
* or a reference to the internal buffer can be retrieved by getResultBuffer
 * and getResultLength (which is generally more efficient).
*/
@Override
public String toString() { return new String(b,0,i); }
/**
* Returns the length of the word resulting from the stemming process.
*/
public int getResultLength() { return i; }
/**
* Returns a reference to a character buffer containing the results of
* the stemming process. You also need to consult getResultLength()
* to determine the length of the result.
*/
public char[] getResultBuffer() { return b; }
/* cons(i) is true <=> b[i] is a consonant. */
private final boolean cons(int i) {
switch (b[i]) {
case 'a': case 'e': case 'i': case 'o': case 'u':
return false;
case 'y':
return (i==k0) ? true : !cons(i-1);
default:
return true;
}
}
/* m() measures the number of consonant sequences between k0 and j. if c is
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
*/
private final int m() {
int n = 0;
int i = k0;
while(true) {
if (i > j)
return n;
if (! cons(i))
break;
i++;
}
i++;
while(true) {
while(true) {
if (i > j)
return n;
if (cons(i))
break;
i++;
}
i++;
n++;
while(true) {
if (i > j)
return n;
if (! cons(i))
break;
i++;
}
i++;
}
}
/* vowelinstem() is true <=> k0,...j contains a vowel */
private final boolean vowelinstem() {
int i;
for (i = k0; i <= j; i++)
if (! cons(i))
return true;
return false;
}
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
private final boolean doublec(int j) {
if (j < k0+1)
return false;
if (b[j] != b[j-1])
return false;
return cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
and also if the second c is not w,x or y. this is used when trying to
restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
*/
private final boolean cvc(int i) {
if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
return false;
else {
int ch = b[i];
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
}
return true;
}
private final boolean ends(String s) {
int l = s.length();
int o = k-l+1;
if (o < k0)
return false;
for (int i = 0; i < l; i++)
if (b[o+i] != s.charAt(i))
return false;
j = k-l;
return true;
}
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
k. */
void setto(String s) {
int l = s.length();
int o = j+1;
for (int i = 0; i < l; i++)
b[o+i] = s.charAt(i);
k = j+l;
dirty = true;
}
/* r(s) is used further down. */
void r(String s) { if (m() > 0) setto(s); }
/* step1() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
ties -> ti
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
*/
private final void step1() {
if (b[k] == 's') {
if (ends("sses")) k -= 2;
else if (ends("ies")) setto("i");
else if (b[k-1] != 's') k--;
}
if (ends("eed")) {
if (m() > 0)
k--;
}
else if ((ends("ed") || ends("ing")) && vowelinstem()) {
k = j;
if (ends("at")) setto("ate");
else if (ends("bl")) setto("ble");
else if (ends("iz")) setto("ize");
else if (doublec(k)) {
int ch = b[k--];
if (ch == 'l' || ch == 's' || ch == 'z')
k++;
}
else if (m() == 1 && cvc(k))
setto("e");
}
}
/* step2() turns terminal y to i when there is another vowel in the stem. */
private final void step2() {
if (ends("y") && vowelinstem()) {
b[k] = 'i';
dirty = true;
}
}
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
-ation) maps to -ize etc. note that the string before the suffix must give
m() > 0. */
private final void step3() {
if (k == k0) return; /* For Bug 1 */
switch (b[k-1]) {
case 'a':
if (ends("ational")) { r("ate"); break; }
if (ends("tional")) { r("tion"); break; }
break;
case 'c':
if (ends("enci")) { r("ence"); break; }
if (ends("anci")) { r("ance"); break; }
break;
case 'e':
if (ends("izer")) { r("ize"); break; }
break;
case 'l':
if (ends("bli")) { r("ble"); break; }
if (ends("alli")) { r("al"); break; }
if (ends("entli")) { r("ent"); break; }
if (ends("eli")) { r("e"); break; }
if (ends("ousli")) { r("ous"); break; }
break;
case 'o':
if (ends("ization")) { r("ize"); break; }
if (ends("ation")) { r("ate"); break; }
if (ends("ator")) { r("ate"); break; }
break;
case 's':
if (ends("alism")) { r("al"); break; }
if (ends("iveness")) { r("ive"); break; }
if (ends("fulness")) { r("ful"); break; }
if (ends("ousness")) { r("ous"); break; }
break;
case 't':
if (ends("aliti")) { r("al"); break; }
if (ends("iviti")) { r("ive"); break; }
if (ends("biliti")) { r("ble"); break; }
break;
case 'g':
if (ends("logi")) { r("log"); break; }
}
}
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
private final void step4() {
switch (b[k]) {
case 'e':
if (ends("icate")) { r("ic"); break; }
if (ends("ative")) { r(""); break; }
if (ends("alize")) { r("al"); break; }
break;
case 'i':
if (ends("iciti")) { r("ic"); break; }
break;
case 'l':
if (ends("ical")) { r("ic"); break; }
if (ends("ful")) { r(""); break; }
break;
case 's':
if (ends("ness")) { r(""); break; }
break;
}
}
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
private final void step5() {
if (k == k0) return; /* for Bug 1 */
switch (b[k-1]) {
case 'a':
if (ends("al")) break;
return;
case 'c':
if (ends("ance")) break;
if (ends("ence")) break;
return;
case 'e':
if (ends("er")) break; return;
case 'i':
if (ends("ic")) break; return;
case 'l':
if (ends("able")) break;
if (ends("ible")) break; return;
case 'n':
if (ends("ant")) break;
if (ends("ement")) break;
if (ends("ment")) break;
/* element etc. not stripped before the m */
if (ends("ent")) break;
return;
case 'o':
if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break;
return;
/* takes care of -ous */
case 's':
if (ends("ism")) break;
return;
case 't':
if (ends("ate")) break;
if (ends("iti")) break;
return;
case 'u':
if (ends("ous")) break;
return;
case 'v':
if (ends("ive")) break;
return;
case 'z':
if (ends("ize")) break;
return;
default:
return;
}
if (m() > 1)
k = j;
}
/* step6() removes a final -e if m() > 1. */
private final void step6() {
j = k;
if (b[k] == 'e') {
int a = m();
if (a > 1 || a == 1 && !cvc(k-1))
k--;
}
if (b[k] == 'l' && doublec(k) && m() > 1)
k--;
}
/**
* Stem a word provided as a String. Returns the result as a String.
*/
public String stem(String s) {
if (stem(s.toCharArray(), s.length()))
return toString();
else
return s;
}
/** Stem a word contained in a char[]. Returns true if the stemming process
* resulted in a word different from the input. You can retrieve the
* result with getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] word) {
return stem(word, word.length);
}
/** Stem a word contained in a portion of a char[] array. Returns
* true if the stemming process resulted in a word different from
* the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] wordBuffer, int offset, int wordLen) {
reset();
if (b.length < wordLen) {
char[] new_b = new char[wordLen + EXTRA];
b = new_b;
}
System.arraycopy(wordBuffer, offset, b, 0, wordLen);
i = wordLen;
return stem(0);
}
/** Stem a word contained in a leading portion of a char[] array.
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] word, int wordLen) {
return stem(word, 0, wordLen);
}
/** Stem the word placed into the Stemmer buffer through calls to add().
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem() {
return stem(0);
}
public boolean stem(int i0) {
k = i - 1;
k0 = i0;
if (k > k0+1) {
step1(); step2(); step3(); step4(); step5(); step6();
}
// Also, a word is considered dirty if we lopped off letters
// Thanks to Ifigenia Vairelles for pointing this out.
if (i != k+1)
dirty = true;
i = k+1;
return dirty;
}
/** Test program for demonstrating the Stemmer. It reads a file and
* stems each word, writing the result to standard out.
* Usage: Stemmer file-name
*/
public static void main(String[] args) {
PorterStemmer s = new PorterStemmer();
for (int i = 0; i < args.length; i++) {
try {
InputStream in = new FileInputStream(args[i]);
byte[] buffer = new byte[1024];
int bufferLen, offset, ch;
bufferLen = in.read(buffer);
offset = 0;
s.reset();
while(true) {
if (offset < bufferLen)
ch = buffer[offset++];
else {
bufferLen = in.read(buffer);
offset = 0;
if (bufferLen < 0)
ch = -1;
else
ch = buffer[offset++];
}
if (Character.isLetter((char) ch)) {
s.add(Character.toLowerCase((char) ch));
}
else {
s.stem();
System.out.print(s.toString());
s.reset();
if (ch < 0)
break;
else {
System.out.print((char) ch);
}
}
}
in.close();
}
catch (IOException e) {
System.out.println("error reading " + args[i]);
}
}
}
}
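/*
 * Hedged usage sketch (not part of the committed backwards sources): shows the
 * two ways the class javadoc describes for driving the stemmer, whole-word via
 * stem(String) and char-at-a-time via add()/stem(). The class name and sample
 * words are illustrative only.
 */
class PorterStemmerUsageSketch {
  public static void main(String[] args) {
    PorterStemmer stemmer = new PorterStemmer();
    // whole word at once
    System.out.println(stemmer.stem("meetings"));   // prints "meet" (see the step1 examples)
    // or one character at a time, then stem(), then read the result
    stemmer.reset();
    for (char c : "caresses".toCharArray()) {
      stemmer.add(c);
    }
    stemmer.stem();
    System.out.println(stemmer.toString());         // prints "caress"
  }
}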

View File

@ -0,0 +1,42 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.IOException;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter} */
public final class SimpleAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new LowerCaseTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
}
}
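/*
 * Hedged usage sketch (not part of the committed backwards sources): consumes
 * the reusable stream with the attribute-based API. The field name "f" and the
 * sample text are illustrative only; fully qualified names are used where this
 * file has no matching import.
 */
class SimpleAnalyzerUsageSketch {
  public static void main(String[] args) throws IOException {
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    TokenStream ts = analyzer.reusableTokenStream("f", new java.io.StringReader("The Quick BROWN Fox"));
    org.apache.lucene.analysis.tokenattributes.TermAttribute term =
        ts.addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.term());   // the, quick, brown, fox
    }
    ts.end();
    ts.close();
  }
}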

View File

@ -0,0 +1,119 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
import java.util.List;
import org.apache.lucene.util.Version;
/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StopAnalyzer:
* <ul>
* <li> As of 2.9, position increments are preserved
* </ul>
*/
public final class StopAnalyzer extends Analyzer {
private final Set<?> stopWords;
private final boolean enablePositionIncrements;
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
public static final Set<?> ENGLISH_STOP_WORDS_SET;
static {
final List<String> stopWords = Arrays.asList(
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
);
final CharArraySet stopSet = new CharArraySet(stopWords.size(), false);
stopSet.addAll(stopWords);
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
/** Builds an analyzer which removes words in
* {@link #ENGLISH_STOP_WORDS_SET}.
* @param matchVersion See <a href="#version">above</a>
*/
public StopAnalyzer(Version matchVersion) {
stopWords = ENGLISH_STOP_WORDS_SET;
enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
}
/** Builds an analyzer with the stop words from the given set.
* @param matchVersion See <a href="#version">above</a>
* @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
this.stopWords = stopWords;
enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
}
/** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(File)
* @param matchVersion See <a href="#version">above</a>
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwordsFile);
this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
}
/** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader)
* @param matchVersion See <a href="#version">above</a>
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwords);
this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
}
/** Filters LowerCaseTokenizer with StopFilter. */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
}
/** Holds the saved tokenizer and filtered stream for reuse in reusableTokenStream. */
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new LowerCaseTokenizer(reader);
streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
setPreviousTokenStream(streams);
} else
streams.source.reset(reader);
return streams.result;
}
}
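/*
 * Hedged usage sketch (not part of the committed backwards sources): builds the
 * analyzer with an explicit Version, as the class javadoc requires. LUCENE_29 is
 * used only because it is the constant this package already references; the
 * field name and sample text are illustrative.
 */
class StopAnalyzerUsageSketch {
  public static void main(String[] args) throws IOException {
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_29);
    TokenStream ts = analyzer.tokenStream("f", new java.io.StringReader("no fox is the same"));
    org.apache.lucene.analysis.tokenattributes.TermAttribute term =
        ts.addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());   // fox, same ("no", "is", "the" are stop words)
    }
    ts.close();
  }
}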

View File

@ -0,0 +1,191 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;
/**
* Removes stop words from a token stream.
*/
public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
private boolean enablePositionIncrements = false;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
/**
* Construct a token stream filtering the given input.
* If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be directly used
* and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
* directly controls case sensitivity.
* <p/>
* If <code>stopWords</code> is not an instance of {@link CharArraySet},
* a new CharArraySet will be constructed and <code>ignoreCase</code> will be
* used to specify the case sensitivity of that set.
*
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param input Input TokenStream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @param ignoreCase if true, all words are lower cased first
*/
public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
super(input);
if (stopWords instanceof CharArraySet) {
this.stopWords = (CharArraySet)stopWords;
} else {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
this.stopWords.addAll(stopWords);
}
this.enablePositionIncrements = enablePositionIncrements;
termAtt = addAttribute(TermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
}
/**
* Constructs a filter which removes words from the input
* TokenStream that are named in the Set.
*
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param in Input stream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @see #makeStopSet(java.lang.String[])
*/
public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
this(enablePositionIncrements, in, stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*
* @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
*/
public static final Set<Object> makeStopSet(String... stopWords) {
return makeStopSet(stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @return A Set ({@link CharArraySet}) containing the words
* @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
*/
public static final Set<Object> makeStopSet(List<?> stopWords) {
return makeStopSet(stopWords, false);
}
/**
*
* @param stopWords An array of stopwords
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
*/
public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
stopSet.addAll(Arrays.asList(stopWords));
return stopSet;
}
/**
*
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @param ignoreCase if true, all words are lower cased first
* @return A Set ({@link CharArraySet}) containing the words
*/
public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
stopSet.addAll(stopWords);
return stopSet;
}
/**
* Advances to the next input token whose term() is not a stop word; returns false at end of stream.
*/
@Override
public final boolean incrementToken() throws IOException {
// return the first non-stop word found
int skippedPositions = 0;
while (input.incrementToken()) {
if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
if (enablePositionIncrements) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;
}
/**
* Returns version-dependent default for
* enablePositionIncrements. Analyzers that embed
* StopFilter use this method when creating the
* StopFilter. Prior to 2.9, this returns false. On 2.9
* or later, it returns true.
*/
public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_29);
}
/**
* @see #setEnablePositionIncrements(boolean).
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* If <code>true</code>, this StopFilter will preserve
* positions of the incoming tokens (ie, accumulate and
* set position increments of the removed stop tokens).
* Generally, <code>true</code> is best as it does not
* lose information (positions of the original tokens)
* during indexing.
*
* <p> When set, when a token is stopped
* (omitted), the position increment of the following
* token is incremented.
*
* <p> <b>NOTE</b>: be sure to also
* set {@link QueryParser#setEnablePositionIncrements} if
* you use QueryParser to create queries.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}
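/*
 * Hedged usage sketch (not part of the committed backwards sources): wraps a
 * WhitespaceTokenizer in a StopFilter built with makeStopSet() and prints the
 * surviving terms with their position increments, showing how
 * enablePositionIncrements accumulates the gaps left by removed stop words.
 * The stop list and sample text are illustrative only.
 */
class StopFilterUsageSketch {
  public static void main(String[] args) throws IOException {
    Set<Object> stops = StopFilter.makeStopSet("the", "of");
    TokenStream ts = new StopFilter(true,
        new WhitespaceTokenizer(new java.io.StringReader("the tip of the iceberg")), stops);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // expected output: "tip +2" and "iceberg +3"
      System.out.println(term.term() + " +" + posIncr.getPositionIncrement());
    }
    ts.close();
  }
}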

View File

@ -0,0 +1,245 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
* This TokenFilter provides the ability to set aside attribute states
* that have already been analyzed. This is useful in situations where multiple fields share
* many common analysis steps and then go their separate ways.
* <p/>
* It is also useful for doing things like entity extraction or proper noun analysis as
* part of the analysis workflow and saving off those tokens for use in another field.
*
* <pre>
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);
TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);
d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
* </pre>
* In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
* <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
* and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
* It is important that tees are consumed before sinks (in the above example, the tee field names must be
* less than the sink field names). If you are not sure which stream is consumed first, you can simply
* add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
* This TokenFilter is exhausted after that. To do this, change
* the example above to:
* <pre>
...
TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
TokenStream final2 = source2.newSinkTokenStream();
sink1.consumeAllTokens();
sink2.consumeAllTokens();
...
* </pre>
* In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
* <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
*/
public final class TeeSinkTokenFilter extends TokenFilter {
private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<WeakReference<SinkTokenStream>>();
/**
* Instantiates a new TeeSinkTokenFilter.
*/
public TeeSinkTokenFilter(TokenStream input) {
super(input);
}
/**
* Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
*/
public SinkTokenStream newSinkTokenStream() {
return newSinkTokenStream(ACCEPT_ALL_FILTER);
}
/**
* Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
* that pass the supplied filter.
* @see SinkFilter
*/
public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
this.sinks.add(new WeakReference<SinkTokenStream>(sink));
return sink;
}
/**
* Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
* to this one. The supplied stream will also receive all consumed tokens.
* This method can be used to pass tokens from two different tees to one sink.
*/
public void addSinkTokenStream(final SinkTokenStream sink) {
// check that sink has correct factory
if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
}
// add eventually missing attribute impls to the existing sink
for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
sink.addAttributeImpl(it.next());
}
this.sinks.add(new WeakReference<SinkTokenStream>(sink));
}
/**
* <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
* when it is itself consumed. To be sure that all tokens from the input
* stream are passed to the sinks, you can call this method.
* This instance is exhausted afterwards, but all sinks are immediately available.
*/
public void consumeAllTokens() throws IOException {
while (incrementToken());
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
// capture state lazily - maybe no SinkFilter accepts this state
AttributeSource.State state = null;
for (WeakReference<SinkTokenStream> ref : sinks) {
final SinkTokenStream sink = ref.get();
if (sink != null) {
if (sink.accept(this)) {
if (state == null) {
state = this.captureState();
}
sink.addState(state);
}
}
}
return true;
}
return false;
}
@Override
public final void end() throws IOException {
super.end();
AttributeSource.State finalState = captureState();
for (WeakReference<SinkTokenStream> ref : sinks) {
final SinkTokenStream sink = ref.get();
if (sink != null) {
sink.setFinalState(finalState);
}
}
}
/**
* A filter that decides which {@link AttributeSource} states to store in the sink.
*/
public static abstract class SinkFilter {
/**
* Returns true, iff the current state of the passed-in {@link AttributeSource} shall be stored
* in the sink.
*/
public abstract boolean accept(AttributeSource source);
/**
* Called by {@link SinkTokenStream#reset()}. This method does nothing by default
* and can optionally be overridden.
*/
public void reset() throws IOException {
// nothing to do; can be overridden
}
}
public static final class SinkTokenStream extends TokenStream {
private final List<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
private AttributeSource.State finalState;
private Iterator<AttributeSource.State> it = null;
private SinkFilter filter;
private SinkTokenStream(AttributeSource source, SinkFilter filter) {
super(source);
this.filter = filter;
}
private boolean accept(AttributeSource source) {
return filter.accept(source);
}
private void addState(AttributeSource.State state) {
if (it != null) {
throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
}
cachedStates.add(state);
}
private void setFinalState(AttributeSource.State finalState) {
this.finalState = finalState;
}
@Override
public final boolean incrementToken() throws IOException {
// lazy init the iterator
if (it == null) {
it = cachedStates.iterator();
}
if (!it.hasNext()) {
return false;
}
AttributeSource.State state = it.next();
restoreState(state);
return true;
}
@Override
public final void end() throws IOException {
if (finalState != null) {
restoreState(finalState);
}
}
@Override
public final void reset() {
it = cachedStates.iterator();
}
}
private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
@Override
public boolean accept(AttributeSource source) {
return true;
}
};
}
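/*
 * Hedged runnable sketch (not part of the committed backwards sources): a small
 * concrete counterpart to the class javadoc example, using only classes from
 * this package. The tee is drained with consumeAllTokens() so the sink can be
 * read immediately; the sample text is illustrative only.
 */
class TeeSinkUsageSketch {
  public static void main(String[] args) throws IOException {
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(
        new WhitespaceTokenizer(new java.io.StringReader("tee and sink")));
    TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream();
    tee.consumeAllTokens();   // the tee must be consumed before its sinks
    org.apache.lucene.analysis.tokenattributes.TermAttribute term =
        sink.addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (sink.incrementToken()) {
      System.out.println(term.term());   // tee, and, sink
    }
  }
}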

View File

@ -0,0 +1,811 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;
/**
A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
display, etc.
<p>
The type is a string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
<p>
A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
length byte array. Use {@link TermPositions#getPayloadLength()} and
{@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
<br><br>
<p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
Even though it is not necessary to use Token anymore, with the new TokenStream API it can
be used as a convenience class that implements all {@link Attribute}s, which is especially useful
to easily switch from the old to the new TokenStream API.
<br><br>
<p>Tokenizers and TokenFilters should try to re-use a Token
instance when possible for best performance, by
implementing the {@link TokenStream#incrementToken()} API.
Failing that, to create a new Token you should first use
one of the constructors that starts with null text. To load
the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}.
Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
if you know that your text is shorter than the capacity of the termBuffer
or {@link #resizeTermBuffer(int)}, if there is any possibility
that you may need to grow the buffer. Fill in the characters of your term into this
buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
set the length of the term text. See <a target="_top"
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
for details.</p>
<p>Typical Token reuse patterns:
<ul>
<li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre>
return reusableToken.reinit(string, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre>
return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre>
return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre>
return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre>
return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
</pre>
</li>
</ul>
A few things to note:
<ul>
<li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
<li>The startOffset and endOffset represent the start and offset in the source text, so be careful in adjusting them.</li>
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
</ul>
</p>
@see org.apache.lucene.index.Payload
*/
public class Token extends AttributeImpl
implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
FlagsAttribute, OffsetAttribute, PayloadAttribute {
public static final String DEFAULT_TYPE = "word";
private static int MIN_BUFFER_SIZE = 10;
private char[] termBuffer;
private int termLength;
private int startOffset,endOffset;
private String type = DEFAULT_TYPE;
private int flags;
private Payload payload;
private int positionIncrement = 1;
/** Constructs a Token with null text. */
public Token() {
}
/** Constructs a Token with null text and start & end
* offsets.
* @param start start offset in the source text
* @param end end offset in the source text */
public Token(int start, int end) {
startOffset = start;
endOffset = end;
}
/** Constructs a Token with null text and start & end
* offsets plus the Token type.
* @param start start offset in the source text
* @param end end offset in the source text
* @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
startOffset = start;
endOffset = end;
type = typ;
}
/**
* Constructs a Token with null text and start & end
* offsets plus flags. NOTE: flags is EXPERIMENTAL.
* @param start start offset in the source text
* @param end end offset in the source text
* @param flags The bits to set for this token
*/
public Token(int start, int end, int flags) {
startOffset = start;
endOffset = end;
this.flags = flags;
}
/** Constructs a Token with the given term text, and start
* & end offsets. The type defaults to "word."
* <b>NOTE:</b> for better indexing speed you should
* instead use the char[] termBuffer methods to set the
* term text.
* @param text term text
* @param start start offset
* @param end end offset
*/
public Token(String text, int start, int end) {
setTermBuffer(text);
startOffset = start;
endOffset = end;
}
/** Constructs a Token with the given text, start and end
* offsets, & type. <b>NOTE:</b> for better indexing
* speed you should instead use the char[] termBuffer
* methods to set the term text.
* @param text term text
* @param start start offset
* @param end end offset
* @param typ token type
*/
public Token(String text, int start, int end, String typ) {
setTermBuffer(text);
startOffset = start;
endOffset = end;
type = typ;
}
/**
* Constructs a Token with the given text, start and end
* offsets, & type. <b>NOTE:</b> for better indexing
* speed you should instead use the char[] termBuffer
* methods to set the term text.
* @param text
* @param start
* @param end
* @param flags token type bits
*/
public Token(String text, int start, int end, int flags) {
setTermBuffer(text);
startOffset = start;
endOffset = end;
this.flags = flags;
}
/**
* Constructs a Token with the given term buffer (offset
* & length), start and end
* offsets
* @param startTermBuffer
* @param termBufferOffset
* @param termBufferLength
* @param start
* @param end
*/
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength);
startOffset = start;
endOffset = end;
}
/** Set the position increment. This determines the position of this token
* relative to the previous Token in a {@link TokenStream}, used in phrase
* searching.
*
* <p>The default value is one.
*
* <p>Some common uses for this are:<ul>
*
* <li>Set it to zero to put multiple terms in the same position. This is
* useful if, e.g., a word has multiple stems. Searches for phrases
* including either stem will match. In this case, all but the first stem's
* increment should be set to zero: the increment of the first instance
* should be one. Repeating a token with an increment of zero can also be
* used to boost the scores of matches on that token.
*
* <li>Set it to values greater than one to inhibit exact phrase matches.
* If, for example, one does not want phrases to match across removed stop
* words, then one could build a stop word filter that removes stop words and
* also sets the increment to the number of stop words removed before each
* non-stop word. Then exact phrase queries will only match when the terms
* occur with no intervening stop words.
*
* </ul>
* @param positionIncrement the distance from the prior term
* @see org.apache.lucene.index.TermPositions
*/
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
throw new IllegalArgumentException
("Increment must be zero or greater: " + positionIncrement);
this.positionIncrement = positionIncrement;
}
/** Returns the position increment of this Token.
* @see #setPositionIncrement
*/
public int getPositionIncrement() {
return positionIncrement;
}
/** Returns the Token's term text.
*
* This method has a performance penalty
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
* String, use this method, which is nothing more than
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
*/
public final String term() {
initTermBuffer();
return new String(termBuffer, 0, termLength);
}
/** Copies the contents of buffer, starting at offset for
* length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public final void setTermBuffer(char[] buffer, int offset, int length) {
growTermBuffer(length);
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
/** Copies the contents of buffer into the termBuffer array.
* @param buffer the buffer to copy
*/
public final void setTermBuffer(String buffer) {
final int length = buffer.length();
growTermBuffer(length);
buffer.getChars(0, length, termBuffer, 0);
termLength = length;
}
/** Copies the contents of buffer, starting at offset and continuing
* for length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public final void setTermBuffer(String buffer, int offset, int length) {
assert offset <= buffer.length();
assert offset + length <= buffer.length();
growTermBuffer(length);
buffer.getChars(offset, offset + length, termBuffer, 0);
termLength = length;
}
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
* #resizeTermBuffer(int)} to increase it. After
* altering the buffer be sure to call {@link
* #setTermLength} to record the number of valid
* characters that were placed into the termBuffer. */
public final char[] termBuffer() {
initTermBuffer();
return termBuffer;
}
/** Grows the termBuffer to at least size newSize, preserving the
* existing content. Note: If the next operation is to change
* the contents of the term buffer use
* {@link #setTermBuffer(char[], int, int)},
* {@link #setTermBuffer(String)}, or
* {@link #setTermBuffer(String, int, int)}
* to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
}
return termBuffer;
}
/** Allocates a buffer char[] of at least newSize, without preserving the existing content.
* It is always used in places that then set the content.
* @param newSize minimum size of the buffer
*/
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
termLength = 0;
}
}
/** Return number of valid characters (length of the term)
* in the termBuffer array. */
public final int termLength() {
initTermBuffer();
return termLength;
}
/** Set number of valid characters (length of the term) in
* the termBuffer array. Use this to truncate the termBuffer
* or to synchronize with external manipulation of the termBuffer.
* Note: to grow the size of the array,
* use {@link #resizeTermBuffer(int)} first.
* @param length the truncated length
*/
public final void setTermLength(int length) {
initTermBuffer();
if (length > termBuffer.length)
throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
termLength = length;
}
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Note that the difference between endOffset() and startOffset() may not be
equal to {@link #termLength}, as the term text may have been altered by a
stemmer or some other filter. */
public final int startOffset() {
return startOffset;
}
/** Set the starting offset.
@see #startOffset() */
public void setStartOffset(int offset) {
this.startOffset = offset;
}
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. The length
of the token in the source text is (endOffset - startOffset). */
public final int endOffset() {
return endOffset;
}
/** Set the ending offset.
@see #endOffset() */
public void setEndOffset(int offset) {
this.endOffset = offset;
}
/** Set the starting and ending offset.
@see #startOffset() and #endOffset()*/
public void setOffset(int startOffset, int endOffset) {
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/** Returns this Token's lexical type. Defaults to "word". */
public final String type() {
return type;
}
/** Set the lexical type.
@see #type() */
public final void setType(String type) {
this.type = type;
}
/**
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
*
*
* @return The bits
*/
public int getFlags() {
return flags;
}
/**
* @see #getFlags()
*/
public void setFlags(int flags) {
this.flags = flags;
}
/**
* Returns this Token's payload.
*/
public Payload getPayload() {
return this.payload;
}
/**
* Sets this Token's payload.
*/
public void setPayload(Payload payload) {
this.payload = payload;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('(');
initTermBuffer();
if (termBuffer == null)
sb.append("null");
else
sb.append(termBuffer, 0, termLength);
sb.append(',').append(startOffset).append(',').append(endOffset);
if (!type.equals("word"))
sb.append(",type=").append(type);
if (positionIncrement != 1)
sb.append(",posIncr=").append(positionIncrement);
sb.append(')');
return sb.toString();
}
/** Resets the term text, payload, flags, and positionIncrement,
* startOffset, endOffset and token type to default.
*/
@Override
public void clear() {
payload = null;
// Leave termBuffer to allow re-use
termLength = 0;
positionIncrement = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
@Override
public Object clone() {
Token t = (Token)super.clone();
// Do a deep clone
if (termBuffer != null) {
t.termBuffer = (char[]) termBuffer.clone();
}
if (payload != null) {
t.payload = (Payload) payload.clone();
}
return t;
}
/** Makes a clone, but replaces the term buffer &
* start/end offset in the process. This is more
* efficient than doing a full clone (and then calling
* setTermBuffer) because it saves a wasted copy of the old
* termBuffer. */
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
t.positionIncrement = positionIncrement;
t.flags = flags;
t.type = type;
if (payload != null)
t.payload = (Payload) payload.clone();
return t;
}
@Override
public boolean equals(Object obj) {
if (obj == this)
return true;
if (obj instanceof Token) {
Token other = (Token) obj;
initTermBuffer();
other.initTermBuffer();
if (termLength == other.termLength &&
startOffset == other.startOffset &&
endOffset == other.endOffset &&
flags == other.flags &&
positionIncrement == other.positionIncrement &&
subEqual(type, other.type) &&
subEqual(payload, other.payload)) {
for(int i=0;i<termLength;i++)
if (termBuffer[i] != other.termBuffer[i])
return false;
return true;
} else
return false;
} else
return false;
}
private boolean subEqual(Object o1, Object o2) {
if (o1 == null)
return o2 == null;
else
return o1.equals(o2);
}
@Override
public int hashCode() {
initTermBuffer();
int code = termLength;
code = code * 31 + startOffset;
code = code * 31 + endOffset;
code = code * 31 + flags;
code = code * 31 + positionIncrement;
code = code * 31 + type.hashCode();
code = (payload == null ? code : code * 31 + payload.hashCode());
code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
return code;
}
// like clear() but doesn't clear termBuffer/text
private void clearNoTermBuffer() {
payload = null;
positionIncrement = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(char[], int, int)},
* {@link #setStartOffset},
* {@link #setEndOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
clearNoTermBuffer();
payload = null;
positionIncrement = 1;
setTermBuffer(newTermBuffer, newTermOffset, newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(char[], int, int)},
* {@link #setStartOffset},
* {@link #setEndOffset}
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
clearNoTermBuffer();
setTermBuffer(newTermBuffer, newTermOffset, newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(String)},
* {@link #setStartOffset},
* {@link #setEndOffset}
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
clearNoTermBuffer();
setTermBuffer(newTerm);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(String, int, int)},
* {@link #setStartOffset},
* {@link #setEndOffset}
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
clearNoTermBuffer();
setTermBuffer(newTerm, newTermOffset, newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(String)},
* {@link #setStartOffset},
* {@link #setEndOffset}
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
clearNoTermBuffer();
setTermBuffer(newTerm);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #setTermBuffer(String, int, int)},
* {@link #setStartOffset},
* {@link #setEndOffset}
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
clearNoTermBuffer();
setTermBuffer(newTerm, newTermOffset, newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/**
* Copy the prototype token's fields into this one. Note: Payloads are shared.
* @param prototype
*/
public void reinit(Token prototype) {
prototype.initTermBuffer();
setTermBuffer(prototype.termBuffer, 0, prototype.termLength);
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
/**
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
* @param prototype
* @param newTerm
*/
public void reinit(Token prototype, String newTerm) {
setTermBuffer(newTerm);
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
/**
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
* @param prototype
* @param newTermBuffer
* @param offset
* @param length
*/
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
setTermBuffer(newTermBuffer, offset, length);
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
@Override
public void copyTo(AttributeImpl target) {
if (target instanceof Token) {
final Token to = (Token) target;
to.reinit(this);
// reinit shares the payload, so clone it:
if (payload !=null) {
to.payload = (Payload) payload.clone();
}
} else {
initTermBuffer();
((TermAttribute) target).setTermBuffer(termBuffer, 0, termLength);
((OffsetAttribute) target).setOffset(startOffset, endOffset);
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
((FlagsAttribute) target).setFlags(flags);
((TypeAttribute) target).setType(type);
}
}
/** Convenience factory that returns <code>Token</code> as implementation for the basic
* attributes and returns the default impl (with &quot;Impl&quot; appended) for all other
* attributes.
* @since 3.0
*/
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
/** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
* and for all other attributes calls the given delegate factory.
* @since 3.0
*/
public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {
private final AttributeSource.AttributeFactory delegate;
/** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
* and for all other attributes calls the given delegate factory. */
public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
this.delegate = delegate;
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
return attClass.isAssignableFrom(Token.class)
? new Token() : delegate.createAttributeInstance(attClass);
}
@Override
public boolean equals(Object other) {
if (this == other) return true;
if (other instanceof TokenAttributeFactory) {
final TokenAttributeFactory af = (TokenAttributeFactory) other;
return this.delegate.equals(af.delegate);
}
return false;
}
@Override
public int hashCode() {
return delegate.hashCode() ^ 0x0a45aa31;
}
}
}
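/*
 * Hedged reuse sketch (not part of the committed backwards sources): shows the
 * reinit() reuse pattern described in the class javadoc, re-initializing one
 * Token instance instead of allocating a new one per term. Offsets and text are
 * illustrative only.
 */
class TokenReuseSketch {
  public static void main(String[] args) {
    Token reusableToken = new Token();
    // copy text from a String; the type falls back to DEFAULT_TYPE
    reusableToken.reinit("stemmed", 0, 7);
    System.out.println(reusableToken);        // (stemmed,0,7)
    // reuse the same instance (and its termBuffer) for the next term
    char[] buffer = "another".toCharArray();
    reusableToken.reinit(buffer, 0, buffer.length, 8, 15);
    System.out.println(reusableToken.term()); // another
  }
}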

View File

@ -0,0 +1,56 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another TokenStream.
<p>
This is an abstract class; subclasses must override {@link #incrementToken()}.
@see TokenStream
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
protected final TokenStream input;
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
super(input);
this.input = input;
}
/** Performs end-of-stream operations, if any, and then calls <code>end()</code> on the
* input TokenStream.<p/>
* <b>NOTE:</b> Be sure to call <code>super.end()</code> first when overriding this method.*/
@Override
public void end() throws IOException {
input.end();
}
/** Close the input TokenStream. */
@Override
public void close() throws IOException {
input.close();
}
/** Reset the filter as well as the input TokenStream. */
@Override
public void reset() throws IOException {
input.reset();
}
}
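/*
 * Hedged sketch (not part of the committed backwards sources): a minimal
 * TokenFilter subclass illustrating the incrementToken() contract the class
 * javadoc requires. The filter, its name, and the length threshold are
 * illustrative only.
 */
final class MinLengthFilterSketch extends TokenFilter {
  private final org.apache.lucene.analysis.tokenattributes.TermAttribute termAtt;
  private final int minLength;

  MinLengthFilterSketch(TokenStream input, int minLength) {
    super(input);
    this.minLength = minLength;
    this.termAtt = addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // pull tokens from the wrapped stream until one is long enough
    while (input.incrementToken()) {
      if (termAtt.termLength() >= minLength) {
        return true;
      }
    }
    return false;   // end of stream
  }
}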

View File

@ -0,0 +1,161 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Closeable;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
* A <code>TokenStream</code> enumerates the sequence of tokens, either from
* {@link Field}s of a {@link Document} or from query text.
* <p>
* This is an abstract class; concrete subclasses are:
* <ul>
* <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
* <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
* <code>TokenStream</code>.
* </ul>
* A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
* has moved from being {@link Token}-based to {@link Attribute}-based. While
* {@link Token} still exists in 2.9 as a convenience class, the preferred way
* to store the information of a {@link Token} is to use {@link AttributeImpl}s.
* <p>
* <code>TokenStream</code> now extends {@link AttributeSource}, which provides
* access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
* Note that only one instance per {@link AttributeImpl} is created and reused
* for every token. This approach reduces object creation and allows local
* caching of references to the {@link AttributeImpl}s. See
* {@link #incrementToken()} for further details.
* <p>
* <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
* <ol>
* <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
* attributes to/from the {@link AttributeSource}.
* <li>The consumer calls {@link TokenStream#reset()}.
* <li>The consumer retrieves attributes from the stream and stores local
* references to all attributes it wants to access.
* <li>The consumer calls {@link #incrementToken()} until it returns false
* consuming the attributes after each call.
* <li>The consumer calls {@link #end()} so that any end-of-stream operations
* can be performed.
* <li>The consumer calls {@link #close()} to release any resource when finished
* using the <code>TokenStream</code>.
* </ol>
* To make sure that filters and consumers know which attributes are available,
* the attributes must be added during instantiation. Filters and consumers are
* not required to check for availability of attributes in
* {@link #incrementToken()}.
* <p>
* You can find some example code for the new API in the analysis package level
* Javadoc.
* <p>
* Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
* e.g., for buffering purposes (see {@link CachingTokenFilter},
* {@link TeeSinkTokenFilter}). For this usecase
* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
* can be used.
*/
public abstract class TokenStream extends AttributeSource implements Closeable {
/**
* A TokenStream using the default attribute factory.
*/
protected TokenStream() {
super();
}
/**
* A TokenStream that uses the same attributes as the supplied one.
*/
protected TokenStream(AttributeSource input) {
super(input);
}
/**
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
*/
protected TokenStream(AttributeFactory factory) {
super(factory);
}
/**
* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
* the next token. Implementing classes must implement this method and update
* the appropriate {@link AttributeImpl}s with the attributes of the next
* token.
* <P>
* The producer must make no assumptions about the attributes after the method
* has returned: the caller may arbitrarily change them. If the producer
* needs to preserve the state for subsequent calls, it can use
* {@link #captureState} to create a copy of the current attribute state.
* <p>
* This method is called for every token of a document, so an efficient
* implementation is crucial for good performance. To avoid calls to
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
* references to all {@link AttributeImpl}s that this stream uses should be
* retrieved during instantiation.
* <p>
* To ensure that filters and consumers know which attributes are available,
* the attributes must be added during instantiation. Filters and consumers
* are not required to check for availability of attributes in
* {@link #incrementToken()}.
*
* @return false for end of stream; true otherwise
*/
public abstract boolean incrementToken() throws IOException;
/**
* This method is called by the consumer after the last token has been
* consumed, after {@link #incrementToken()} returned <code>false</code>
* (using the new <code>TokenStream</code> API). Streams implementing the old API
* should upgrade to use this feature.
* <p/>
* This method can be used to perform any end-of-stream operations, such as
* setting the final offset of a stream. The final offset of a stream might
* differ from the offset of the last token, e.g. in case one or more whitespace
* characters followed the last token but a {@link WhitespaceTokenizer} was used.
*
* @throws IOException
*/
public void end() throws IOException {
// do nothing by default
}
/**
* Resets this stream to the beginning. This is an optional operation, so
* subclasses may or may not implement this method. {@link #reset()} is not needed for
* the standard indexing process. However, if the tokens of a
* <code>TokenStream</code> are intended to be consumed more than once, it is
* necessary to implement {@link #reset()}. Note that if your TokenStream
* caches tokens and feeds them back again after a reset, it is imperative
* that you clone the tokens when you store them away (on the first pass) as
* well as when you return them (on future passes after {@link #reset()}).
*/
public void reset() throws IOException {}
/** Releases resources associated with this stream. */
public void close() throws IOException {}
}
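A consumer-side sketch of the workflow described in the javadoc above (illustrative only; "analyzer" and "text" are assumed to exist):
TokenStream stream = analyzer.tokenStream("body", new StringReader(text));
TermAttribute termAtt = stream.addAttribute(TermAttribute.class); // keep a local reference
stream.reset();
while (stream.incrementToken()) {  // consume until it returns false
  System.out.println(termAtt.term());
}
stream.end();    // end-of-stream operations, e.g. setting the final offset
stream.close();  // release resources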

View File

@ -0,0 +1,92 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.AttributeSource;
import java.io.Reader;
import java.io.IOException;
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class; subclasses must override {@link #incrementToken()}
<p>
NOTE: Subclasses overriding {@link #incrementToken()} must
call {@link AttributeSource#clearAttributes()} before
setting attributes.
Subclasses overriding {@link #incrementToken()} must call
{@link Token#clear()} before setting Token attributes.
*/
public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
protected Reader input;
/** Construct a tokenizer with null input. */
protected Tokenizer() {}
/** Construct a token stream processing the given input. */
protected Tokenizer(Reader input) {
this.input = CharReader.get(input);
}
/** Construct a tokenizer with null input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory) {
super(factory);
}
/** Construct a token stream processing the given input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory, Reader input) {
super(factory);
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source) {
super(source);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, Reader input) {
super(source);
this.input = CharReader.get(input);
}
/** By default, closes the input Reader. */
@Override
public void close() throws IOException {
input.close();
}
/** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
* this method calls {@link CharStream#correctOffset}, else returns <code>currentOff</code>.
* @param currentOff offset as seen in the output
* @return corrected offset based on the input
* @see CharStream#correctOffset
*/
protected final int correctOffset(int currentOff) {
return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
}
/** Expert: Reset the tokenizer to a new reader. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(Reader input) throws IOException {
this.input = input;
}
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.IOException;
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
public final class WhitespaceAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new WhitespaceTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-Whitespace characters form tokens. */
public class WhitespaceTokenizer extends CharTokenizer {
/** Construct a new WhitespaceTokenizer. */
public WhitespaceTokenizer(Reader in) {
super(in);
}
/** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
public WhitespaceTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(char)}.*/
@Override
protected boolean isTokenChar(char c) {
return !Character.isWhitespace(c);
}
}

View File

@ -0,0 +1,177 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
/**
* Loader for text files that represent a list of stopwords.
*/
public class WordlistLoader {
/**
* Loads a text file and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param wordfile File containing the wordlist
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile) throws IOException {
HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
result = getWordSet(reader);
}
finally {
if (reader != null)
reader.close();
}
return result;
}
/**
* Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param wordfile File containing the wordlist
* @param comment The comment string to ignore
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
result = getWordSet(reader, comment);
}
finally {
if (reader != null)
reader.close();
}
return result;
}
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader) throws IOException {
HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String word = null;
while ((word = br.readLine()) != null) {
result.add(word.trim());
}
}
finally {
if (br != null)
br.close();
}
return result;
}
/**
* Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false){
result.add(word.trim());
}
}
}
finally {
if (br != null)
br.close();
}
return result;
}
/**
* Reads a stem dictionary. Each line contains:
* <pre>word<b>\t</b>stem</pre>
* (i.e. two tab-separated words)
*
* @return stem dictionary that overrules the stemming algorithm
* @throws IOException
*/
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
HashMap<String, String> result = new HashMap<String, String>();
BufferedReader br = null;
FileReader fr = null;
try {
fr = new FileReader(wordstemfile);
br = new BufferedReader(fr);
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
if (fr != null)
fr.close();
if (br != null)
br.close();
}
return result;
}
}
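A usage sketch (illustrative only; the file name and the Version constant are assumptions): load a stopword file with '#' comment lines and build an analyzer from the resulting set.
HashSet<String> stopWords = WordlistLoader.getWordSet(new File("stopwords.txt"), "#");
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30, stopWords);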

View File

@ -0,0 +1,635 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
<p>API and code to convert text into indexable/searchable tokens. Covers {@link org.apache.lucene.analysis.Analyzer} and related classes.</p>
<h2>Parsing? Tokenization? Analysis!</h2>
<p>
Lucene, an indexing and search library, accepts only plain text input.
<p>
<h2>Parsing</h2>
<p>
Applications that build their search capabilities upon Lucene may support documents in various formats &ndash; HTML, XML, PDF, Word &ndash; just to name a few.
Lucene does not care about the <i>Parsing</i> of these and other document formats, and it is the responsibility of the
application using Lucene to use an appropriate <i>Parser</i> to convert the original format into plain text before passing that plain text to Lucene.
<p>
<h2>Tokenization</h2>
<p>
Plain text passed to Lucene for indexing goes through a process generally called tokenization. Tokenization is the process
of breaking input text into small indexing elements &ndash; tokens.
The way input text is broken into tokens heavily influences how people will then be able to search for that text.
For instance, sentence beginnings and endings can be identified to provide for more accurate phrase
and proximity searches (though sentence identification is not provided by Lucene).
<p>
In some cases simply breaking the input text into tokens is not enough &ndash; a deeper <i>Analysis</i> may be needed.
There are many post tokenization steps that can be done, including (but not limited to):
<ul>
<li><a href="http://en.wikipedia.org/wiki/Stemming">Stemming</a> &ndash;
Replacing of words by their stems.
For instance with English stemming "bikes" is replaced by "bike";
now query "bike" can find both documents containing "bike" and those containing "bikes".
</li>
<li><a href="http://en.wikipedia.org/wiki/Stop_words">Stop Words Filtering</a> &ndash;
Common words like "the", "and" and "a" rarely add any value to a search.
Removing them shrinks the index size and increases performance.
It may also reduce some "noise" and actually improve search quality.
</li>
<li><a href="http://en.wikipedia.org/wiki/Text_normalization">Text Normalization</a> &ndash;
Stripping accents and other character markings can make for better searching.
</li>
<li><a href="http://en.wikipedia.org/wiki/Synonym">Synonym Expansion</a> &ndash;
Adding in synonyms at the same token position as the current word can mean better
matching when users search with words in the synonym set.
</li>
</ul>
<p>
<h2>Core Analysis</h2>
<p>
The analysis package provides the mechanism to convert Strings and Readers into tokens that can be indexed by Lucene. There
are three main classes in the package from which all analysis processes are derived. These are:
<ul>
<li>{@link org.apache.lucene.analysis.Analyzer} &ndash; An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
by the indexing and searching processes. See below for more information on implementing your own Analyzer.</li>
<li>{@link org.apache.lucene.analysis.Tokenizer} &ndash; A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
the analysis process.</li>
<li>{@link org.apache.lucene.analysis.TokenFilter} &ndash; A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
for modifying tokens that have been created by the Tokenizer. Common modifications performed by a
TokenFilter are: deletion, stemming, synonym injection, and down-casing. Not all Analyzers require TokenFilters.</li>
</ul>
<b>Lucene 2.9 introduces a new TokenStream API. Please see the section "New TokenStream API" below for more details.</b>
</p>
<h2>Hints, Tips and Traps</h2>
<p>
The synergy between {@link org.apache.lucene.analysis.Analyzer} and {@link org.apache.lucene.analysis.Tokenizer}
is sometimes confusing. To ease this confusion, here are some clarifications:
<ul>
<li>The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of
<u>creating</u> tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer}
is only responsible for <u>breaking</u> the input text into tokens. Very likely, tokens created
by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted
by the {@link org.apache.lucene.analysis.Analyzer} (via one or more
{@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
</li>
<li>{@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream},
but {@link org.apache.lucene.analysis.Analyzer} is not.
</li>
<li>{@link org.apache.lucene.analysis.Analyzer} is "field aware", but
{@link org.apache.lucene.analysis.Tokenizer} is not.
</li>
</ul>
</p>
<p>
Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link
org.apache.lucene.analysis.standard.StandardAnalyzer}. Many applications will have a long and industrious life with nothing more
than the StandardAnalyzer. However, there are a few other classes/packages that are worth mentioning:
<ol>
<li>{@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper} &ndash; Most Analyzers perform the same operation on all
{@link org.apache.lucene.document.Field}s. The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different
{@link org.apache.lucene.document.Field}s.</li>
<li>The contrib/analyzers library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety
of different problems related to searching. Many of the Analyzers are designed to analyze non-English languages.</li>
<li>The contrib/snowball library
located at the root of the Lucene distribution has Analyzer and TokenFilter
implementations for a variety of Snowball stemmers.
See <a href="http://snowball.tartarus.org">http://snowball.tartarus.org</a>
for more information on Snowball stemmers.</li>
<li>There are a variety of Tokenizer and TokenFilter implementations in this package. Take a look around, chances are someone has implemented what you need.</li>
</ol>
</p>
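<p>
To illustrate the PerFieldAnalyzerWrapper item above, here is a minimal sketch (the field name "id", the use of KeywordAnalyzer, and the Version constant are just assumptions for the example):
<PRE>
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_30));
wrapper.addAnalyzer("id", new KeywordAnalyzer()); // "id" is indexed as a single token
// all other fields fall back to the StandardAnalyzer given above
</PRE>
</p>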
<p>
Analysis is one of the main causes of performance degradation during indexing. Simply put, the more you analyze the slower the indexing (in most cases).
Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a
{@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.
</p>
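<p>
As an illustration of that last point, a minimal analyzer along those lines might look like this (a sketch only; it uses the StopFilter constructor that takes the enable-position-increments flag, as StandardAnalyzer does):
<PRE>
public class WhitespaceStopAnalyzer extends Analyzer {
  private final Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new WhitespaceTokenizer(reader);
    return new StopFilter(true, stream, stopWords); // true: preserve position increments
  }
}
</PRE>
</p>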
<h2>Invoking the Analyzer</h2>
<p>
Applications usually do not invoke analysis &ndash; Lucene does it for them:
<ul>
<li>At indexing, as a consequence of
{@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document) addDocument(doc)},
the Analyzer in effect for indexing is invoked for each indexed field of the added document.
</li>
<li>At search, as a consequence of
{@link org.apache.lucene.queryParser.QueryParser#parse(java.lang.String) QueryParser.parse(queryText)},
the QueryParser may invoke the Analyzer in effect.
Note that for some queries analysis does not take place, e.g. wildcard queries.
</li>
</ul>
However, an application might invoke analysis of any text for testing or for any other purpose, something like:
<PRE>
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); // or any other analyzer
TokenStream ts = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println("token: " + termAtt.term());
}
ts.end();
ts.close();
</PRE>
</p>
<h2>Indexing Analysis vs. Search Analysis</h2>
<p>
Selecting the "correct" analyzer is crucial
for search quality, and can also affect indexing and search performance.
The "correct" analyzer differs between applications.
Lucene java's wiki page
<a href="http://wiki.apache.org/lucene-java/AnalysisParalysis">AnalysisParalysis</a>
provides some data on "analyzing your analyzer".
Here are some rules of thumb:
<ol>
<li>Test test test... (did we say test?)</li>
<li>Beware of over analysis &ndash; might hurt indexing performance.</li>
<li>Start with the same analyzer for indexing and search; otherwise searches will not find what they are supposed to...</li>
<li>In some cases a different analyzer is required for indexing and search, for instance:
<ul>
<li>Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)</li>
<li>Query expansion by synonyms, acronyms, auto spell correction, etc.</li>
</ul>
This might sometimes require a modified analyzer &ndash; see the next section on how to do that.
</li>
</ol>
</p>
<h2>Implementing your own Analyzer</h2>
<p>Creating your own Analyzer is straightforward. It usually involves either wrapping an existing Tokenizer and set of TokenFilters to create a new Analyzer
or creating both the Analyzer and a Tokenizer or TokenFilter. Before pursuing this approach, you may find it worthwhile
to explore the contrib/analyzers library and/or ask on the java-user@lucene.apache.org mailing list first to see if what you need already exists.
If you are still committed to creating your own Analyzer or TokenStream derivation (Tokenizer or TokenFilter) have a look at
the source code of any one of the many samples located in this package.
</p>
<p>
The following sections discuss some aspects of implementing your own analyzer.
</p>
<h3>Field Section Boundaries</h3>
<p>
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
is called multiple times for the same field name, we could say that each such call creates a new
section for that field in that document.
In fact, a separate call to
{@link org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) tokenStream(field,reader)}
would take place for each of these so called "sections".
However, the default Analyzer behavior is to treat all these sections as one large section.
This allows phrase search and proximity search to seamlessly cross
boundaries between these "sections".
In other words, if a certain field "f" is added like this:
<PRE>
document.add(new Field("f","first ends",...);
document.add(new Field("f","starts two",...);
indexWriter.addDocument(document);
</PRE>
Then, a phrase search for "ends starts" would find that document.
Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections",
simply by overriding
{@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
<PRE>
Analyzer myAnalyzer = new StandardAnalyzer() {
public int getPositionIncrementGap(String fieldName) {
return 10;
}
};
</PRE>
</p>
<h3>Token Position Increments</h3>
<p>
By default, all tokens created by Analyzers and Tokenizers have a
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
This means that the position stored for that token in the index would be one more than
that of the previous token.
Recall that phrase and proximity searches rely on position info.
</p>
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
that query. But also the phrase query "blue sky" would find that document.
</p>
<p>
If this behavior does not fit the application needs,
a modified analyzer can be used that further increments the positions of
tokens following a removed stop word, using
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like:
<PRE>
public TokenStream tokenStream(final String fieldName, Reader reader) {
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
TokenStream res = new TokenStream() {
TermAttribute termAtt = addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
public boolean incrementToken() throws IOException {
int extraIncrement = 0;
while (true) {
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.term())) {
extraIncrement++; // filter this word
continue;
}
if (extraIncrement>0) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
}
}
return hasNext;
}
}
};
return res;
}
</PRE>
Now, with this modified analyzer, the phrase query "blue sky" would find that document.
But note that this is not yet a perfect solution, because any phrase query "blue w1 w2 sky"
where both w1 and w2 are stop words would match that document.
</p>
<p>
A few more use cases for modifying position increments are:
<ol>
<li>Inhibiting phrase and proximity matches in sentence boundaries &ndash; for this, a tokenizer that
identifies a new sentence can add 1 to the position increment of the first token of the new sentence.</li>
<li>Injecting synonyms &ndash; here, synonyms of a token should be added after that token,
and their position increment should be set to 0.
As a result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and that is how phrase and proximity searches would see them.</li>
</ol>
</p>
<h2>New TokenStream API</h2>
<p>
With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
has getter and setter methods for different properties like positionIncrement and termText.
While this approach was sufficient for the default indexing format, it is not versatile enough for
Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
index formats.
</p>
<p>
A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
is necessary that can transport custom types of data from the documents to the indexer.
</p>
<h3>Attribute and AttributeSource</h3>
Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
AttributeSources.
<p>
Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
<ul>
<li>{@link org.apache.lucene.analysis.tokenattributes.TermAttribute}<p>The term text of a token.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}<p>The start and end offset of token in characters.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}<p>See above for detailed information about position increment.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}<p>The payload that a Token can optionally have.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}<p>The type of the token. Default is 'word'.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}<p>Optional flags a token can have.</p></li>
</ul>
</p>
<h3>Using the new TokenStream API</h3>
There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
to walk through the example below first and come back to this section afterwards.
<ol><li>
Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
with the TokenStream.
</li>
<br>
<li>
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
the Attribute instances.
</li>
<br>
<li>
For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
in incrementToken() will avoid attribute lookups for every token in the document.
</li>
<br>
<li>
All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
of an Attribute as an argument and returns an <b>instance</b>. If an Attribute of the same type was previously added, then
the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
can safely call addAttribute() with the same Attribute type multiple times. Even consumers of TokenStreams should
normally call addAttribute() instead of getAttribute(), because it would not fail if the TokenStream does not have this
Attribute (getAttribute() would throw an IllegalArgumentException, if the Attribute is missing). More advanced code
could simply check with hasAttribute(), if a TokenStream has it, and may conditionally leave out processing for
extra performance.
</li></ol>
<h3>Example</h3>
In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that
have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained
here to illustrate the usage of the new TokenStream API.<br>
Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
<h4>Whitespace tokenization</h4>
<pre>
public class MyAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
return stream;
}
public static void main(String[] args) throws IOException {
// text to tokenize
final String text = "This is a demo of the new TokenStream API";
MyAnalyzer analyzer = new MyAnalyzer();
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
// get the TermAttribute from the TokenStream
TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
stream.reset();
// print all tokens until stream is exhausted
while (stream.incrementToken()) {
System.out.println(termAtt.term());
}
stream.end();
stream.close();
}
}
</pre>
In this simple example, plain whitespace tokenization is performed. In main() a loop consumes the stream and
prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
Here is the output:
<pre>
This
is
a
demo
of
the
new
TokenStream
API
</pre>
<h4>Adding a LengthFilter</h4>
We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
to the chain. Only the tokenStream() method in our analyzer needs to be changed:
<pre>
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
return stream;
}
</pre>
Note how now only words with 3 or more characters are contained in the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
<pre>
public final class LengthFilter extends TokenFilter {
final int min;
final int max;
private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
* short from the text.
*/
public LengthFilter(TokenStream in, int min, int max)
{
super(in);
this.min = min;
this.max = max;
termAtt = addAttribute(TermAttribute.class);
}
/**
* Returns the next input token whose term() has the right length
*/
public final boolean incrementToken() throws IOException
{
assert termAtt != null;
// return the first token whose length is within range
while (input.incrementToken()) {
int len = termAtt.termLength();
if (len >= min && len <= max) {
return true;
}
// note: else we ignore it but should we index each part of it?
}
// reached EOS -- return false
return false;
}
}
</pre>
The TermAttribute is added in the constructor and stored in the instance variable <code>termAtt</code>.
Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
<code>addAttribute()</code> call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
are retrieved from the input stream in the <code>incrementToken()</code> method. By looking at the term text
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
<h4>Adding a custom Attribute</h4>
Now we're going to implement our own custom Attribute for part-of-speech tagging and, accordingly, call it
<code>PartOfSpeechAttribute</code>. First we need to define the interface of the new Attribute:
<pre>
public interface PartOfSpeechAttribute extends Attribute {
public static enum PartOfSpeech {
Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
}
public void setPartOfSpeech(PartOfSpeech pos);
public PartOfSpeech getPartOfSpeech();
}
</pre>
Now we also need to write the implementing class. The name of that class is important here: By default, Lucene
checks if there is a class with the name of the Attribute with the postfix 'Impl'. In this example, we would
consequently call the implementing class <code>PartOfSpeechAttributeImpl</code>. <br/>
This should be the usual behavior. However, there is also an expert-API that allows changing these naming conventions:
{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument
and returns an actual instance. You can implement your own factory if you need to change the default behavior. <br/><br/>
Now here is the actual class that implements our new Attribute. Notice that the class has to extend
{@link org.apache.lucene.util.AttributeImpl}:
<pre>
public final class PartOfSpeechAttributeImpl extends AttributeImpl
implements PartOfSpeechAttribute{
private PartOfSpeech pos = PartOfSpeech.Unknown;
public void setPartOfSpeech(PartOfSpeech pos) {
this.pos = pos;
}
public PartOfSpeech getPartOfSpeech() {
return pos;
}
public void clear() {
pos = PartOfSpeech.Unknown;
}
public void copyTo(AttributeImpl target) {
((PartOfSpeechAttributeImpl) target).pos = pos;
}
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PartOfSpeechAttributeImpl) {
return pos == ((PartOfSpeechAttributeImpl) other).pos;
}
return false;
}
public int hashCode() {
return pos.ordinal();
}
}
</pre>
This is a simple Attribute implementation that has only a single variable that stores the part-of-speech of a token. It extends the
new <code>AttributeImpl</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode()</code>.
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
<pre>
public static class PartOfSpeechTaggingFilter extends TokenFilter {
PartOfSpeechAttribute posAtt;
TermAttribute termAtt;
protected PartOfSpeechTaggingFilter(TokenStream input) {
super(input);
posAtt = addAttribute(PartOfSpeechAttribute.class);
termAtt = addAttribute(TermAttribute.class);
}
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {return false;}
posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
return true;
}
// determine the part of speech for the given term
protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
// naive implementation that tags every uppercased word as noun
if (length > 0 && Character.isUpperCase(term[0])) {
return PartOfSpeech.Noun;
}
return PartOfSpeech.Unknown;
}
}
</pre>
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
stores references in instance variables. Notice how you only need to pass in the interface of the new
Attribute; instantiating the correct class is taken care of automatically.
Now we need to add the filter to the chain:
<pre>
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
stream = new PartOfSpeechTaggingFilter(stream);
return stream;
}
</pre>
Now let's look at the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
to make use of the new PartOfSpeechAttribute and print it out:
<pre>
public static void main(String[] args) throws IOException {
// text to tokenize
final String text = "This is a demo of the new TokenStream API";
MyAnalyzer analyzer = new MyAnalyzer();
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
// get the TermAttribute from the TokenStream
TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
// get the PartOfSpeechAttribute from the TokenStream
PartOfSpeechAttribute posAtt = stream.addAttribute(PartOfSpeechAttribute.class);
stream.reset();
// print all tokens until stream is exhausted
while (stream.incrementToken()) {
System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
}
stream.end();
stream.close();
}
</pre>
The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
the while loop that consumes the stream. Here is the new output:
<pre>
This: Noun
demo: Unknown
the: Unknown
new: Unknown
TokenStream: Noun
API: Noun
</pre>
Each word is now followed by its assigned PartOfSpeech tag. Of course this is naive
part-of-speech tagging. The word 'This' should not even be tagged as a noun; it is only capitalized because it
is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new
API the reader could now write an Attribute and TokenFilter that can specify for each word if it was the first token
of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
as nouns if they are not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise).
As a small hint, this is how the new Attribute class could begin:
<pre>
public class FirstTokenOfSentenceAttributeImpl extends Attribute
implements FirstTokenOfSentenceAttribute {
private boolean firstToken;
public void setFirstToken(boolean firstToken) {
this.firstToken = firstToken;
}
public boolean getFirstToken() {
return firstToken;
}
public void clear() {
firstToken = false;
}
...
</pre>
</body>
</html>

View File

@ -0,0 +1,25 @@
/*
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4 !!!
This grammar currently uses constructs (e.g. :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For backwards compatibility it is currently necessary to support
only Java 1.4; this will change in Lucene 3.1.

View File

@ -0,0 +1,161 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
/**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}, using a list of
* English stop words.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 2.9, StopFilter preserves position
* increments
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
public class StandardAnalyzer extends Analyzer {
private Set<?> stopSet;
/**
* Specifies whether deprecated acronyms should be replaced with HOST type.
* See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>.
*/
private final boolean replaceInvalidAcronym,enableStopPositionIncrements;
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
private final Version matchVersion;
/** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS_SET}).
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
*/
public StandardAnalyzer(Version matchVersion) {
this(matchVersion, STOP_WORDS_SET);
}
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopWords stop words */
public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
stopSet = stopWords;
setOverridesTokenStreamMethod(StandardAnalyzer.class);
enableStopPositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
this.matchVersion = matchVersion;
}
/** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(File)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(result);
result = new StopFilter(enableStopPositionIncrements, result, stopSet);
return result;
}
private static final class SavedStreams {
StandardTokenizer tokenStream;
TokenStream filteredTokenStream;
}
/** Default maximum allowed token length */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
/**
* Set maximum allowed token length. If a token is seen
* that exceeds this length then it is discarded. This
* setting only takes effect the next time tokenStream or
* reusableTokenStream is called.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
}
/**
* @see #setMaxTokenLength
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
setPreviousTokenStream(streams);
streams.tokenStream = new StandardTokenizer(matchVersion, reader);
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
streams.filteredTokenStream, stopSet);
} else {
streams.tokenStream.reset(reader);
}
streams.tokenStream.setMaxTokenLength(maxTokenLength);
streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
return streams.filteredTokenStream;
}
}
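A small usage sketch (illustrative only; the Version constant is an assumption): adjust the token length limit before handing the analyzer to an IndexWriter. As documented above, the new limit takes effect on the next call to tokenStream or reusableTokenStream.
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
analyzer.setMaxTokenLength(64); // tokens longer than 64 characters are discarded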

View File

@ -0,0 +1,76 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
public final class StandardFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
public StandardFilter(TokenStream in) {
super(in);
termAtt = addAttribute(TermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
}
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
// this filter uses the type attribute
private TypeAttribute typeAtt;
private TermAttribute termAtt;
/** Advances to the next token in the stream; returns false at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
@Override
public final boolean incrementToken() throws java.io.IOException {
if (!input.incrementToken()) {
return false;
}
char[] buffer = termAtt.termBuffer();
final int bufferLength = termAtt.termLength();
final String type = typeAtt.type();
if (type == APOSTROPHE_TYPE && // remove 's
bufferLength >= 2 &&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
// Strip last 2 characters off
termAtt.setTermLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
int upto = 0;
for(int i=0;i<bufferLength;i++) {
char c = buffer[i];
if (c != '.')
buffer[upto++] = c;
}
termAtt.setTermLength(upto);
}
return true;
}
}
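
A small usage sketch (illustration only, not part of the committed file) of what the filter does to possessives and acronyms when chained behind StandardTokenizer; Version.LUCENE_30 and the sample text are assumptions made for the example.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class StandardFilterSketch {
  public static void main(String[] args) throws Exception {
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30,
        new StringReader("O'Reilly's I.B.M. laptop"));
    stream = new StandardFilter(stream); // strips the possessive 's and the acronym dots
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(term.term());
    }
    // Expected terms, roughly: O'Reilly, IBM, laptop
    stream.end();
    stream.close();
  }
}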

View File

@ -0,0 +1,244 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/** A grammar-based tokenizer constructed with JFlex
*
* <p> This should be a good tokenizer for most European-language documents:
*
* <ul>
* <li>Splits words at punctuation characters, removing punctuation. However, a
* dot that's not followed by whitespace is considered part of a token.
* <li>Splits words at hyphens, unless there's a number in the token, in which case
* the whole token is interpreted as a product number and is not split.
* <li>Recognizes email addresses and internet hostnames as one token.
* </ul>
*
* <p>Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 2.4, Tokens incorrectly identified as acronyms
 * are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
public final class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final StandardTokenizerImpl scanner;
public static final int ALPHANUM = 0;
public static final int APOSTROPHE = 1;
public static final int ACRONYM = 2;
public static final int COMPANY = 3;
public static final int EMAIL = 4;
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
public static final int ACRONYM_DEP = 8;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>"
};
private boolean replaceInvalidAcronym;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/** Set the max allowed token length. Any token longer
* than this is skipped. */
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
}
/** @see #setMaxTokenLength */
public int getMaxTokenLength() {
return maxTokenLength;
}
/**
* Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
* the <code>input</code> to the newly created JFlex scanner.
*
* @param input The input reader
*
* See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public StandardTokenizer(Version matchVersion, Reader input) {
super();
this.scanner = new StandardTokenizerImpl(input);
init(input, matchVersion);
}
/**
* Creates a new StandardTokenizer with a given {@link AttributeSource}.
*/
public StandardTokenizer(Version matchVersion, AttributeSource source, Reader input) {
super(source);
this.scanner = new StandardTokenizerImpl(input);
init(input, matchVersion);
}
/**
* Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
*/
public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
super(factory);
this.scanner = new StandardTokenizerImpl(input);
init(input, matchVersion);
}
private void init(Reader input, Version matchVersion) {
if (matchVersion.onOrAfter(Version.LUCENE_24)) {
replaceInvalidAcronym = true;
} else {
replaceInvalidAcronym = false;
}
this.input = input;
termAtt = addAttribute(TermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
}
// this tokenizer generates three attributes:
// offset, positionIncrement and type
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
int posIncr = 1;
while(true) {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
return false;
}
if (scanner.yylength() <= maxTokenLength) {
posIncrAtt.setPositionIncrement(posIncr);
scanner.getText(termAtt);
final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
}
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return true;
} else
// When we skip a too-long term, we still increment the
// position increment
posIncr++;
}
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#reset()
*/
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
reset();
}
/**
 * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized tokens like www.abc.com
 * as acronyms when they should have been labeled as hosts instead.
* @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
*
* @deprecated Remove in 3.X and make true the only valid value
*/
public boolean isReplaceInvalidAcronym() {
return replaceInvalidAcronym;
}
/**
*
* @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
* @deprecated Remove in 3.X and make true the only valid value
*
* See https://issues.apache.org/jira/browse/LUCENE-1068
*/
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
}
}
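
A usage sketch (not part of the commit) showing the token types the grammar assigns and the effect of setMaxTokenLength; the sample text and Version.LUCENE_30 are assumptions for the example.

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class StandardTokenizerSketch {
  public static void main(String[] args) throws Exception {
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_30,
        new StringReader("Send mail to wiki@apache.org or visit lucene.apache.org"));
    tokenizer.setMaxTokenLength(64); // longer tokens are skipped, bumping the next position increment
    TermAttribute term = tokenizer.addAttribute(TermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
      System.out.println(type.type() + "\t" + term.term());
    }
    // Roughly: <ALPHANUM> for the plain words, <EMAIL> for wiki@apache.org,
    // <HOST> for lucene.apache.org
    tokenizer.end();
    tokenizer.close();
  }
}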

View File

@ -0,0 +1,723 @@
/* The following code was generated by JFlex 1.4.1 on 9/4/08 6:49 PM */
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4 !!!
This grammar currently uses constructs (eg :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For current backwards compatibility it is needed to support
only Java 1.4 - this will change in Lucene 3.1.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
* on 9/4/08 6:49 PM from the specification file
* <tt>/tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
*/
class StandardTokenizerImpl {
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
public static final int YYINITIAL = 0;
/**
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
"\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
"\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
"\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12\34\0\136\12"+
"\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12\11\0\1\12"+
"\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12\1\0\24\12"+
"\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12\12\0\71\12"+
"\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12\67\0\46\12"+
"\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12\56\0\32\12"+
"\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12\17\0\2\12"+
"\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0\46\12\u015f\0"+
"\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2\25\0"+
"\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12\3\0"+
"\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12\23\0\6\12"+
"\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12\1\0\2\12"+
"\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2\2\0\3\12"+
"\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12\1\0\7\12"+
"\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0\1\12"+
"\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12"+
"\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12"+
"\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12\3\0\2\12"+
"\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12\3\0\10\12"+
"\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2\25\0\10\12"+
"\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\44\0\1\12"+
"\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12\3\0\30\12"+
"\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1\60\12\1\1"+
"\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0\1\12\2\0"+
"\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0\7\12\1\0"+
"\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0\4\12\1\0"+
"\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0\12\2\2\0"+
"\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0\42\12\35\0"+
"\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0\12\2\6\0"+
"\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0\104\12\5\0"+
"\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0\4\12\2\0"+
"\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0\1\12\1\0"+
"\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
"\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0\27\12\1\0"+
"\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\47\12\1\0"+
"\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0"+
"\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0\12\2\6\0"+
"\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0\26\12\2\0"+
"\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0\7\12\1\0"+
"\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0\6\12\4\0"+
"\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0\1\12\4\0"+
"\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0\7\12\u0ecb\0"+
"\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13\2\13\132\13"+
"\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
"\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
"\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12\5\0\1\12"+
"\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12\1\0\2\12"+
"\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12\2\0\66\12"+
"\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12\23\0\12\2"+
"\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
"\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
/**
* Translates characters to character classes
*/
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
/**
* Translates DFA states to action switch labels.
*/
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
"\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
"\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
"\1\4";
private static int [] zzUnpackAction() {
int [] result = new int[51];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
}
private static int zzUnpackAction(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
do result[j++] = value; while (--count > 0);
}
return j;
}
/**
* Translates a state to a row index in the transition table
*/
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
"\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
"\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
"\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
"\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
"\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
"\0\u0268\0\u0276\0\u0284";
private static int [] zzUnpackRowMap() {
int [] result = new int[51];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
}
private static int zzUnpackRowMap(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int high = packed.charAt(i++) << 16;
result[j++] = high | packed.charAt(i++);
}
return j;
}
/**
* The transition table of the DFA
*/
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
"\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
"\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
"\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
"\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
"\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
"\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
"\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
"\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
"\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
"\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
"\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
"\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
"\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
"\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
"\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
"\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
"\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
"\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
"\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
"\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
"\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
"\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
"\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
"\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
"\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
"\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
"\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
"\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
"\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
"\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
"\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
"\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
"\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
"\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
"\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
"\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
"\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
"\1\11\2\52\1\0\1\24\3\0";
private static int [] zzUnpackTrans() {
int [] result = new int[658];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
}
private static int zzUnpackTrans(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
value--;
do result[j++] = value; while (--count > 0);
}
return j;
}
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
/**
* ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
*/
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
"\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
private static int [] zzUnpackAttribute() {
int [] result = new int[51];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
}
private static int zzUnpackAttribute(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
do result[j++] = value; while (--count > 0);
}
return j;
}
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
private int zzMarkedPos;
/** the textposition at the last state to be included in yytext */
private int zzPushbackPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer, that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
private boolean zzAtEOF;
/* user code: */
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM = StandardTokenizer.ACRONYM;
public static final int COMPANY = StandardTokenizer.COMPANY;
public static final int EMAIL = StandardTokenizer.EMAIL;
public static final int HOST = StandardTokenizer.HOST;
public static final int NUM = StandardTokenizer.NUM;
public static final int CJ = StandardTokenizer.CJ;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
public final int yychar()
{
return yychar;
}
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
* Fills TermAttribute with the current token text.
*/
final void getText(TermAttribute t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
StandardTokenizerImpl(java.io.Reader in) {
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also java.io.Reader version of this constructor.
*
* @param in the java.io.Inputstream to read input from.
*/
StandardTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.
*
* @param packed the packed character translation table
* @return the unpacked character translation table
*/
private static char [] zzUnpackCMap(String packed) {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 1154) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
}
return map;
}
/**
* Refills the input buffer.
*
* @return <code>false</code>, iff there was new input.
*
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzPushbackPos-= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length) {
/* if not: blow it up */
char newBuffer[] = new char[zzCurrentPos*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
}
/* finally: fill the buffer with new input */
int numRead = zzReader.read(zzBuffer, zzEndRead,
zzBuffer.length-zzEndRead);
if (numRead < 0) {
return true;
}
else {
zzEndRead+= numRead;
return false;
}
}
/**
* Closes the input stream.
*/
public final void yyclose() throws java.io.IOException {
zzAtEOF = true; /* indicate end of file */
zzEndRead = zzStartRead; /* invalidate buffer */
if (zzReader != null)
zzReader.close();
}
/**
* Resets the scanner to read from a new input stream.
* Does not close the old reader.
*
* All internal variables are reset, the old input stream
* <b>cannot</b> be reused (internal buffer is discarded and lost).
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
*
* @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
zzAtEOF = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
}
/**
* Returns the current lexical state.
*/
public final int yystate() {
return zzLexicalState;
}
/**
* Enters a new lexical state
*
* @param newState the new lexical state
*/
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
* Returns the text matched by the current regular expression.
*/
public final String yytext() {
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
}
/**
* Returns the character at position <tt>pos</tt> from the
* matched text.
*
* It is equivalent to yytext().charAt(pos), but faster
*
* @param pos the position of the character to fetch.
* A value from 0 to yylength()-1.
*
* @return the character at position pos
*/
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
* Returns the length of the matched text region.
*/
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
* Reports an error that occurred while scanning.
*
* In a well-formed scanner (no or only correct usage of
* yypushback(int) and a match-all fallback rule) this method
* will only be called with things that "Can't Possibly Happen".
* If this method is called, something is seriously wrong
* (e.g. a JFlex bug producing a faulty scanner etc.).
*
* Usual syntax/scanner level error handling should be done
* in error fallback rules.
*
* @param errorCode the code of the error message to display
*/
private void zzScanError(int errorCode) {
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
throw new Error(message);
}
/**
* Pushes the specified amount of characters back into the input stream.
*
* They will be read again by the next call of the scanning method
*
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
public void yypushback(int number) {
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
zzMarkedPos -= number;
}
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
*
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
public int getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
int [] zzTransL = ZZ_TRANS;
int [] zzRowMapL = ZZ_ROWMAP;
int [] zzAttrL = ZZ_ATTRIBUTE;
while (true) {
zzMarkedPosL = zzMarkedPos;
yychar+= zzMarkedPosL-zzStartRead;
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
zzState = zzLexicalState;
zzForAction: {
while (true) {
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
}
else {
// store back cached positions
zzCurrentPos = zzCurrentPosL;
zzMarkedPos = zzMarkedPosL;
boolean eof = zzRefill();
// get translated positions and possibly new buffer
zzCurrentPosL = zzCurrentPos;
zzMarkedPosL = zzMarkedPos;
zzBufferL = zzBuffer;
zzEndReadL = zzEndRead;
if (eof) {
zzInput = YYEOF;
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
}
}
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
if (zzNext == -1) break zzForAction;
zzState = zzNext;
int zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
if ( (zzAttributes & 8) == 8 ) break zzForAction;
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 4:
{ return HOST;
}
case 11: break;
case 9:
{ return ACRONYM;
}
case 12: break;
case 8:
{ return ACRONYM_DEP;
}
case 13: break;
case 1:
{ /* ignore */
}
case 14: break;
case 5:
{ return NUM;
}
case 15: break;
case 3:
{ return CJ;
}
case 16: break;
case 2:
{ return ALPHANUM;
}
case 17: break;
case 7:
{ return COMPANY;
}
case 18: break;
case 6:
{ return APOSTROPHE;
}
case 19: break;
case 10:
{ return EMAIL;
}
case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -0,0 +1,145 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
the tokenizer, only use Java 1.4 !!!
This grammar currently uses constructs (eg :digit:, :letter:) whose
meaning can vary according to the JRE used to run jflex. See
https://issues.apache.org/jira/browse/LUCENE-1126 for details.
For current backwards compatibility it is needed to support
only Java 1.4 - this will change in Lucene 3.1.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
%%
%class StandardTokenizerImpl
%unicode
%integer
%function getNextToken
%pack
%char
%{
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM = StandardTokenizer.ACRONYM;
public static final int COMPANY = StandardTokenizer.COMPANY;
public static final int EMAIL = StandardTokenizer.EMAIL;
public static final int HOST = StandardTokenizer.HOST;
public static final int NUM = StandardTokenizer.NUM;
public static final int CJ = StandardTokenizer.CJ;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
public final int yychar()
{
return yychar;
}
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
* Fills TermAttribute with the current token text.
*/
final void getText(TermAttribute t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
THAI = [\u0E00-\u0E59]
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {HAS_DIGIT} {P} {ALPHANUM}
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
ALPHA = ({LETTER})+
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
WHITESPACE = \r\n | [ \r\n\t\f]
%%
{ALPHANUM} { return ALPHANUM; }
{APOSTROPHE} { return APOSTROPHE; }
{ACRONYM} { return ACRONYM; }
{COMPANY} { return COMPANY; }
{EMAIL} { return EMAIL; }
{HOST} { return HOST; }
{NUM} { return NUM; }
{CJ} { return CJ; }
{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
. | {WHITESPACE} { /* ignore */ }

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
A fast grammar-based tokenizer constructed with JFlex.
</body>
</html>

View File

@ -0,0 +1,44 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
/**
* This attribute can be used to pass different flags down the {@link Tokenizer} chain,
* eg from one TokenFilter to another one.
*/
public interface FlagsAttribute extends Attribute {
/**
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
*
*
* @return The bits
*/
public int getFlags();
/**
* @see #getFlags()
*/
public void setFlags(int flags);
}
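
A sketch of the intended usage pattern: an upstream filter sets a bit that a later filter in the same chain can test. The MarkNumbersFilter class and its NUMBER_FLAG constant are hypothetical, invented for this example.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public final class MarkNumbersFilter extends TokenFilter {
  public static final int NUMBER_FLAG = 1; // hypothetical bit, chosen for this sketch

  private final TypeAttribute typeAtt;
  private final FlagsAttribute flagsAtt;

  public MarkNumbersFilter(TokenStream input) {
    super(input);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if ("<NUM>".equals(typeAtt.type())) {
      // Downstream filters can check (flagsAtt.getFlags() & NUMBER_FLAG) != 0.
      flagsAtt.setFlags(flagsAtt.getFlags() | NUMBER_FLAG);
    }
    return true;
  }
}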

View File

@ -0,0 +1,80 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.AttributeImpl;
/**
* This attribute can be used to pass different flags down the tokenizer chain,
* eg from one TokenFilter to another one.
*/
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
private int flags = 0;
/**
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
*
*
* @return The bits
*/
public int getFlags() {
return flags;
}
/**
* @see #getFlags()
*/
public void setFlags(int flags) {
this.flags = flags;
}
@Override
public void clear() {
flags = 0;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other instanceof FlagsAttributeImpl) {
return ((FlagsAttributeImpl) other).flags == flags;
}
return false;
}
@Override
public int hashCode() {
return flags;
}
@Override
public void copyTo(AttributeImpl target) {
FlagsAttribute t = (FlagsAttribute) target;
t.setFlags(flags);
}
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/**
* The start and end character offset of a Token.
*/
public interface OffsetAttribute extends Attribute {
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Note that the difference between endOffset() and startOffset() may not be
equal to termText.length(), as the term text may have been altered by a
stemmer or some other filter. */
public int startOffset();
/** Set the starting and ending offset.
@see #startOffset() and #endOffset()*/
public void setOffset(int startOffset, int endOffset);
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. The length
of the token in the source text is (endOffset - startOffset). */
public int endOffset();
}
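
A sketch of the typical use of offsets, e.g. for highlighting: mapping each term back to the slice of the original text it came from. Version.LUCENE_30 and the sample text are assumptions for the example.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class OffsetSketch {
  public static void main(String[] args) throws Exception {
    String text = "Please e-mail carol@example.com today.";
    TokenStream ts = new StandardTokenizer(Version.LUCENE_30, new StringReader(text));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // endOffset() - startOffset() is the token's length in the source text,
      // which can differ from termLength() once filters rewrite the term.
      String slice = text.substring(offset.startOffset(), offset.endOffset());
      System.out.println(term.term() + " <- \"" + slice + "\"");
    }
    ts.end();
    ts.close();
  }
}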

View File

@ -0,0 +1,90 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.AttributeImpl;
/**
* The start and end character offset of a Token.
*/
public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable {
private int startOffset;
private int endOffset;
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Note that the difference between endOffset() and startOffset() may not be
equal to termText.length(), as the term text may have been altered by a
stemmer or some other filter. */
public int startOffset() {
return startOffset;
}
/** Set the starting and ending offset.
@see #startOffset() and #endOffset()*/
public void setOffset(int startOffset, int endOffset) {
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. The length
of the token in the source text is (endOffset - startOffset). */
public int endOffset() {
return endOffset;
}
@Override
public void clear() {
startOffset = 0;
endOffset = 0;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof OffsetAttributeImpl) {
OffsetAttributeImpl o = (OffsetAttributeImpl) other;
return o.startOffset == startOffset && o.endOffset == endOffset;
}
return false;
}
@Override
public int hashCode() {
int code = startOffset;
code = code * 31 + endOffset;
return code;
}
@Override
public void copyTo(AttributeImpl target) {
OffsetAttribute t = (OffsetAttribute) target;
t.setOffset(startOffset, endOffset);
}
}

View File

@ -0,0 +1,36 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
/**
* The payload of a Token. See also {@link Payload}.
*/
public interface PayloadAttribute extends Attribute {
/**
* Returns this Token's payload.
*/
public Payload getPayload();
/**
* Sets this Token's payload.
*/
public void setPayload(Payload payload);
}
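
A sketch of a filter that attaches a payload to every token; the ConstantPayloadFilter class is hypothetical, and the Payload(byte[]) constructor of org.apache.lucene.index.Payload is assumed.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;

public final class ConstantPayloadFilter extends TokenFilter {
  private final PayloadAttribute payloadAtt;
  private final byte marker;

  public ConstantPayloadFilter(TokenStream input, byte marker) {
    super(input);
    payloadAtt = addAttribute(PayloadAttribute.class);
    this.marker = marker;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // The payload is stored with each posting at indexing time and can be
    // read back per position at search time.
    payloadAtt.setPayload(new Payload(new byte[] { marker }));
    return true;
  }
}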

View File

@ -0,0 +1,101 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.AttributeImpl;
/**
* The payload of a Token. See also {@link Payload}.
*/
public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable {
private Payload payload;
/**
* Initialize this attribute with no payload.
*/
public PayloadAttributeImpl() {}
/**
* Initialize this attribute with the given payload.
*/
public PayloadAttributeImpl(Payload payload) {
this.payload = payload;
}
/**
* Returns this Token's payload.
*/
public Payload getPayload() {
return this.payload;
}
/**
* Sets this Token's payload.
*/
public void setPayload(Payload payload) {
this.payload = payload;
}
@Override
public void clear() {
payload = null;
}
@Override
public Object clone() {
PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone();
if (payload != null) {
clone.payload = (Payload) payload.clone();
}
return clone;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PayloadAttribute) {
PayloadAttributeImpl o = (PayloadAttributeImpl) other;
if (o.payload == null || payload == null) {
return o.payload == null && payload == null;
}
return o.payload.equals(payload);
}
return false;
}
@Override
public int hashCode() {
return (payload == null) ? 0 : payload.hashCode();
}
@Override
public void copyTo(AttributeImpl target) {
PayloadAttribute t = (PayloadAttribute) target;
t.setPayload((payload == null) ? null : (Payload) payload.clone());
}
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/** The positionIncrement determines the position of this token
* relative to the previous Token in a TokenStream, used in phrase
* searching.
*
* <p>The default value is one.
*
* <p>Some common uses for this are:<ul>
*
* <li>Set it to zero to put multiple terms in the same position. This is
* useful if, e.g., a word has multiple stems. Searches for phrases
* including either stem will match. In this case, all but the first stem's
* increment should be set to zero: the increment of the first instance
* should be one. Repeating a token with an increment of zero can also be
* used to boost the scores of matches on that token.
*
* <li>Set it to values greater than one to inhibit exact phrase matches.
* If, for example, one does not want phrases to match across removed stop
* words, then one could build a stop word filter that removes stop words and
* also sets the increment to the number of stop words removed before each
* non-stop word. Then exact phrase queries will only match when the terms
* occur with no intervening stop words.
*
* </ul>
*
* @see org.apache.lucene.index.TermPositions
*/
public interface PositionIncrementAttribute extends Attribute {
/** Set the position increment. The default value is one.
*
* @param positionIncrement the distance from the prior term
*/
public void setPositionIncrement(int positionIncrement);
/** Returns the position increment of this Token.
* @see #setPositionIncrement
*/
public int getPositionIncrement();
}
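
A sketch of the zero-increment idiom described above: a hypothetical filter that injects the synonym "quick" at the same position as "fast", using captureState()/restoreState() so the injected token keeps the original offsets and type.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public final class QuickSynonymFilter extends TokenFilter {
  private final TermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;
  private AttributeSource.State pending; // captured attributes of the token to duplicate

  public QuickSynonymFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending); // replay offsets/type of "fast"
      pending = null;
      termAtt.setTermBuffer("quick");
      posIncrAtt.setPositionIncrement(0); // same position, so phrase queries match either word
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    if ("fast".equals(termAtt.term())) {
      pending = captureState(); // emit the synonym on the next call
    }
    return true;
  }
}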

View File

@ -0,0 +1,99 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;
/** The positionIncrement determines the position of this token
* relative to the previous Token in a {@link TokenStream}, used in phrase
* searching.
*
* <p>The default value is one.
*
* <p>Some common uses for this are:<ul>
*
* <li>Set it to zero to put multiple terms in the same position. This is
* useful if, e.g., a word has multiple stems. Searches for phrases
* including either stem will match. In this case, all but the first stem's
* increment should be set to zero: the increment of the first instance
* should be one. Repeating a token with an increment of zero can also be
* used to boost the scores of matches on that token.
*
* <li>Set it to values greater than one to inhibit exact phrase matches.
* If, for example, one does not want phrases to match across removed stop
* words, then one could build a stop word filter that removes stop words and
* also sets the increment to the number of stop words removed before each
* non-stop word. Then exact phrase queries will only match when the terms
* occur with no intervening stop words.
*
* </ul>
*/
public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable {
private int positionIncrement = 1;
/** Set the position increment. The default value is one.
*
* @param positionIncrement the distance from the prior term
*/
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
throw new IllegalArgumentException
("Increment must be zero or greater: " + positionIncrement);
this.positionIncrement = positionIncrement;
}
/** Returns the position increment of this Token.
* @see #setPositionIncrement
*/
public int getPositionIncrement() {
return positionIncrement;
}
@Override
public void clear() {
this.positionIncrement = 1;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PositionIncrementAttributeImpl) {
return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
}
return false;
}
@Override
public int hashCode() {
return positionIncrement;
}
@Override
public void copyTo(AttributeImpl target) {
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
t.setPositionIncrement(positionIncrement);
}
}

View File

@ -0,0 +1,91 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/**
* The term text of a Token.
*/
public interface TermAttribute extends Attribute {
/** Returns the Token's term text.
*
* This method has a performance penalty
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
* String, use this method, which is nothing more than
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
*/
public String term();
/** Copies the contents of buffer, starting at offset for
* length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public void setTermBuffer(char[] buffer, int offset, int length);
/** Copies the contents of buffer into the termBuffer array.
* @param buffer the buffer to copy
*/
public void setTermBuffer(String buffer);
/** Copies the contents of buffer, starting at offset and continuing
* for length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public void setTermBuffer(String buffer, int offset, int length);
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
* #resizeTermBuffer(int)} to increase it. After
* altering the buffer be sure to call {@link
* #setTermLength} to record the number of valid
* characters that were placed into the termBuffer. */
public char[] termBuffer();
/** Grows the termBuffer to at least size newSize, preserving the
* existing content. Note: If the next operation is to change
* the contents of the term buffer use
* {@link #setTermBuffer(char[], int, int)},
* {@link #setTermBuffer(String)}, or
* {@link #setTermBuffer(String, int, int)}
* to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize);
/** Return number of valid characters (length of the term)
* in the termBuffer array. */
public int termLength();
/** Set number of valid characters (length of the term) in
* the termBuffer array. Use this to truncate the termBuffer
* or to synchronize with external manipulation of the termBuffer.
* Note: to grow the size of the array,
* use {@link #resizeTermBuffer(int)} first.
* @param length the truncated length
*/
public void setTermLength(int length);
}
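
A sketch of the in-place editing pattern this interface is designed for (the filter name is hypothetical): work directly on termBuffer() and only call setTermLength() if the length changes.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class AsciiUpperCaseFilter extends TokenFilter {
  private final TermAttribute termAtt;

  public AsciiUpperCaseFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Edit the shared buffer in place; no new String or char[] is allocated.
    final char[] buffer = termAtt.termBuffer();
    final int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      final char c = buffer[i];
      if (c >= 'a' && c <= 'z') {
        buffer[i] = (char) (c - ('a' - 'A'));
      }
    }
    return true;
  }
}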

View File

@ -0,0 +1,226 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
/**
* The term text of a Token.
*/
public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable {
private static int MIN_BUFFER_SIZE = 10;
private char[] termBuffer;
private int termLength;
/** Returns the Token's term text.
*
* This method has a performance penalty
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
* String, use this method, which is nothing more than
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
*/
public String term() {
initTermBuffer();
return new String(termBuffer, 0, termLength);
}
/** Copies the contents of buffer, starting at offset for
* length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public void setTermBuffer(char[] buffer, int offset, int length) {
growTermBuffer(length);
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
/** Copies the contents of buffer into the termBuffer array.
* @param buffer the buffer to copy
*/
public void setTermBuffer(String buffer) {
int length = buffer.length();
growTermBuffer(length);
buffer.getChars(0, length, termBuffer, 0);
termLength = length;
}
/** Copies the contents of buffer, starting at offset and continuing
* for length characters, into the termBuffer array.
* @param buffer the buffer to copy
* @param offset the index in the buffer of the first character to copy
* @param length the number of characters to copy
*/
public void setTermBuffer(String buffer, int offset, int length) {
assert offset <= buffer.length();
assert offset + length <= buffer.length();
growTermBuffer(length);
buffer.getChars(offset, offset + length, termBuffer, 0);
termLength = length;
}
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
* #resizeTermBuffer(int)} to increase it. After
* altering the buffer be sure to call {@link
* #setTermLength} to record the number of valid
* characters that were placed into the termBuffer. */
public char[] termBuffer() {
initTermBuffer();
return termBuffer;
}
/** Grows the termBuffer to at least size newSize, preserving the
* existing content. Note: If the next operation is to change
* the contents of the term buffer use
* {@link #setTermBuffer(char[], int, int)},
* {@link #setTermBuffer(String)}, or
* {@link #setTermBuffer(String, int, int)}
* to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return the termBuffer with length &gt;= newSize (a new array is only allocated when the current one is too small)
*/
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
}
return termBuffer;
}
/** Allocates a buffer char[] of at least newSize, without preserving the existing content.
* It is always used in places that set the content immediately afterwards.
* @param newSize minimum size of the buffer
*/
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
termLength = 0;
}
}
/** Return number of valid characters (length of the term)
* in the termBuffer array. */
public int termLength() {
return termLength;
}
/** Set number of valid characters (length of the term) in
* the termBuffer array. Use this to truncate the termBuffer
* or to synchronize with external manipulation of the termBuffer.
* Note: to grow the size of the array,
* use {@link #resizeTermBuffer(int)} first.
* @param length the truncated length
*/
public void setTermLength(int length) {
initTermBuffer();
if (length > termBuffer.length)
throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
termLength = length;
}
@Override
public int hashCode() {
initTermBuffer();
int code = termLength;
code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
return code;
}
@Override
public void clear() {
termLength = 0;
}
@Override
public Object clone() {
TermAttributeImpl t = (TermAttributeImpl)super.clone();
// Do a deep clone
if (termBuffer != null) {
t.termBuffer = (char[]) termBuffer.clone();
}
return t;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof TermAttribute) {
initTermBuffer();
TermAttributeImpl o = ((TermAttributeImpl) other);
o.initTermBuffer();
if (termLength != o.termLength)
return false;
for(int i=0;i<termLength;i++) {
if (termBuffer[i] != o.termBuffer[i]) {
return false;
}
}
return true;
}
return false;
}
@Override
public String toString() {
initTermBuffer();
return "term=" + new String(termBuffer, 0, termLength);
}
@Override
public void copyTo(AttributeImpl target) {
initTermBuffer();
TermAttribute t = (TermAttribute) target;
t.setTermBuffer(termBuffer, 0, termLength);
}
}
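
A hedged sketch (not part of this commit) of the in-place update pattern described in the termBuffer()/setTermLength() javadocs above: fetch the buffer, modify it, then record the number of valid characters. The filter name "UpperCaseFilter" is hypothetical.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class UpperCaseFilter extends TokenFilter {
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);

  public UpperCaseFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    final char[] buffer = termAtt.termBuffer();  // direct access, no String copy
    final int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);
    }
    termAtt.setTermLength(length);  // length unchanged here; required if the term were truncated
    return true;
  }
}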

View File

@ -0,0 +1,32 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/**
* A Token's lexical type. The default value is "word".
*/
public interface TypeAttribute extends Attribute {
/** Returns this Token's lexical type. Defaults to "word". */
public String type();
/** Set the lexical type.
@see #type() */
public void setType(String type);
}

View File

@ -0,0 +1,78 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.AttributeImpl;
/**
* A Token's lexical type. The default value is "word".
*/
public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable {
private String type;
public static final String DEFAULT_TYPE = "word";
public TypeAttributeImpl() {
this(DEFAULT_TYPE);
}
public TypeAttributeImpl(String type) {
this.type = type;
}
/** Returns this Token's lexical type. Defaults to "word". */
public String type() {
return type;
}
/** Set the lexical type.
@see #type() */
public void setType(String type) {
this.type = type;
}
@Override
public void clear() {
type = DEFAULT_TYPE;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof TypeAttributeImpl) {
return type.equals(((TypeAttributeImpl) other).type);
}
return false;
}
@Override
public int hashCode() {
return type.hashCode();
}
@Override
public void copyTo(AttributeImpl target) {
TypeAttribute t = (TypeAttribute) target;
t.setType(type);
}
}
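
As a rough illustration (not part of this commit), a filter keyed on the token type could look like the following; "TypeKeepFilter" is a hypothetical name, and the type strings to match depend on the tokenizer in use (StandardTokenizer emits values such as "<ALPHANUM>" and "<NUM>").

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public final class TypeKeepFilter extends TokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final String keepType;

  public TypeKeepFilter(TokenStream input, String keepType) {
    super(input);
    this.keepType = keepType;
  }

  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      if (keepType.equals(typeAtt.type())) {
        return true;   // pass tokens of the requested type through
      }
      // otherwise skip this token and examine the next one
    }
    return false;      // input exhausted
  }
}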

View File

@ -0,0 +1,294 @@
package org.apache.lucene.document;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.PhraseQuery; // for javadocs
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.StringHelper; // for javadocs
/**
* Base class for {@link Fieldable} implementations, holding the name, flags,
* boost and value shared by the concrete field classes.
**/
public abstract class AbstractField implements Fieldable {
protected String name = "body";
protected boolean storeTermVector = false;
protected boolean storeOffsetWithTermVector = false;
protected boolean storePositionWithTermVector = false;
protected boolean omitNorms = false;
protected boolean isStored = false;
protected boolean isIndexed = true;
protected boolean isTokenized = true;
protected boolean isBinary = false;
protected boolean lazy = false;
protected boolean omitTermFreqAndPositions = false;
protected float boost = 1.0f;
// the data object for all different kind of field values
protected Object fieldsData = null;
// pre-analyzed tokenStream for indexed fields
protected TokenStream tokenStream;
// length/offset for all primitive types
protected int binaryLength;
protected int binaryOffset;
protected AbstractField()
{
}
protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
this.name = StringHelper.intern(name); // field names are interned
this.isStored = store.isStored();
this.isIndexed = index.isIndexed();
this.isTokenized = index.isAnalyzed();
this.omitNorms = index.omitNorms();
this.isBinary = false;
setStoreTermVector(termVector);
}
/** Sets the boost factor for hits on this field. This value will be
* multiplied into the score of all hits on this field of this
* document.
*
* <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
* containing this field. If a document has multiple fields with the same
* name, all such values are multiplied together. This product is then
* used to compute the norm factor for the field. By
* default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(String,
* FieldInvertState)} method, the boost value is multiplied
* by the {@link
* org.apache.lucene.search.Similarity#lengthNorm(String,
* int)} and then
* rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(String, org.apache.lucene.index.FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNorm(float)
*/
public void setBoost(float boost) {
this.boost = boost;
}
/** Returns the boost factor for hits for this field.
*
* <p>The default value is 1.0.
*
* <p>Note: this value is not stored directly with the document in the index.
* Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
* {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
* this field was indexed.
*
* @see #setBoost(float)
*/
public float getBoost() {
return boost;
}
/** Returns the name of the field as an interned string.
* For example "date", "title", "body", ...
*/
public String name() { return name; }
protected void setStoreTermVector(Field.TermVector termVector) {
this.storeTermVector = termVector.isStored();
this.storePositionWithTermVector = termVector.withPositions();
this.storeOffsetWithTermVector = termVector.withOffsets();
}
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is
Reader-valued. */
public final boolean isStored() { return isStored; }
/** True iff the value of the field is to be indexed, so that it may be
searched on. */
public final boolean isIndexed() { return isIndexed; }
/** True iff the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
public final boolean isTokenized() { return isTokenized; }
/** True iff the term or terms used to index this field are stored as a term
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
* preserved, use the <code>stored</code> attribute instead.
*
* @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String)
*/
public final boolean isTermVectorStored() { return storeTermVector; }
/**
* True iff terms are stored as term vector together with their offsets
* (start and end position in source text).
*/
public boolean isStoreOffsetWithTermVector(){
return storeOffsetWithTermVector;
}
/**
* True iff terms are stored as term vector together with their token positions.
*/
public boolean isStorePositionWithTermVector(){
return storePositionWithTermVector;
}
/** True iff the value of the field is stored as binary */
public final boolean isBinary() {
return isBinary;
}
/**
* Return the raw byte[] for the binary field. Note that
* you must also call {@link #getBinaryLength} and {@link
* #getBinaryOffset} to know which range of bytes in this
* returned array belong to the field.
* @return reference to the Field value as byte[].
*/
public byte[] getBinaryValue() {
return getBinaryValue(null);
}
public byte[] getBinaryValue(byte[] result){
if (isBinary || fieldsData instanceof byte[])
return (byte[]) fieldsData;
else
return null;
}
/**
* Returns the length of the byte[] segment that is used as the value. If the Field is not binary,
* the returned value is undefined.
* @return length of byte[] segment that represents this Field value
*/
public int getBinaryLength() {
if (isBinary) {
return binaryLength;
} else if (fieldsData instanceof byte[])
return ((byte[]) fieldsData).length;
else
return 0;
}
/**
* Returns the offset into the byte[] segment that is used as the value. If the Field is not binary,
* the returned value is undefined.
* @return index of the first character in byte[] segment that represents this Field value
*/
public int getBinaryOffset() {
return binaryOffset;
}
/** True if norms are omitted for this indexed field */
public boolean getOmitNorms() { return omitNorms; }
/** @see #setOmitTermFreqAndPositions */
public boolean getOmitTermFreqAndPositions() { return omitTermFreqAndPositions; }
/** Expert:
*
* If set, omit normalization factors associated with this indexed field.
* This effectively disables indexing boosts and length normalization for this field.
*/
public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; }
/** Expert:
*
* If set, omit term freq, positions and payloads from
* postings for this field.
*
* <p><b>NOTE</b>: While this option reduces storage space
* required in the index, it also means any query
* requiring positional information, such as {@link
* PhraseQuery} or {@link SpanQuery} subclasses will
* silently fail to find results.
*/
public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; }
public boolean isLazy() {
return lazy;
}
/** Prints a Field for human consumption. */
@Override
public final String toString() {
StringBuilder result = new StringBuilder();
if (isStored) {
result.append("stored");
}
if (isIndexed) {
if (result.length() > 0)
result.append(",");
result.append("indexed");
}
if (isTokenized) {
if (result.length() > 0)
result.append(",");
result.append("tokenized");
}
if (storeTermVector) {
if (result.length() > 0)
result.append(",");
result.append("termVector");
}
if (storeOffsetWithTermVector) {
if (result.length() > 0)
result.append(",");
result.append("termVectorOffsets");
}
if (storePositionWithTermVector) {
if (result.length() > 0)
result.append(",");
result.append("termVectorPosition");
}
if (isBinary) {
if (result.length() > 0)
result.append(",");
result.append("binary");
}
if (omitNorms) {
result.append(",omitNorms");
}
if (omitTermFreqAndPositions) {
result.append(",omitTermFreqAndPositions");
}
if (lazy){
result.append(",lazy");
}
result.append('<');
result.append(name);
result.append(':');
if (fieldsData != null && lazy == false) {
result.append(fieldsData);
}
result.append('>');
return result.toString();
}
}
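
A small sketch (not part of this commit) of the boost arithmetic described in setBoost() above, using the Document and Field classes added elsewhere in this commit; the field name and text are placeholders.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class BoostExample {
  public static void main(String[] args) {
    Field title = new Field("title", "Lucene in Action",
                            Field.Store.YES, Field.Index.ANALYZED);
    title.setBoost(2.0f);   // field-level boost

    Document doc = new Document();
    doc.add(title);
    doc.setBoost(1.5f);     // document-level boost, multiplied into every field's boost

    // Product folded into the "title" norm at indexing time: 2.0f * 1.5f = 3.0f
    System.out.println(title.getBoost() * doc.getBoost());
  }
}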

View File

@ -0,0 +1,124 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.DataFormatException;
import java.io.ByteArrayOutputStream;
import org.apache.lucene.util.UnicodeUtil;
/** Simple utility class providing static methods to
* compress and decompress binary data for stored fields.
* This class uses java.util.zip.Deflater and Inflater
* classes to compress and decompress.
*/
public class CompressionTools {
// Export only static methods
private CompressionTools() {}
/** Compresses the specified byte range using the
* specified compressionLevel (constants are defined in
* java.util.zip.Deflater). */
public static byte[] compress(byte[] value, int offset, int length, int compressionLevel) {
/* Create an expandable byte array to hold the compressed data.
* You cannot use an array that's the same size as the original because
* there is no guarantee that the compressed data will be smaller than
* the uncompressed data. */
ByteArrayOutputStream bos = new ByteArrayOutputStream(length);
Deflater compressor = new Deflater();
try {
compressor.setLevel(compressionLevel);
compressor.setInput(value, offset, length);
compressor.finish();
// Compress the data
final byte[] buf = new byte[1024];
while (!compressor.finished()) {
int count = compressor.deflate(buf);
bos.write(buf, 0, count);
}
} finally {
compressor.end();
}
return bos.toByteArray();
}
/** Compresses the specified byte range, with default BEST_COMPRESSION level */
public static byte[] compress(byte[] value, int offset, int length) {
return compress(value, offset, length, Deflater.BEST_COMPRESSION);
}
/** Compresses all bytes in the array, with default BEST_COMPRESSION level */
public static byte[] compress(byte[] value) {
return compress(value, 0, value.length, Deflater.BEST_COMPRESSION);
}
/** Compresses the String value, with default BEST_COMPRESSION level */
public static byte[] compressString(String value) {
return compressString(value, Deflater.BEST_COMPRESSION);
}
/** Compresses the String value using the specified
* compressionLevel (constants are defined in
* java.util.zip.Deflater). */
public static byte[] compressString(String value, int compressionLevel) {
UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result();
UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
return compress(result.result, 0, result.length, compressionLevel);
}
/** Decompress the byte array previously returned by
* compress */
public static byte[] decompress(byte[] value) throws DataFormatException {
// Create an expandable byte array to hold the decompressed data
ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);
Inflater decompressor = new Inflater();
try {
decompressor.setInput(value);
// Decompress the data
final byte[] buf = new byte[1024];
while (!decompressor.finished()) {
int count = decompressor.inflate(buf);
bos.write(buf, 0, count);
}
} finally {
decompressor.end();
}
return bos.toByteArray();
}
/** Decompress the byte array previously returned by
* compressString back into a String */
public static String decompressString(byte[] value) throws DataFormatException {
UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
final byte[] bytes = decompress(value);
UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
return new String(result.result, 0, result.length);
}
}
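
A minimal round-trip sketch (not part of this commit); in practice the compressed bytes would be stored in a binary stored field and decompressed after the document is retrieved.

import java.util.zip.DataFormatException;
import org.apache.lucene.document.CompressionTools;

public class CompressionRoundTrip {
  public static void main(String[] args) throws DataFormatException {
    String original = "some long stored text, repeated text, repeated text";
    byte[] compressed = CompressionTools.compressString(original);
    String restored = CompressionTools.decompressString(compressed);
    System.out.println(original.equals(restored) + " (" + compressed.length + " bytes)");
  }
}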

View File

@ -0,0 +1,122 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.util.NumericUtils; // for javadocs
import java.util.Date; // for javadoc
import java.util.Calendar; // for javadoc
// do not remove in 3.0, needed for reading old indexes!
/**
* Provides support for converting dates to strings and vice-versa.
* The strings are structured so that lexicographic sorting orders by date,
* which makes them suitable for use as field values and search terms.
*
* <P>Note that this class saves dates with millisecond granularity,
* which is bad for {@link TermRangeQuery} and {@link PrefixQuery}, as those
* queries are expanded to a BooleanQuery with a potentially large number
* of terms when searching. Thus you might want to use
* {@link DateTools} instead.
*
* <P>
* Note: dates before 1970 cannot be used, and therefore cannot be
* indexed when using this class. See {@link DateTools} for an
* alternative without such a limitation.
*
* <P>
* Another approach is {@link NumericUtils}, which provides
* a sortable binary representation (prefix encoded) of numeric values, which
* date/time are.
* For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as
* <code>long</code> using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and
* index this as a numeric value with {@link NumericField}
* and use {@link NumericRangeQuery} to query it.
*
* @deprecated If you build a new index, use {@link DateTools} or
* {@link NumericField} instead.
* This class is included for use with existing
* indices and will be removed in a future release (possibly Lucene 4.0).
*/
public class DateField {
private DateField() {}
// make date strings long enough to last a millennium
private static int DATE_LEN = Long.toString(1000L*365*24*60*60*1000,
Character.MAX_RADIX).length();
public static String MIN_DATE_STRING() {
return timeToString(0);
}
public static String MAX_DATE_STRING() {
char[] buffer = new char[DATE_LEN];
char c = Character.forDigit(Character.MAX_RADIX-1, Character.MAX_RADIX);
for (int i = 0 ; i < DATE_LEN; i++)
buffer[i] = c;
return new String(buffer);
}
/**
* Converts a Date to a string suitable for indexing.
* @throws RuntimeException if the date specified in the
* method argument is before 1970
*/
public static String dateToString(Date date) {
return timeToString(date.getTime());
}
/**
* Converts a millisecond time to a string suitable for indexing.
* @throws RuntimeException if the time specified in the
* method argument is negative, that is, before 1970
*/
public static String timeToString(long time) {
if (time < 0)
throw new RuntimeException("time '" + time + "' is too early, must be >= 0");
String s = Long.toString(time, Character.MAX_RADIX);
if (s.length() > DATE_LEN)
throw new RuntimeException("time '" + time + "' is too late, length of string " +
"representation must be <= " + DATE_LEN);
// Pad with leading zeros
if (s.length() < DATE_LEN) {
StringBuilder sb = new StringBuilder(s);
while (sb.length() < DATE_LEN)
sb.insert(0, 0);
s = sb.toString();
}
return s;
}
/** Converts a string-encoded date into a millisecond time. */
public static long stringToTime(String s) {
return Long.parseLong(s, Character.MAX_RADIX);
}
/** Converts a string-encoded date into a Date object. */
public static Date stringToDate(String s) {
return new Date(stringToTime(s));
}
}
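
A small sketch (not part of this commit) of the deprecated encoding: times become fixed-width base-36 strings (Character.MAX_RADIX is 36), so lexicographic order of the encoded values matches chronological order.

import org.apache.lucene.document.DateField;

public class DateFieldExample {
  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    String encoded = DateField.timeToString(now);   // e.g. a 9-character base-36 string
    // Earlier times sort before later ones:
    System.out.println(DateField.MIN_DATE_STRING().compareTo(encoded) < 0);
    // The encoding round-trips:
    System.out.println(DateField.stringToTime(encoded) == now);
  }
}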

View File

@ -0,0 +1,256 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.TimeZone;
import java.util.Locale;
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.util.NumericUtils; // for javadocs
/**
* Provides support for converting dates to strings and vice-versa.
* The strings are structured so that lexicographic sorting orders
* them by date, which makes them suitable for use as field values
* and search terms.
*
* <P>This class also helps you to limit the resolution of your dates. Do not
* save dates with a finer resolution than you really need, as then
* RangeQuery and PrefixQuery will require more memory and become slower.
*
* <P>Compared to {@link DateField} the strings generated by the methods
* in this class take slightly more space, unless your selected resolution
* is set to <code>Resolution.DAY</code> or lower.
*
* <P>
* Another approach is {@link NumericUtils}, which provides
* a sortable binary representation (prefix encoded) of numeric values, which
* date/time are.
* For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as
* <code>long</code> using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and
* index this as a numeric value with {@link NumericField}
* and use {@link NumericRangeQuery} to query it.
*/
public class DateTools {
private final static TimeZone GMT = TimeZone.getTimeZone("GMT");
private static final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat("yyyy", Locale.US);
private static final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("yyyyMM", Locale.US);
private static final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("yyyyMMdd", Locale.US);
private static final SimpleDateFormat HOUR_FORMAT = new SimpleDateFormat("yyyyMMddHH", Locale.US);
private static final SimpleDateFormat MINUTE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm", Locale.US);
private static final SimpleDateFormat SECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
private static final SimpleDateFormat MILLISECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US);
static {
// times need to be normalized so the value doesn't depend on the
// location the index is created/used:
YEAR_FORMAT.setTimeZone(GMT);
MONTH_FORMAT.setTimeZone(GMT);
DAY_FORMAT.setTimeZone(GMT);
HOUR_FORMAT.setTimeZone(GMT);
MINUTE_FORMAT.setTimeZone(GMT);
SECOND_FORMAT.setTimeZone(GMT);
MILLISECOND_FORMAT.setTimeZone(GMT);
}
private static final Calendar calInstance = Calendar.getInstance(GMT);
// cannot create, the class has static methods only
private DateTools() {}
/**
* Converts a Date to a string suitable for indexing.
*
* @param date the date to be converted
* @param resolution the desired resolution, see
* {@link #round(Date, DateTools.Resolution)}
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
* depending on <code>resolution</code>; using GMT as timezone
*/
public static synchronized String dateToString(Date date, Resolution resolution) {
return timeToString(date.getTime(), resolution);
}
/**
* Converts a millisecond time to a string suitable for indexing.
*
* @param time the date expressed as milliseconds since January 1, 1970, 00:00:00 GMT
* @param resolution the desired resolution, see
* {@link #round(long, DateTools.Resolution)}
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
* depending on <code>resolution</code>; using GMT as timezone
*/
public static synchronized String timeToString(long time, Resolution resolution) {
calInstance.setTimeInMillis(round(time, resolution));
Date date = calInstance.getTime();
if (resolution == Resolution.YEAR) {
return YEAR_FORMAT.format(date);
} else if (resolution == Resolution.MONTH) {
return MONTH_FORMAT.format(date);
} else if (resolution == Resolution.DAY) {
return DAY_FORMAT.format(date);
} else if (resolution == Resolution.HOUR) {
return HOUR_FORMAT.format(date);
} else if (resolution == Resolution.MINUTE) {
return MINUTE_FORMAT.format(date);
} else if (resolution == Resolution.SECOND) {
return SECOND_FORMAT.format(date);
} else if (resolution == Resolution.MILLISECOND) {
return MILLISECOND_FORMAT.format(date);
}
throw new IllegalArgumentException("unknown resolution " + resolution);
}
/**
* Converts a string produced by <code>timeToString</code> or
* <code>dateToString</code> back to a time, represented as the
* number of milliseconds since January 1, 1970, 00:00:00 GMT.
*
* @param dateString the date string to be converted
* @return the number of milliseconds since January 1, 1970, 00:00:00 GMT
* @throws ParseException if <code>dateString</code> is not in the
* expected format
*/
public static synchronized long stringToTime(String dateString) throws ParseException {
return stringToDate(dateString).getTime();
}
/**
* Converts a string produced by <code>timeToString</code> or
* <code>dateToString</code> back to a time, represented as a
* Date object.
*
* @param dateString the date string to be converted
* @return the parsed time as a Date object
* @throws ParseException if <code>dateString</code> is not in the
* expected format
*/
public static synchronized Date stringToDate(String dateString) throws ParseException {
if (dateString.length() == 4) {
return YEAR_FORMAT.parse(dateString);
} else if (dateString.length() == 6) {
return MONTH_FORMAT.parse(dateString);
} else if (dateString.length() == 8) {
return DAY_FORMAT.parse(dateString);
} else if (dateString.length() == 10) {
return HOUR_FORMAT.parse(dateString);
} else if (dateString.length() == 12) {
return MINUTE_FORMAT.parse(dateString);
} else if (dateString.length() == 14) {
return SECOND_FORMAT.parse(dateString);
} else if (dateString.length() == 17) {
return MILLISECOND_FORMAT.parse(dateString);
}
throw new ParseException("Input is not valid date string: " + dateString, 0);
}
/**
* Limit a date's resolution. For example, the date <code>2004-09-21 13:50:11</code>
* will be changed to <code>2004-09-01 00:00:00</code> when using
* <code>Resolution.MONTH</code>.
*
* @param resolution The desired resolution of the date to be returned
* @return the date with all values more precise than <code>resolution</code>
* set to 0 or 1
*/
public static synchronized Date round(Date date, Resolution resolution) {
return new Date(round(date.getTime(), resolution));
}
/**
* Limit a date's resolution. For example, the date <code>1095767411000</code>
* (which represents 2004-09-21 13:50:11) will be changed to
* <code>1093989600000</code> (2004-09-01 00:00:00) when using
* <code>Resolution.MONTH</code>.
*
* @param resolution The desired resolution of the date to be returned
* @return the date with all values more precise than <code>resolution</code>
* set to 0 or 1, expressed as milliseconds since January 1, 1970, 00:00:00 GMT
*/
public static synchronized long round(long time, Resolution resolution) {
calInstance.setTimeInMillis(time);
if (resolution == Resolution.YEAR) {
calInstance.set(Calendar.MONTH, 0);
calInstance.set(Calendar.DAY_OF_MONTH, 1);
calInstance.set(Calendar.HOUR_OF_DAY, 0);
calInstance.set(Calendar.MINUTE, 0);
calInstance.set(Calendar.SECOND, 0);
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.MONTH) {
calInstance.set(Calendar.DAY_OF_MONTH, 1);
calInstance.set(Calendar.HOUR_OF_DAY, 0);
calInstance.set(Calendar.MINUTE, 0);
calInstance.set(Calendar.SECOND, 0);
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.DAY) {
calInstance.set(Calendar.HOUR_OF_DAY, 0);
calInstance.set(Calendar.MINUTE, 0);
calInstance.set(Calendar.SECOND, 0);
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.HOUR) {
calInstance.set(Calendar.MINUTE, 0);
calInstance.set(Calendar.SECOND, 0);
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.MINUTE) {
calInstance.set(Calendar.SECOND, 0);
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.SECOND) {
calInstance.set(Calendar.MILLISECOND, 0);
} else if (resolution == Resolution.MILLISECOND) {
// don't cut off anything
} else {
throw new IllegalArgumentException("unknown resolution " + resolution);
}
return calInstance.getTimeInMillis();
}
/** Specifies the time granularity. */
public static class Resolution {
public static final Resolution YEAR = new Resolution("year");
public static final Resolution MONTH = new Resolution("month");
public static final Resolution DAY = new Resolution("day");
public static final Resolution HOUR = new Resolution("hour");
public static final Resolution MINUTE = new Resolution("minute");
public static final Resolution SECOND = new Resolution("second");
public static final Resolution MILLISECOND = new Resolution("millisecond");
private String resolution;
private Resolution() {
}
private Resolution(String resolution) {
this.resolution = resolution;
}
@Override
public String toString() {
return resolution;
}
}
}
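
A small sketch (not part of this commit) showing DateTools with a limited resolution, as the class javadoc above recommends; the choice of Resolution.DAY is just an example.

import java.util.Date;
import org.apache.lucene.document.DateTools;

public class DateToolsExample {
  public static void main(String[] args) throws Exception {
    Date now = new Date();
    // Day resolution yields a GMT-based "yyyyMMdd" string, e.g. "20100317".
    String indexed = DateTools.dateToString(now, DateTools.Resolution.DAY);
    // Parsing it back gives midnight GMT of that day.
    Date roundTripped = DateTools.stringToDate(indexed);
    System.out.println(indexed + " -> " + roundTripped);
  }
}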

View File

@ -0,0 +1,305 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.*; // for javadoc
import org.apache.lucene.search.ScoreDoc; // for javadoc
import org.apache.lucene.search.Searcher; // for javadoc
import org.apache.lucene.index.IndexReader; // for javadoc
/** Documents are the unit of indexing and search.
*
* A Document is a set of fields. Each field has a name and a textual value.
* A field may be {@link Fieldable#isStored() stored} with the document, in which
* case it is returned with search hits on the document. Thus each document
* should typically contain one or more stored fields which uniquely identify
* it.
*
* <p>Note that fields which are <i>not</i> {@link Fieldable#isStored() stored} are
* <i>not</i> available in documents retrieved from the index, e.g. with {@link
* ScoreDoc#doc}, {@link Searcher#doc(int)} or {@link
* IndexReader#document(int)}.
*/
public final class Document implements java.io.Serializable {
List<Fieldable> fields = new ArrayList<Fieldable>();
private float boost = 1.0f;
/** Constructs a new document with no fields. */
public Document() {}
/** Sets a boost factor for hits on any field of this document. This value
* will be multiplied into the score of all hits on this document.
*
* <p>The default value is 1.0.
*
* <p>Values are multiplied into the value of {@link Fieldable#getBoost()} of
* each field in this document. Thus, this method in effect sets a default
* boost for the fields of this document.
*
* @see Fieldable#setBoost(float)
*/
public void setBoost(float boost) {
this.boost = boost;
}
/** Returns, at indexing time, the boost factor as set by {@link #setBoost(float)}.
*
* <p>Note that once a document is indexed this value is no longer available
* from the index. At search time, for retrieved documents, this method always
* returns 1. This however does not mean that the boost value set at indexing
* time was ignored - it was just combined with other indexing time factors and
* stored elsewhere, for better indexing and search performance. (For more
* information see the "norm(t,d)" part of the scoring formula in
* {@link org.apache.lucene.search.Similarity Similarity}.)
*
* @see #setBoost(float)
*/
public float getBoost() {
return boost;
}
/**
* <p>Adds a field to a document. Several fields may be added with
* the same name. In this case, if the fields are indexed, their text is
* treated as though appended for the purposes of search.</p>
* <p> Note that add, like the removeField(s) methods, only makes sense
* prior to adding a document to an index. These methods cannot
* be used to change the content of an existing index! In order to achieve this,
* a document has to be deleted from an index and a new changed version of that
* document has to be added.</p>
*/
public final void add(Fieldable field) {
fields.add(field);
}
/**
* <p>Removes field with the specified name from the document.
* If multiple fields exist with this name, this method removes the first field that has been added.
* If there is no field with the specified name, the document remains unchanged.</p>
* <p> Note that the removeField(s) methods, like the add method, only make sense
* prior to adding a document to an index. These methods cannot
* be used to change the content of an existing index! In order to achieve this,
* a document has to be deleted from an index and a new changed version of that
* document has to be added.</p>
*/
public final void removeField(String name) {
Iterator<Fieldable> it = fields.iterator();
while (it.hasNext()) {
Fieldable field = it.next();
if (field.name().equals(name)) {
it.remove();
return;
}
}
}
/**
* <p>Removes all fields with the given name from the document.
* If there is no field with the specified name, the document remains unchanged.</p>
* <p> Note that the removeField(s) methods, like the add method, only make sense
* prior to adding a document to an index. These methods cannot
* be used to change the content of an existing index! In order to achieve this,
* a document has to be deleted from an index and a new changed version of that
* document has to be added.</p>
*/
public final void removeFields(String name) {
Iterator<Fieldable> it = fields.iterator();
while (it.hasNext()) {
Fieldable field = it.next();
if (field.name().equals(name)) {
it.remove();
}
}
}
/** Returns a field with the given name if any exist in this document, or
* null. If multiple fields exist with this name, this method returns the
* first value added.
* Do not use this method with lazy loaded fields.
*/
public final Field getField(String name) {
return (Field) getFieldable(name);
}
/** Returns a field with the given name if any exist in this document, or
* null. If multiple fields exist with this name, this method returns the
* first value added.
*/
public Fieldable getFieldable(String name) {
for (Fieldable field : fields) {
if (field.name().equals(name))
return field;
}
return null;
}
/** Returns the string value of the field with the given name if any exist in
* this document, or null. If multiple fields exist with this name, this
* method returns the first value added. If only binary fields with this name
* exist, returns null.
*/
public final String get(String name) {
for (Fieldable field : fields) {
if (field.name().equals(name) && (!field.isBinary()))
return field.stringValue();
}
return null;
}
/** Returns a List of all the fields in a document.
* <p>Note that fields which are <i>not</i> {@link Fieldable#isStored() stored} are
* <i>not</i> available in documents retrieved from the
* index, e.g. {@link Searcher#doc(int)} or {@link
* IndexReader#document(int)}.
*/
public final List<Fieldable> getFields() {
return fields;
}
private final static Field[] NO_FIELDS = new Field[0];
/**
* Returns an array of {@link Field}s with the given name.
* Do not use with lazy loaded fields.
* This method returns an empty array when there are no
* matching fields. It never returns null.
*
* @param name the name of the field
* @return a <code>Field[]</code> array
*/
public final Field[] getFields(String name) {
List<Field> result = new ArrayList<Field>();
for (Fieldable field : fields) {
if (field.name().equals(name)) {
result.add((Field) field);
}
}
if (result.size() == 0)
return NO_FIELDS;
return result.toArray(new Field[result.size()]);
}
private final static Fieldable[] NO_FIELDABLES = new Fieldable[0];
/**
* Returns an array of {@link Fieldable}s with the given name.
* This method returns an empty array when there are no
* matching fields. It never returns null.
*
* @param name the name of the field
* @return a <code>Fieldable[]</code> array
*/
public Fieldable[] getFieldables(String name) {
List<Fieldable> result = new ArrayList<Fieldable>();
for (Fieldable field : fields) {
if (field.name().equals(name)) {
result.add(field);
}
}
if (result.size() == 0)
return NO_FIELDABLES;
return result.toArray(new Fieldable[result.size()]);
}
private final static String[] NO_STRINGS = new String[0];
/**
* Returns an array of values of the field specified as the method parameter.
* This method returns an empty array when there are no
* matching fields. It never returns null.
* @param name the name of the field
* @return a <code>String[]</code> of field values
*/
public final String[] getValues(String name) {
List<String> result = new ArrayList<String>();
for (Fieldable field : fields) {
if (field.name().equals(name) && (!field.isBinary()))
result.add(field.stringValue());
}
if (result.size() == 0)
return NO_STRINGS;
return result.toArray(new String[result.size()]);
}
private final static byte[][] NO_BYTES = new byte[0][];
/**
* Returns an array of byte arrays for all of the fields that have the name specified
* as the method parameter. This method returns an empty
* array when there are no matching fields. It never
* returns null.
*
* @param name the name of the field
* @return a <code>byte[][]</code> of binary field values
*/
public final byte[][] getBinaryValues(String name) {
List<byte[]> result = new ArrayList<byte[]>();
for (Fieldable field : fields) {
if (field.name().equals(name) && (field.isBinary()))
result.add(field.getBinaryValue());
}
if (result.size() == 0)
return NO_BYTES;
return result.toArray(new byte[result.size()][]);
}
/**
* Returns an array of bytes for the first (or only) field that has the name
* specified as the method parameter. This method will return <code>null</code>
* if no binary fields with the specified name are available.
* There may be non-binary fields with the same name.
*
* @param name the name of the field.
* @return a <code>byte[]</code> containing the binary field value or <code>null</code>
*/
public final byte[] getBinaryValue(String name) {
for (Fieldable field : fields) {
if (field.name().equals(name) && (field.isBinary()))
return field.getBinaryValue();
}
return null;
}
/** Prints the fields of a document for human consumption. */
@Override
public final String toString() {
StringBuilder buffer = new StringBuilder();
buffer.append("Document<");
for (int i = 0; i < fields.size(); i++) {
Fieldable field = fields.get(i);
buffer.append(field.toString());
if (i != fields.size()-1)
buffer.append(" ");
}
buffer.append(">");
return buffer.toString();
}
}
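
A small usage sketch (not part of this commit) of the Document API above, together with the Field class added next; field names and values are placeholders.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class DocumentExample {
  public static void main(String[] args) {
    Document doc = new Document();
    doc.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("title", "Hello Lucene", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("title", "Second title value", Field.Store.YES, Field.Index.ANALYZED));

    System.out.println(doc.get("title"));        // first value added: "Hello Lucene"
    for (String v : doc.getValues("title")) {    // all string values for the field
      System.out.println(v);
    }
    doc.removeFields("title");                   // drops every "title" field
    System.out.println(doc.getFields().size());  // 1 (only "id" remains)
  }
}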

View File

@ -0,0 +1,566 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter; // for javadoc
import org.apache.lucene.util.StringHelper;
import java.io.Reader;
import java.io.Serializable;
/**
A field is a section of a Document. Each field has two parts, a name and a
value. Values may be free text, provided as a String or as a Reader, or they
may be atomic keywords, which are not further processed. Such keywords may
be used to represent dates, URLs, etc. Fields are optionally stored in the
index, so that they may be returned with hits on the document.
*/
public final class Field extends AbstractField implements Fieldable, Serializable {
/** Specifies whether and how a field should be stored. */
public static enum Store {
/** Store the original field value in the index. This is useful for short texts
* like a document's title which should be displayed with the results. The
* value is stored in its original form, i.e. no analyzer is used before it is
* stored.
*/
YES {
@Override
public boolean isStored() { return true; }
},
/** Do not store the field value in the index. */
NO {
@Override
public boolean isStored() { return false; }
};
public abstract boolean isStored();
}
/** Specifies whether and how a field should be indexed. */
public static enum Index {
/** Do not index the field value. This field can thus not be searched,
* but one can still access its contents provided it is
* {@link Field.Store stored}. */
NO {
@Override
public boolean isIndexed() { return false; }
@Override
public boolean isAnalyzed() { return false; }
@Override
public boolean omitNorms() { return true; }
},
/** Index the tokens produced by running the field's
* value through an Analyzer. This is useful for
* common text. */
ANALYZED {
@Override
public boolean isIndexed() { return true; }
@Override
public boolean isAnalyzed() { return true; }
@Override
public boolean omitNorms() { return false; }
},
/** Index the field's value without using an Analyzer, so it can be searched.
* As no analyzer is used the value will be stored as a single term. This is
* useful for unique Ids like product numbers.
*/
NOT_ANALYZED {
@Override
public boolean isIndexed() { return true; }
@Override
public boolean isAnalyzed() { return false; }
@Override
public boolean omitNorms() { return false; }
},
/** Expert: Index the field's value without an Analyzer,
* and also disable the storing of norms. Note that you
* can also separately enable/disable norms by calling
* {@link Field#setOmitNorms}. No norms means that
* index-time field and document boosting and field
* length normalization are disabled. The benefit is
* less memory usage as norms take up one byte of RAM
* per indexed field for every document in the index,
* during searching. Note that once you index a given
* field <i>with</i> norms enabled, disabling norms will
* have no effect. In other words, for this to have the
* above described effect on a field, all instances of
* that field must be indexed with NOT_ANALYZED_NO_NORMS
* from the beginning. */
NOT_ANALYZED_NO_NORMS {
@Override
public boolean isIndexed() { return true; }
@Override
public boolean isAnalyzed() { return false; }
@Override
public boolean omitNorms() { return true; }
},
/** Expert: Index the tokens produced by running the
* field's value through an Analyzer, and also
* separately disable the storing of norms. See
* {@link #NOT_ANALYZED_NO_NORMS} for what norms are
* and why you may want to disable them. */
ANALYZED_NO_NORMS {
@Override
public boolean isIndexed() { return true; }
@Override
public boolean isAnalyzed() { return true; }
@Override
public boolean omitNorms() { return true; }
};
/** Get the best representation of the index given the flags. */
public static Index toIndex(boolean indexed, boolean analyzed) {
return toIndex(indexed, analyzed, false);
}
/** Expert: Get the best representation of the index given the flags. */
public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
// If it is not indexed nothing else matters
if (!indexed) {
return Index.NO;
}
// typical, non-expert
if (!omitNorms) {
if (analyzed) {
return Index.ANALYZED;
}
return Index.NOT_ANALYZED;
}
// Expert: Norms omitted
if (analyzed) {
return Index.ANALYZED_NO_NORMS;
}
return Index.NOT_ANALYZED_NO_NORMS;
}
public abstract boolean isIndexed();
public abstract boolean isAnalyzed();
public abstract boolean omitNorms();
}
/** Specifies whether and how a field should have term vectors. */
public static enum TermVector {
/** Do not store term vectors.
*/
NO {
@Override
public boolean isStored() { return false; }
@Override
public boolean withPositions() { return false; }
@Override
public boolean withOffsets() { return false; }
},
/** Store the term vectors of each document. A term vector is a list
* of the document's terms and their number of occurrences in that document. */
YES {
@Override
public boolean isStored() { return true; }
@Override
public boolean withPositions() { return false; }
@Override
public boolean withOffsets() { return false; }
},
/**
* Store the term vector + token position information
*
* @see #YES
*/
WITH_POSITIONS {
@Override
public boolean isStored() { return true; }
@Override
public boolean withPositions() { return true; }
@Override
public boolean withOffsets() { return false; }
},
/**
* Store the term vector + Token offset information
*
* @see #YES
*/
WITH_OFFSETS {
@Override
public boolean isStored() { return true; }
@Override
public boolean withPositions() { return false; }
@Override
public boolean withOffsets() { return true; }
},
/**
* Store the term vector + Token position and offset information
*
* @see #YES
* @see #WITH_POSITIONS
* @see #WITH_OFFSETS
*/
WITH_POSITIONS_OFFSETS {
@Override
public boolean isStored() { return true; }
@Override
public boolean withPositions() { return true; }
@Override
public boolean withOffsets() { return true; }
};
/** Get the best representation of a TermVector given the flags. */
public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
// If it is not stored, nothing else matters.
if (!stored) {
return TermVector.NO;
}
if (withOffsets) {
if (withPositions) {
return Field.TermVector.WITH_POSITIONS_OFFSETS;
}
return Field.TermVector.WITH_OFFSETS;
}
if (withPositions) {
return Field.TermVector.WITH_POSITIONS;
}
return Field.TermVector.YES;
}
public abstract boolean isStored();
public abstract boolean withPositions();
public abstract boolean withOffsets();
}
/** The value of the field as a String, or null. If null, the Reader value or
* binary value is used. Exactly one of stringValue(),
* readerValue(), and getBinaryValue() must be set. */
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
/** The value of the field as a Reader, or null. If null, the String value or
* binary value is used. Exactly one of stringValue(),
* readerValue(), and getBinaryValue() must be set. */
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
/** The TokenStream for this field to be used when indexing, or null. If null, the Reader value
* or String value is analyzed to produce the indexed tokens. */
public TokenStream tokenStreamValue() { return tokenStream; }
/** <p>Expert: change the value of this field. This can
* be used during indexing to re-use a single Field
* instance to improve indexing speed by avoiding GC cost
* of new'ing and reclaiming Field instances. Typically
* a single {@link Document} instance is re-used as
* well. This helps most on small documents.</p>
*
* <p>Each Field instance should only be used once
* within a single {@link Document} instance. See <a
* href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
* for details.</p> */
public void setValue(String value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a String value on a binary field");
}
fieldsData = value;
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
public void setValue(Reader value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a Reader value on a binary field");
}
if (isStored) {
throw new IllegalArgumentException("cannot set a Reader value on a stored field");
}
fieldsData = value;
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
public void setValue(byte[] value) {
if (!isBinary) {
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
}
fieldsData = value;
binaryLength = value.length;
binaryOffset = 0;
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
public void setValue(byte[] value, int offset, int length) {
if (!isBinary) {
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
}
fieldsData = value;
binaryLength = length;
binaryOffset = offset;
}
/** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
* May be combined with stored values from stringValue() or getBinaryValue() */
public void setTokenStream(TokenStream tokenStream) {
this.isIndexed = true;
this.isTokenized = true;
this.tokenStream = tokenStream;
}
/**
* Create a field by specifying its name, value and how it will
* be saved in the index. Term vectors will not be stored in the index.
*
* @param name The name of the field
* @param value The string to process
* @param store Whether <code>value</code> should be stored in the index
* @param index Whether the field should be indexed, and if so, if it should
* be tokenized before indexing
* @throws NullPointerException if name or value is <code>null</code>
* @throws IllegalArgumentException if the field is neither stored nor indexed
*/
public Field(String name, String value, Store store, Index index) {
this(name, value, store, index, TermVector.NO);
}
/**
* Create a field by specifying its name, value and how it will
* be saved in the index.
*
* @param name The name of the field
* @param value The string to process
* @param store Whether <code>value</code> should be stored in the index
* @param index Whether the field should be indexed, and if so, if it should
* be tokenized before indexing
* @param termVector Whether term vector should be stored
* @throws NullPointerException if name or value is <code>null</code>
* @throws IllegalArgumentException in any of the following situations:
* <ul>
* <li>the field is neither stored nor indexed</li>
* <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
* </ul>
*/
public Field(String name, String value, Store store, Index index, TermVector termVector) {
this(name, true, value, store, index, termVector);
}
/**
* Create a field by specifying its name, value and how it will
* be saved in the index.
*
* @param name The name of the field
* @param internName Whether to .intern() name or not
* @param value The string to process
* @param store Whether <code>value</code> should be stored in the index
* @param index Whether the field should be indexed, and if so, if it should
* be tokenized before indexing
* @param termVector Whether term vector should be stored
* @throws NullPointerException if name or value is <code>null</code>
* @throws IllegalArgumentException in any of the following situations:
* <ul>
* <li>the field is neither stored nor indexed</li>
* <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
* </ul>
*/
public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (value == null)
throw new NullPointerException("value cannot be null");
if (name.length() == 0 && value.length() == 0)
throw new IllegalArgumentException("name and value cannot both be empty");
if (index == Index.NO && store == Store.NO)
throw new IllegalArgumentException("it doesn't make sense to have a field that "
+ "is neither indexed nor stored");
if (index == Index.NO && termVector != TermVector.NO)
throw new IllegalArgumentException("cannot store term vector information "
+ "for a field that is not indexed");
if (internName) // field names are optionally interned
name = StringHelper.intern(name);
this.name = name;
this.fieldsData = value;
this.isStored = store.isStored();
this.isIndexed = index.isIndexed();
this.isTokenized = index.isAnalyzed();
this.omitNorms = index.omitNorms();
if (index == Index.NO) {
this.omitTermFreqAndPositions = false;
}
this.isBinary = false;
setStoreTermVector(termVector);
}
/**
* Create a tokenized and indexed field that is not stored. Term vectors will
* not be stored. The Reader is read only when the Document is added to the index,
* i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param reader The reader with the content
* @throws NullPointerException if name or reader is <code>null</code>
*/
public Field(String name, Reader reader) {
this(name, reader, TermVector.NO);
}
/**
* Create a tokenized and indexed field that is not stored, optionally with
* storing term vectors. The Reader is read only when the Document is added to the index,
* i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param reader The reader with the content
* @param termVector Whether term vector should be stored
* @throws NullPointerException if name or reader is <code>null</code>
*/
public Field(String name, Reader reader, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (reader == null)
throw new NullPointerException("reader cannot be null");
this.name = StringHelper.intern(name); // field names are interned
this.fieldsData = reader;
this.isStored = false;
this.isIndexed = true;
this.isTokenized = true;
this.isBinary = false;
setStoreTermVector(termVector);
}
/**
* Create a tokenized and indexed field that is not stored. Term vectors will
* not be stored. This is useful for pre-analyzed fields.
* The TokenStream is read only when the Document is added to the index,
* i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param tokenStream The TokenStream with the content
* @throws NullPointerException if name or tokenStream is <code>null</code>
*/
public Field(String name, TokenStream tokenStream) {
this(name, tokenStream, TermVector.NO);
}
/**
* Create a tokenized and indexed field that is not stored, optionally with
* storing term vectors. This is useful for pre-analyzed fields.
* The TokenStream is read only when the Document is added to the index,
* i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param tokenStream The TokenStream with the content
* @param termVector Whether term vector should be stored
* @throws NullPointerException if name or tokenStream is <code>null</code>
*/
public Field(String name, TokenStream tokenStream, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (tokenStream == null)
throw new NullPointerException("tokenStream cannot be null");
this.name = StringHelper.intern(name); // field names are interned
this.fieldsData = null;
this.tokenStream = tokenStream;
this.isStored = false;
this.isIndexed = true;
this.isTokenized = true;
this.isBinary = false;
setStoreTermVector(termVector);
}
/**
* Create a stored field with binary value. Optionally the value may be compressed.
*
* @param name The name of the field
* @param value The binary value
* @param store How <code>value</code> should be stored (compressed or not)
* @throws IllegalArgumentException if store is <code>Store.NO</code>
*/
public Field(String name, byte[] value, Store store) {
this(name, value, 0, value.length, store);
}
/**
* Create a stored field with binary value. Optionally the value may be compressed.
*
* @param name The name of the field
* @param value The binary value
* @param offset Starting offset in value where this Field's bytes are
* @param length Number of bytes to use for this Field, starting at offset
* @param store How <code>value</code> should be stored (compressed or not)
* @throws IllegalArgumentException if store is <code>Store.NO</code>
*/
public Field(String name, byte[] value, int offset, int length, Store store) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (value == null)
throw new IllegalArgumentException("value cannot be null");
this.name = StringHelper.intern(name); // field names are interned
fieldsData = value;
if (store == Store.NO)
throw new IllegalArgumentException("binary values can't be unstored");
isStored = store.isStored();
isIndexed = false;
isTokenized = false;
omitTermFreqAndPositions = false;
omitNorms = true;
isBinary = true;
binaryLength = length;
binaryOffset = offset;
setStoreTermVector(TermVector.NO);
}
}
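/* A minimal usage sketch of the constructors above, assuming an open IndexWriter
 * "writer", a Reader "bodyReader" and a byte[] "thumbnailBytes" created elsewhere:
 *
 *   Document doc = new Document();
 *   doc.add(new Field("title", "Lucene in Action", Field.Store.YES, Field.Index.ANALYZED));
 *   doc.add(new Field("body", bodyReader));                           // tokenized, never stored
 *   doc.add(new Field("thumbnail", thumbnailBytes, Field.Store.YES)); // stored binary value
 *   writer.addDocument(doc);
 */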

View File

@ -0,0 +1,34 @@
package org.apache.lucene.document;
import java.io.Serializable;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about
* what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)}
*
**/
public interface FieldSelector extends Serializable {
/**
*
* @param fieldName the field to accept or reject
* @return an instance of {@link FieldSelectorResult} indicating whether, and how,
* the {@link Field} named <code>fieldName</code> should be loaded.
*/
FieldSelectorResult accept(String fieldName);
}
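/* A usage sketch, assuming an open IndexReader "reader" and a hit's document id
 * "docId" (names are illustrative): only the fields the selector accepts are materialized.
 *
 *   FieldSelector selector = new MapFieldSelector("title", "path");  // LOAD these, NO_LOAD the rest
 *   Document hit = reader.document(docId, selector);
 *   String title = hit.get("title");
 */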

View File

@ -0,0 +1,67 @@
package org.apache.lucene.document;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Provides information about what should be done with this Field
*
**/
public enum FieldSelectorResult {
/**
* Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encountered.
* {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
*<p/>
* {@link Document#add(Fieldable)} should be called by the Reader.
*/
LOAD,
/**
* Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until
* invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should
* return a valid instance of a {@link Fieldable}.
*<p/>
* {@link Document#add(Fieldable)} should be called by the Reader.
*/
LAZY_LOAD,
/**
* Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null.
* {@link Document#add(Fieldable)} is not called.
* <p/>
* {@link Document#add(Fieldable)} should not be called by the Reader.
*/
NO_LOAD,
/**
* Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the
* Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should
* both be valid for this {@link Field}
* <p/>
* {@link Document#add(Fieldable)} should be called by the Reader.
*/
LOAD_AND_BREAK,
/** Expert: Load the size of this {@link Field} rather than its value.
* Size is measured as the number of bytes required to store the field: the byte length for a binary or compressed value, and 2*(number of chars) for a String value.
* The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0]
*/
SIZE,
/** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */
SIZE_AND_BREAK
}
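/* A sketch of decoding the SIZE result described above, assuming a Document "hit"
 * was loaded with a selector that returned SIZE for the "body" field: the size is
 * stored as a 4-byte binary value with the high order byte first.
 *
 *   byte[] sizeBytes = hit.getFieldable("body").getBinaryValue();
 *   int sizeInBytes = ((sizeBytes[0] & 0xff) << 24) | ((sizeBytes[1] & 0xff) << 16)
 *                   | ((sizeBytes[2] & 0xff) << 8)  |  (sizeBytes[3] & 0xff);
 */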

View File

@ -0,0 +1,212 @@
package org.apache.lucene.document;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.FieldInvertState; // for javadocs
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.spans.SpanQuery;
import java.io.Reader;
import java.io.Serializable;
/**
* Synonymous with {@link Field}.
*
* <p><b>WARNING</b>: This interface may change within minor versions, despite Lucene's backward compatibility requirements.
* This means new methods may be added from version to version. This change only affects the Fieldable API; other backwards
* compatibility promises remain intact. For example, Lucene can still
* read and write indices created within the same major version.
* </p>
*
**/
public interface Fieldable extends Serializable {
/** Sets the boost factor for hits on this field. This value will be
* multiplied into the score of all hits on this field of this
* document.
*
* <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
* containing this field. If a document has multiple fields with the same
* name, all such values are multiplied together. This product is then
* used to compute the norm factor for the field. By
* default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(String,
* FieldInvertState)} method, the boost value is multiplied
* by the {@link
* org.apache.lucene.search.Similarity#lengthNorm(String,
* int)} and then rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(String, FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNorm(float)
*/
void setBoost(float boost);
/** Returns the boost factor for hits for this field.
*
* <p>The default value is 1.0.
*
* <p>Note: this value is not stored directly with the document in the index.
* Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
* {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
* this field was indexed.
*
* @see #setBoost(float)
*/
float getBoost();
/** Returns the name of the field as an interned string.
* For example "date", "title", "body", ...
*/
String name();
/** The value of the field as a String, or null.
* <p>
* For indexing, if isStored()==true, the stringValue() will be used as the stored field value
* unless isBinary()==true, in which case getBinaryValue() will be used.
*
* If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
* If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null,
* else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens.
*/
public String stringValue();
/** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
* @see #stringValue()
*/
public Reader readerValue();
/** The TokenStream for this field to be used when indexing, or null.
* @see #stringValue()
*/
public TokenStream tokenStreamValue();
/** True if the value of the field is to be stored in the index for return
with search hits. */
boolean isStored();
/** True if the value of the field is to be indexed, so that it may be
searched on. */
boolean isIndexed();
/** True if the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
boolean isTokenized();
/** True if the term or terms used to index this field are stored as a term
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
* preserved, use the <code>stored</code> attribute instead.
*
* @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String)
*/
boolean isTermVectorStored();
/**
* True if terms are stored as term vector together with their offsets
* (start and end position in source text).
*/
boolean isStoreOffsetWithTermVector();
/**
* True if terms are stored as term vector together with their token positions.
*/
boolean isStorePositionWithTermVector();
/** True if the value of the field is stored as binary */
boolean isBinary();
/** True if norms are omitted for this indexed field */
boolean getOmitNorms();
/** Expert:
*
* If set, omit normalization factors associated with this indexed field.
* This effectively disables indexing boosts and length normalization for this field.
*/
void setOmitNorms(boolean omitNorms);
/**
* Indicates whether a Field is Lazy or not. The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving
* its values via {@link #stringValue()} or {@link #getBinaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that
* retrieved the {@link Document} is still open.
*
* @return true if this field can be loaded lazily
*/
boolean isLazy();
/**
* Returns the offset into the byte[] segment that is used as the value; if the Field is not binary,
* the returned value is undefined.
* @return index of the first character in byte[] segment that represents this Field value
*/
abstract int getBinaryOffset();
/**
* Returns the length of the byte[] segment that is used as the value; if the Field is not binary,
* the returned value is undefined.
* @return length of byte[] segment that represents this Field value
*/
abstract int getBinaryLength();
/**
* Return the raw byte[] for the binary field. Note that
* you must also call {@link #getBinaryLength} and {@link
* #getBinaryOffset} to know which range of bytes in this
* returned array belong to the field.
* @return reference to the Field value as byte[].
*/
abstract byte[] getBinaryValue();
/**
* Return the raw byte[] for the binary field. Note that
* you must also call {@link #getBinaryLength} and {@link
* #getBinaryOffset} to know which range of bytes in this
* returned array belong to the field.<p>
* About reuse: if you pass in the result byte[] and it is
* used, likely the underlying implementation will hold
* onto this byte[] and return it in future calls to
* {@link #getBinaryValue()}.
* So if you subsequently re-use the same byte[] elsewhere
* it will alter this Fieldable's value.
* @param result User defined buffer that will be used if
* possible. If this is null or not large enough, a new
* buffer is allocated
* @return reference to the Field value as byte[].
*/
abstract byte[] getBinaryValue(byte[] result);
/** @see #setOmitTermFreqAndPositions */
boolean getOmitTermFreqAndPositions();
/** Expert:
*
* If set, omit term freq, positions and payloads from
* postings for this field.
*
* <p><b>NOTE</b>: While this option reduces storage space
* required in the index, it also means any query
* requiring positional information, such as {@link
* PhraseQuery} or {@link SpanQuery} subclasses will
* silently fail to find results.
*/
void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions);
}
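/* A minimal boost sketch, assuming a Document "doc" with a Fieldable "titleField"
 * (illustrative names): the field boost and the document boost are multiplied
 * together and folded into the field's norm at index time.
 *
 *   titleField.setBoost(2.0f);   // field-level boost
 *   doc.setBoost(1.5f);          // applied to every field of the document
 *   // effective contribution to titleField's norm: 2.0f * 1.5f = 3.0f
 */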

View File

@ -0,0 +1,29 @@
package org.apache.lucene.document;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Load the First field and break.
* <p/>
* See {@link FieldSelectorResult#LOAD_AND_BREAK}
*/
public class LoadFirstFieldSelector implements FieldSelector {
public FieldSelectorResult accept(String fieldName) {
return FieldSelectorResult.LOAD_AND_BREAK;
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* A {@link FieldSelector} based on a Map of field names to {@link FieldSelectorResult}s
*
*/
public class MapFieldSelector implements FieldSelector {
Map<String,FieldSelectorResult> fieldSelections;
/** Create a MapFieldSelector
* @param fieldSelections maps from field names (String) to {@link FieldSelectorResult}s
*/
public MapFieldSelector(Map<String,FieldSelectorResult> fieldSelections) {
this.fieldSelections = fieldSelections;
}
/** Create a MapFieldSelector
* @param fields fields to LOAD. List of Strings. All other fields are NO_LOAD.
*/
public MapFieldSelector(List<String> fields) {
fieldSelections = new HashMap<String,FieldSelectorResult>(fields.size()*5/3);
for (final String field : fields)
fieldSelections.put(field, FieldSelectorResult.LOAD);
}
/** Create a MapFieldSelector
* @param fields fields to LOAD. All other fields are NO_LOAD.
*/
public MapFieldSelector(String... fields) {
this(Arrays.asList(fields));
}
/** Load field according to its associated value in fieldSelections
* @param field a field name
* @return the fieldSelections value that field maps to or NO_LOAD if none.
*/
public FieldSelectorResult accept(String field) {
FieldSelectorResult selection = fieldSelections.get(field);
return selection!=null ? selection : FieldSelectorResult.NO_LOAD;
}
}
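/* A sketch of the Map-based constructor, assuming an open IndexReader "reader" and a
 * document id "docId": each field gets its own FieldSelectorResult, everything else is NO_LOAD.
 *
 *   Map<String,FieldSelectorResult> selections = new HashMap<String,FieldSelectorResult>();
 *   selections.put("title", FieldSelectorResult.LOAD);
 *   selections.put("contents", FieldSelectorResult.LAZY_LOAD);
 *   Document hit = reader.document(docId, new MapFieldSelector(selections));
 */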

View File

@ -0,0 +1,139 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.util.NumericUtils; // for javadocs
// do not remove this class in 3.0, it may be needed to decode old indexes!
/**
* Provides support for converting longs to Strings, and back again. The strings
* are structured so that lexicographic sorting order is preserved.
*
* <p>
* That is, if l1 is less than l2 for any two longs l1 and l2, then
* NumberTools.longToString(l1) is lexicographically less than
* NumberTools.longToString(l2). (Similarly for "greater than" and "equals".)
*
* <p>
* This class handles <b>all</b> long values (unlike
* {@link org.apache.lucene.document.DateField}).
*
* @deprecated For new indexes use {@link NumericUtils} instead, which
* provides a sortable binary representation (prefix encoded) of numeric
* values.
* To index and efficiently query numeric values use {@link NumericField}
* and {@link NumericRangeQuery}.
* This class is included for use with existing
* indices and will be removed in a future release (possibly Lucene 4.0).
*/
public class NumberTools {
private static final int RADIX = 36;
private static final char NEGATIVE_PREFIX = '-';
// NB: NEGATIVE_PREFIX must be < POSITIVE_PREFIX
private static final char POSITIVE_PREFIX = '0';
// NB: MIN_STRING_VALUE must sort before all other longToString() results
/**
* Equivalent to longToString(Long.MIN_VALUE)
*/
public static final String MIN_STRING_VALUE = NEGATIVE_PREFIX
+ "0000000000000";
/**
* Equivalent to longToString(Long.MAX_VALUE)
*/
public static final String MAX_STRING_VALUE = POSITIVE_PREFIX
+ "1y2p0ij32e8e7";
/**
* The length of (all) strings returned by {@link #longToString}
*/
public static final int STR_SIZE = MIN_STRING_VALUE.length();
/**
* Converts a long to a String suitable for indexing.
*/
public static String longToString(long l) {
if (l == Long.MIN_VALUE) {
// special case, because long is not symmetric around zero
return MIN_STRING_VALUE;
}
StringBuilder buf = new StringBuilder(STR_SIZE);
if (l < 0) {
buf.append(NEGATIVE_PREFIX);
l = Long.MAX_VALUE + l + 1;
} else {
buf.append(POSITIVE_PREFIX);
}
String num = Long.toString(l, RADIX);
int padLen = STR_SIZE - num.length() - buf.length();
while (padLen-- > 0) {
buf.append('0');
}
buf.append(num);
return buf.toString();
}
/**
* Converts a String that was returned by {@link #longToString} back to a
* long.
*
* @throws IllegalArgumentException
* if the input is null
* @throws NumberFormatException
* if the input does not parse (it was not a String returned by
* longToString()).
*/
public static long stringToLong(String str) {
if (str == null) {
throw new NullPointerException("string cannot be null");
}
if (str.length() != STR_SIZE) {
throw new NumberFormatException("string is the wrong size");
}
if (str.equals(MIN_STRING_VALUE)) {
return Long.MIN_VALUE;
}
char prefix = str.charAt(0);
long l = Long.parseLong(str.substring(1), RADIX);
if (prefix == POSITIVE_PREFIX) {
// nop
} else if (prefix == NEGATIVE_PREFIX) {
l = l - Long.MAX_VALUE - 1;
} else {
throw new NumberFormatException(
"string does not begin with the correct prefix");
}
return l;
}
}
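/* A round-trip sketch showing the ordering guarantee described above
 * (the values are chosen for illustration only):
 *
 *   String a = NumberTools.longToString(17L);
 *   String b = NumberTools.longToString(42L);
 *   // a.compareTo(b) < 0, mirroring 17 < 42
 *   long back = NumberTools.stringToLong(a);   // 17L again
 */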

View File

@ -0,0 +1,277 @@
package org.apache.lucene.document;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.search.NumericRangeQuery; // javadocs
import org.apache.lucene.search.NumericRangeFilter; // javadocs
import org.apache.lucene.search.SortField; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs
/**
* <p>This class provides a {@link Field} that enables indexing
* of numeric values for efficient range filtering and
* sorting. Here's an example usage, adding an int value:
* <pre>
* document.add(new NumericField(name).setIntValue(value));
* </pre>
*
* For optimal performance, re-use the
* <code>NumericField</code> and {@link Document} instance for more than
* one document:
*
* <pre>
* NumericField field = new NumericField(name);
* Document document = new Document();
* document.add(field);
*
* for(all documents) {
* ...
* field.setIntValue(value)
* writer.addDocument(document);
* ...
* }
* </pre>
*
* <p>The java native types <code>int</code>, <code>long</code>,
* <code>float</code> and <code>double</code> are
* directly supported. However, any value that can be
* converted into these native types can also be indexed.
* For example, date/time values represented by a
* {@link java.util.Date} can be translated into a long
* value using the {@link java.util.Date#getTime} method. If you
* don't need millisecond precision, you can quantize the
* value, either by dividing the result of
* {@link java.util.Date#getTime} or using the separate getters
* (for year, month, etc.) to construct an <code>int</code> or
* <code>long</code> value.</p>
*
* <p>To perform range querying or filtering against a
* <code>NumericField</code>, use {@link NumericRangeQuery} or {@link
* NumericRangeFilter}. To sort according to a
* <code>NumericField</code>, use the normal numeric sort types, eg
* {@link SortField#INT}. <code>NumericField</code> values
* can also be loaded directly from {@link FieldCache}.</p>
*
* <p>By default, a <code>NumericField</code>'s value is not stored but
* is indexed for range filtering and sorting. You can use
* the {@link #NumericField(String,Field.Store,boolean)}
* constructor if you need to change these defaults.</p>
*
* <p>You may add the same field name as a <code>NumericField</code> to
* the same document more than once. Range querying and
* filtering will be the logical OR of all values; so a range query
* will hit all documents that have at least one value in
* the range. However sort behavior is not defined. If you need to sort,
* you should separately index a single-valued <code>NumericField</code>.</p>
*
* <p>A <code>NumericField</code> will consume somewhat more disk space
* in the index than an ordinary single-valued field.
* However, for a typical index that includes substantial
* textual content per document, this increase will likely
* be in the noise. </p>
*
* <p>Within Lucene, each numeric value is indexed as a
* <em>trie</em> structure, where each term is logically
* assigned to larger and larger pre-defined brackets (which
* are simply lower-precision representations of the value).
* The step size between each successive bracket is called the
* <code>precisionStep</code>, measured in bits. Smaller
* <code>precisionStep</code> values result in larger number
* of brackets, which consumes more disk space in the index
* but may result in faster range search performance. The
* default value, 4, was selected for a reasonable tradeoff
* of disk space consumption versus performance. You can
* use the expert constructor {@link
* #NumericField(String,int,Field.Store,boolean)} if you'd
* like to change the value. Note that you must also
* specify a congruent value when creating {@link
* NumericRangeQuery} or {@link NumericRangeFilter}.
* For low cardinality fields larger precision steps are good.
* If the cardinality is &lt; 100, it is fair
* to use {@link Integer#MAX_VALUE}, which produces one
* term per value.
*
* <p>For more information on the internals of numeric trie
* indexing, including the <a
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
* configuration, see {@link NumericRangeQuery}. The format of
* indexed values is described in {@link NumericUtils}.
*
* <p>If you only need to sort by numeric value, and never
* run range querying/filtering, you can index using a
* <code>precisionStep</code> of {@link Integer#MAX_VALUE}.
* This will minimize disk space consumed. </p>
*
* <p>More advanced users can instead use {@link
* NumericTokenStream} directly, when indexing numbers. This
* class is a wrapper around this token stream type for
* easier, more intuitive usage.</p>
*
* <p><b>NOTE:</b> This class is only used during
* indexing. When retrieving the stored field value from a
* {@link Document} instance after search, you will get a
* conventional {@link Fieldable} instance where the numeric
* values are returned as {@link String}s (according to
* <code>toString(value)</code> of the used data type).
*
* <p><font color="red"><b>NOTE:</b> This API is
* experimental and might change in incompatible ways in the
* next release.</font>
*
* @since 2.9
*/
public final class NumericField extends AbstractField {
private final NumericTokenStream tokenStream;
/**
* Creates a field for numeric values using the default <code>precisionStep</code>
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with
* a numeric value, before indexing a document containing this field,
* set a value using the various set<em>???</em>Value() methods.
* This constructor creates an indexed, but not stored field.
* @param name the field name
*/
public NumericField(String name) {
this(name, NumericUtils.PRECISION_STEP_DEFAULT, Field.Store.NO, true);
}
/**
* Creates a field for numeric values using the default <code>precisionStep</code>
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with
* a numeric value; before indexing a document containing this field,
* set a value using the various set<em>???</em>Value() methods.
* @param name the field name
* @param store if the field should be stored in plain text form
* (according to <code>toString(value)</code> of the used data type)
* @param index if the field should be indexed using {@link NumericTokenStream}
*/
public NumericField(String name, Field.Store store, boolean index) {
this(name, NumericUtils.PRECISION_STEP_DEFAULT, store, index);
}
/**
* Creates a field for numeric values with the specified
* <code>precisionStep</code>. The instance is not yet initialized with
* a numeric value; before indexing a document containing this field,
* set a value using the various set<em>???</em>Value() methods.
* This constructor creates an indexed, but not stored field.
* @param name the field name
* @param precisionStep the used <a href="../search/NumericRangeQuery.html#precisionStepDesc">precision step</a>
*/
public NumericField(String name, int precisionStep) {
this(name, precisionStep, Field.Store.NO, true);
}
/**
* Creates a field for numeric values with the specified
* <code>precisionStep</code>. The instance is not yet initialized with
* a numeric value; before indexing a document containing this field,
* set a value using the various set<em>???</em>Value() methods.
* @param name the field name
* @param precisionStep the used <a href="../search/NumericRangeQuery.html#precisionStepDesc">precision step</a>
* @param store if the field should be stored in plain text form
* (according to <code>toString(value)</code> of the used data type)
* @param index if the field should be indexed using {@link NumericTokenStream}
*/
public NumericField(String name, int precisionStep, Field.Store store, boolean index) {
super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO);
setOmitTermFreqAndPositions(true);
tokenStream = new NumericTokenStream(precisionStep);
}
/** Returns a {@link NumericTokenStream} for indexing the numeric value. */
public TokenStream tokenStreamValue() {
return isIndexed() ? tokenStream : null;
}
/** Always returns <code>null</code> for numeric fields */
@Override
public byte[] getBinaryValue(byte[] result){
return null;
}
/** Always returns <code>null</code> for numeric fields */
public Reader readerValue() {
return null;
}
/** Returns the numeric value as a string (how it is stored, when {@link Field.Store#YES} is chosen). */
public String stringValue() {
return (fieldsData == null) ? null : fieldsData.toString();
}
/** Returns the current numeric value as a subclass of {@link Number}, <code>null</code> if not yet initialized. */
public Number getNumericValue() {
return (Number) fieldsData;
}
/**
* Initializes the field with the supplied <code>long</code> value.
* @param value the numeric value
* @return this instance, so you can use it as follows:
* <code>document.add(new NumericField(name, precisionStep).setLongValue(value))</code>
*/
public NumericField setLongValue(final long value) {
tokenStream.setLongValue(value);
fieldsData = Long.valueOf(value);
return this;
}
/**
* Initializes the field with the supplied <code>int</code> value.
* @param value the numeric value
* @return this instance, so you can use it as follows:
* <code>document.add(new NumericField(name, precisionStep).setIntValue(value))</code>
*/
public NumericField setIntValue(final int value) {
tokenStream.setIntValue(value);
fieldsData = Integer.valueOf(value);
return this;
}
/**
* Initializes the field with the supplied <code>double</code> value.
* @param value the numeric value
* @return this instance, so you can use it as follows:
* <code>document.add(new NumericField(name, precisionStep).setDoubleValue(value))</code>
*/
public NumericField setDoubleValue(final double value) {
tokenStream.setDoubleValue(value);
fieldsData = Double.valueOf(value);
return this;
}
/**
* Initializes the field with the supplied <code>float</code> value.
* @param value the numeric value
* @return this instance, so you can use it as follows:
* <code>document.add(new NumericField(name, precisionStep).setFloatValue(value))</code>
*/
public NumericField setFloatValue(final float value) {
tokenStream.setFloatValue(value);
fieldsData = Float.valueOf(value);
return this;
}
}
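/* A sketch of pairing the expert constructor with a congruent query-time
 * precisionStep, assuming an open IndexWriter "writer" (field name and values
 * are illustrative):
 *
 *   int precisionStep = 8;   // must match between indexing and querying
 *   Document doc = new Document();
 *   doc.add(new NumericField("price", precisionStep, Field.Store.NO, true).setLongValue(42L));
 *   writer.addDocument(doc);
 *
 *   Query q = NumericRangeQuery.newLongRange("price", precisionStep, 10L, 100L, true, true);
 */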

View File

@ -0,0 +1,58 @@
package org.apache.lucene.document;
import java.util.Set;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Declare what fields to load normally and what fields to load lazily
*
**/
public class SetBasedFieldSelector implements FieldSelector {
private Set<String> fieldsToLoad;
private Set<String> lazyFieldsToLoad;
/**
* Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily. If both are null, the
* Document will not have any {@link Field} on it.
* @param fieldsToLoad A Set of {@link String} field names to load. May be empty, but not null
* @param lazyFieldsToLoad A Set of {@link String} field names to load lazily. May be empty, but not null
*/
public SetBasedFieldSelector(Set<String> fieldsToLoad, Set<String> lazyFieldsToLoad) {
this.fieldsToLoad = fieldsToLoad;
this.lazyFieldsToLoad = lazyFieldsToLoad;
}
/**
* Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the
* initializing Sets, then {@link org.apache.lucene.document.FieldSelectorResult#NO_LOAD} is returned. If a Field name
* is in both <code>fieldsToLoad</code> and <code>lazyFieldsToLoad</code>, lazy has precedence.
*
* @param fieldName The {@link Field} name to check
* @return The {@link FieldSelectorResult}
*/
public FieldSelectorResult accept(String fieldName) {
FieldSelectorResult result = FieldSelectorResult.NO_LOAD;
if (fieldsToLoad.contains(fieldName) == true){
result = FieldSelectorResult.LOAD;
}
if (lazyFieldsToLoad.contains(fieldName) == true){
result = FieldSelectorResult.LAZY_LOAD;
}
return result;
}
}
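/* A sketch of the precedence rule in accept(): a field named in both sets comes
 * back as LAZY_LOAD (field names are illustrative).
 *
 *   Set<String> load = new HashSet<String>(Arrays.asList("title", "contents"));
 *   Set<String> lazy = Collections.singleton("contents");
 *   FieldSelector selector = new SetBasedFieldSelector(load, lazy);
 *   // selector.accept("title")    -> FieldSelectorResult.LOAD
 *   // selector.accept("contents") -> FieldSelectorResult.LAZY_LOAD  (lazy wins)
 *   // selector.accept("path")     -> FieldSelectorResult.NO_LOAD
 */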

View File

@ -0,0 +1,56 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
<p>The logical representation of a {@link org.apache.lucene.document.Document} for indexing and searching.</p>
<p>The document package provides the user level logical representation of content to be indexed and searched. The
package also provides utilities for working with {@link org.apache.lucene.document.Document}s and {@link org.apache.lucene.document.Fieldable}s.</p>
<h2>Document and Fieldable</h2>
<p>A {@link org.apache.lucene.document.Document} is a collection of {@link org.apache.lucene.document.Fieldable}s. A
{@link org.apache.lucene.document.Fieldable} is a logical representation of a user's content that needs to be indexed or stored.
{@link org.apache.lucene.document.Fieldable}s have a number of properties that tell Lucene how to treat the content (like indexed, tokenized,
stored, etc.) See the {@link org.apache.lucene.document.Field} implementation of {@link org.apache.lucene.document.Fieldable}
for specifics on these properties.
</p>
<p>Note: it is common to refer to {@link org.apache.lucene.document.Document}s having {@link org.apache.lucene.document.Field}s, even though technically they have
{@link org.apache.lucene.document.Fieldable}s.</p>
<h2>Working with Documents</h2>
<p>First and foremost, a {@link org.apache.lucene.document.Document} is something created by the user application. It is your job
to create Documents based on the content of the files you are working with in your application (Word, txt, PDF, Excel or any other format.)
How this is done is completely up to you. That being said, there are many tools available in other projects that can make
the process of taking a file and converting it into a Lucene {@link org.apache.lucene.document.Document} easier. To see an example of this,
take a look at the Lucene <a href="../../../../../../gettingstarted.html" target="top">demo</a> and the associated source code
for extracting content from HTML.
</p>
<p>The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times searchable
(remember, Lucene only searches text). {@link org.apache.lucene.document.NumericField} is a special helper class
to simplify indexing of numeric values (and also dates) for fast range queries with {@link org.apache.lucene.search.NumericRangeQuery}
(using a special sortable string representation of numeric values).</p>
<p>The {@link org.apache.lucene.document.FieldSelector} class provides a mechanism to tell Lucene how to load Documents from
storage. If no FieldSelector is used, all Fieldables on a Document will be loaded. As an example of the FieldSelector usage, consider
the common use case of
displaying search results on a web page and then having users click through to see the full document. In this scenario, it is often
the case that there are many small fields and one or two large fields (containing the contents of the original file). Before the FieldSelector,
the full Document had to be loaded, including the large fields, in order to display the results. Now, using the FieldSelector, one
can {@link org.apache.lucene.document.FieldSelectorResult#LAZY_LOAD} the large fields, thus only loading the large fields
when a user clicks on the actual link to view the original content.</p>
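<p>For example, a sketch of that click-through scenario (the field names and the open
<code>reader</code> are illustrative): the large "contents" field is only read when its value
is actually requested, and only while the IndexReader is still open.</p>
<pre>
  FieldSelector selector = new SetBasedFieldSelector(
      Collections.singleton("title"),       // load normally
      Collections.singleton("contents"));   // load lazily
  Document hit = reader.document(docId, selector);
  String title = hit.get("title");                               // already loaded
  String contents = hit.getFieldable("contents").stringValue();  // read on demand
</pre>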
</body>
</html>

View File

@ -0,0 +1,86 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import org.apache.lucene.util.BitVector;
import java.io.IOException;
class AllTermDocs implements TermDocs {
protected BitVector deletedDocs;
protected int maxDoc;
protected int doc = -1;
protected AllTermDocs(SegmentReader parent) {
synchronized (parent) {
this.deletedDocs = parent.deletedDocs;
}
this.maxDoc = parent.maxDoc();
}
public void seek(Term term) throws IOException {
if (term==null) {
doc = -1;
} else {
throw new UnsupportedOperationException();
}
}
public void seek(TermEnum termEnum) throws IOException {
throw new UnsupportedOperationException();
}
public int doc() {
return doc;
}
public int freq() {
return 1;
}
public boolean next() throws IOException {
return skipTo(doc+1);
}
public int read(int[] docs, int[] freqs) throws IOException {
final int length = docs.length;
int i = 0;
while (i < length && doc < maxDoc) {
if (deletedDocs == null || !deletedDocs.get(doc)) {
docs[i] = doc;
freqs[i] = 1;
++i;
}
doc++;
}
return i;
}
public boolean skipTo(int target) throws IOException {
doc = target;
while (doc < maxDoc) {
if (deletedDocs == null || !deletedDocs.get(doc)) {
return true;
}
doc++;
}
return false;
}
public void close() throws IOException {
}
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import org.apache.lucene.search.Query;
/** Holds buffered deletes, by docID, term or query. We
* hold two instances of this class: one for the deletes
* prior to the last flush, the other for deletes after
* the last flush. This is so if we need to abort
* (discard all buffered docs) we can also discard the
* buffered deletes yet keep the deletes done during
* previously flushed segments. */
class BufferedDeletes {
int numTerms;
HashMap<Term,Num> terms = new HashMap<Term,Num>();
HashMap<Query,Integer> queries = new HashMap<Query,Integer>();
List<Integer> docIDs = new ArrayList<Integer>();
long bytesUsed;
// Number of documents a delete term applies to.
final static class Num {
private int num;
Num(int num) {
this.num = num;
}
int getNum() {
return num;
}
void setNum(int num) {
// Only record the new number if it's greater than the
// current one. This is important because if multiple
// threads are replacing the same doc at nearly the
// same time, it's possible that one thread that got a
// higher docID is scheduled before the other
// threads.
if (num > this.num)
this.num = num;
}
}
int size() {
// We use numTerms not terms.size() intentionally, so
// that deletes by the same term multiple times "count",
// ie if you ask to flush every 1000 deletes then even
// dup'd terms are counted towards that 1000
return numTerms + queries.size() + docIDs.size();
}
void update(BufferedDeletes in) {
numTerms += in.numTerms;
bytesUsed += in.bytesUsed;
terms.putAll(in.terms);
queries.putAll(in.queries);
docIDs.addAll(in.docIDs);
in.clear();
}
void clear() {
terms.clear();
queries.clear();
docIDs.clear();
numTerms = 0;
bytesUsed = 0;
}
void addBytesUsed(long b) {
bytesUsed += b;
}
boolean any() {
return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
}
// Remaps all buffered deletes based on a completed
// merge
synchronized void remap(MergeDocIDRemapper mapper,
SegmentInfos infos,
int[][] docMaps,
int[] delCounts,
MergePolicy.OneMerge merge,
int mergeDocCount) {
final HashMap<Term,Num> newDeleteTerms;
// Remap delete-by-term
if (terms.size() > 0) {
newDeleteTerms = new HashMap<Term, Num>();
for(Entry<Term,Num> entry : terms.entrySet()) {
Num num = entry.getValue();
newDeleteTerms.put(entry.getKey(),
new Num(mapper.remap(num.getNum())));
}
} else
newDeleteTerms = null;
// Remap delete-by-docID
final List<Integer> newDeleteDocIDs;
if (docIDs.size() > 0) {
newDeleteDocIDs = new ArrayList<Integer>(docIDs.size());
for (Integer num : docIDs) {
newDeleteDocIDs.add(Integer.valueOf(mapper.remap(num.intValue())));
}
} else
newDeleteDocIDs = null;
// Remap delete-by-query
final HashMap<Query,Integer> newDeleteQueries;
if (queries.size() > 0) {
newDeleteQueries = new HashMap<Query, Integer>(queries.size());
for(Entry<Query,Integer> entry: queries.entrySet()) {
Integer num = entry.getValue();
newDeleteQueries.put(entry.getKey(),
Integer.valueOf(mapper.remap(num.intValue())));
}
} else
newDeleteQueries = null;
if (newDeleteTerms != null)
terms = newDeleteTerms;
if (newDeleteDocIDs != null)
docIDs = newDeleteDocIDs;
if (newDeleteQueries != null)
queries = newDeleteQueries;
}
}

View File

@ -0,0 +1,147 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Class that Posting and PostingVector use to write byte
* streams into shared fixed-size byte[] arrays. The idea
* is to allocate slices of increasing lengths For
* example, the first slice is 5 bytes, the next slice is
* 14, etc. We start by writing our bytes into the first
* 5 bytes. When we hit the end of the slice, we allocate
* the next slice and then write the address of the new
* slice into the last 4 bytes of the previous slice (the
* "forwarding address").
*
* Each slice is filled with 0's initially, and we mark
* the end with a non-zero byte. This way the methods
* that are writing into the slice don't need to record
* its length and instead allocate a new slice once they
* hit a non-zero byte. */
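/* A worked example of that growth, using the slice sizes from levelSizeArray
 * below: a stream starts in a 5 byte slice; when a write hits the non-zero end
 * marker, allocSlice copies the last 3 bytes forward into a fresh 14 byte slice,
 * writes the new slice's address over the last 4 bytes of the old one, and
 * writing continues in the new slice. Later overflows chain on through 20, 30,
 * 40, ... up to 200 byte slices. */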
import java.util.Arrays;
final class ByteBlockPool {
abstract static class Allocator {
abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
abstract byte[] getByteBlock(boolean trackAllocations);
}
public byte[][] buffers = new byte[10][];
int bufferUpto = -1; // Which buffer we are upto
public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE; // Where we are in head buffer
public byte[] buffer; // Current head buffer
public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset
private final boolean trackAllocations;
private final Allocator allocator;
public ByteBlockPool(Allocator allocator, boolean trackAllocations) {
this.allocator = allocator;
this.trackAllocations = trackAllocations;
}
public void reset() {
if (bufferUpto != -1) {
// We allocated at least one buffer
for(int i=0;i<bufferUpto;i++)
// Fully zero fill buffers that we fully used
Arrays.fill(buffers[i], (byte) 0);
// Partial zero fill the final buffer
Arrays.fill(buffers[bufferUpto], 0, byteUpto, (byte) 0);
if (bufferUpto > 0)
// Recycle all but the first buffer
allocator.recycleByteBlocks(buffers, 1, 1+bufferUpto);
// Re-use the first buffer
bufferUpto = 0;
byteUpto = 0;
byteOffset = 0;
buffer = buffers[0];
}
}
public void nextBuffer() {
if (1+bufferUpto == buffers.length) {
byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][];
System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
buffers = newBuffers;
}
buffer = buffers[1+bufferUpto] = allocator.getByteBlock(trackAllocations);
bufferUpto++;
byteUpto = 0;
byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE;
}
public int newSlice(final int size) {
if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size)
nextBuffer();
final int upto = byteUpto;
byteUpto += size;
buffer[byteUpto-1] = 16;
return upto;
}
// Size of each slice. These arrays should be at most 16
// elements (index is encoded with 4 bits). First array
// is just a compact way to encode X+1 with a max. Second
// array is the length of each slice, ie first slice is 5
// bytes, next slice is 14 bytes, etc.
final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
final static int FIRST_LEVEL_SIZE = levelSizeArray[0];
public int allocSlice(final byte[] slice, final int upto) {
final int level = slice[upto] & 15;
final int newLevel = nextLevelArray[level];
final int newSize = levelSizeArray[newLevel];
// Maybe allocate another block
if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize)
nextBuffer();
final int newUpto = byteUpto;
final int offset = newUpto + byteOffset;
byteUpto += newSize;
// Copy forward the past 3 bytes (which we are about
// to overwrite with the forwarding address):
buffer[newUpto] = slice[upto-3];
buffer[newUpto+1] = slice[upto-2];
buffer[newUpto+2] = slice[upto-1];
// Write forwarding address at end of last slice:
slice[upto-3] = (byte) (offset >>> 24);
slice[upto-2] = (byte) (offset >>> 16);
slice[upto-1] = (byte) (offset >>> 8);
slice[upto] = (byte) offset;
// Write new level:
buffer[byteUpto-1] = (byte) (16|newLevel);
return newUpto+3;
}
}

View File

@ -0,0 +1,149 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;
/* IndexInput that knows how to read the byte slices written
* by Posting and PostingVector. We read the bytes in
* each slice until we hit the end of that slice at which
* point we read the forwarding address of the next slice
* and then jump to it.*/
final class ByteSliceReader extends IndexInput {
ByteBlockPool pool;
int bufferUpto;
byte[] buffer;
public int upto;
int limit;
int level;
public int bufferOffset;
public int endIndex;
public void init(ByteBlockPool pool, int startIndex, int endIndex) {
assert endIndex-startIndex >= 0;
assert startIndex >= 0;
assert endIndex >= 0;
this.pool = pool;
this.endIndex = endIndex;
level = 0;
bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK;
final int firstSize = ByteBlockPool.levelSizeArray[0];
if (startIndex+firstSize >= endIndex) {
// There is only this one slice to read
limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK;
} else
limit = upto+firstSize-4;
}
public boolean eof() {
assert upto + bufferOffset <= endIndex;
return upto + bufferOffset == endIndex;
}
@Override
public byte readByte() {
assert !eof();
assert upto <= limit;
if (upto == limit)
nextSlice();
return buffer[upto++];
}
public long writeTo(IndexOutput out) throws IOException {
long size = 0;
while(true) {
if (limit + bufferOffset == endIndex) {
assert endIndex - bufferOffset >= upto;
out.writeBytes(buffer, upto, limit-upto);
size += limit-upto;
break;
} else {
out.writeBytes(buffer, upto, limit-upto);
size += limit-upto;
nextSlice();
}
}
return size;
}
public void nextSlice() {
// Skip to our next slice
final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff);
level = ByteBlockPool.nextLevelArray[level];
final int newSize = ByteBlockPool.levelSizeArray[level];
bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK;
if (nextIndex + newSize >= endIndex) {
// We are advancing to the final slice
assert endIndex - nextIndex > 0;
limit = endIndex - bufferOffset;
} else {
// This is not the final slice (subtract 4 for the
// forwarding address at the end of this new slice)
limit = upto+newSize-4;
}
}
@Override
public void readBytes(byte[] b, int offset, int len) {
while(len > 0) {
final int numLeft = limit-upto;
if (numLeft < len) {
// Read entire slice
System.arraycopy(buffer, upto, b, offset, numLeft);
offset += numLeft;
len -= numLeft;
nextSlice();
} else {
// This slice is the last one
System.arraycopy(buffer, upto, b, offset, len);
upto += len;
break;
}
}
}
@Override
public long getFilePointer() {throw new RuntimeException("not implemented");}
@Override
public long length() {throw new RuntimeException("not implemented");}
@Override
public void seek(long pos) {throw new RuntimeException("not implemented");}
@Override
public void close() {throw new RuntimeException("not implemented");}
}

View File

@ -0,0 +1,89 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Class to write byte streams into slices of shared
* byte[]. This is used by DocumentsWriter to hold the
* posting list for many terms in RAM.
*/
final class ByteSliceWriter {
private byte[] slice;
private int upto;
private final ByteBlockPool pool;
int offset0;
public ByteSliceWriter(ByteBlockPool pool) {
this.pool = pool;
}
/**
* Set up the writer to write at address.
*/
public void init(int address) {
slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT];
assert slice != null;
upto = address & DocumentsWriter.BYTE_BLOCK_MASK;
offset0 = address;
assert upto < slice.length;
}
/** Write byte into byte slice stream */
public void writeByte(byte b) {
assert slice != null;
if (slice[upto] != 0) {
upto = pool.allocSlice(slice, upto);
slice = pool.buffer;
offset0 = pool.byteOffset;
assert slice != null;
}
slice[upto++] = b;
assert upto != slice.length;
}
public void writeBytes(final byte[] b, int offset, final int len) {
final int offsetEnd = offset + len;
while(offset < offsetEnd) {
if (slice[upto] != 0) {
// End marker
upto = pool.allocSlice(slice, upto);
slice = pool.buffer;
offset0 = pool.byteOffset;
}
slice[upto++] = b[offset++];
assert upto != slice.length;
}
}
public int getAddress() {
return upto + (offset0 & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
}
public void writeVInt(int i) {
while ((i & ~0x7F) != 0) {
writeByte((byte)((i & 0x7f) | 0x80));
i >>>= 7;
}
writeByte((byte) i);
}
}
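writeVInt() above uses Lucene's standard VInt layout: seven payload bits per byte, least-significant group first, with the high bit set on every byte except the last. The following standalone sketch shows the same encoding detached from the slice machinery; the class and method names are illustrative only.

import java.util.ArrayList;
import java.util.List;

// Illustration of the VInt encoding used by ByteSliceWriter.writeVInt:
// 7 bits of payload per byte, high bit set while more bytes follow.
public class VIntDemo {
  static List<Byte> encodeVInt(int i) {
    List<Byte> out = new ArrayList<Byte>();
    while ((i & ~0x7F) != 0) {
      out.add((byte) ((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    out.add((byte) i);
    return out;
  }

  public static void main(String[] args) {
    // 300 = 0b1_0010_1100 -> bytes 0xAC, 0x02
    System.out.println(encodeVInt(300)); // prints [-84, 2]
  }
}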


@ -0,0 +1,56 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
final class CharBlockPool {
public char[][] buffers = new char[10][];
int numBuffer;
int bufferUpto = -1; // Which buffer we are upto
public int charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; // Where we are in head buffer
public char[] buffer; // Current head buffer
public int charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset
final private DocumentsWriter docWriter;
public CharBlockPool(DocumentsWriter docWriter) {
this.docWriter = docWriter;
}
public void reset() {
docWriter.recycleCharBlocks(buffers, 1+bufferUpto);
bufferUpto = -1;
charUpto = DocumentsWriter.CHAR_BLOCK_SIZE;
charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE;
}
public void nextBuffer() {
if (1+bufferUpto == buffers.length) {
char[][] newBuffers = new char[(int) (buffers.length*1.5)][];
System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
buffers = newBuffers;
}
buffer = buffers[1+bufferUpto] = docWriter.getCharBlock();
bufferUpto++;
charUpto = 0;
charOffset += DocumentsWriter.CHAR_BLOCK_SIZE;
}
}


@ -0,0 +1,911 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.document.AbstractField; // for javadocs
import org.apache.lucene.document.Document;
import java.text.NumberFormat;
import java.io.PrintStream;
import java.io.IOException;
import java.io.File;
import java.util.Collection;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
/**
* Basic tool and API to check the health of an index and
* write a new segments file that removes reference to
* problematic segments.
*
* <p>As this tool checks every byte in the index, on a large
* index it can take quite a long time to run.
*
* <p><b>WARNING</b>: this tool and API are new and
* experimental and may change suddenly in the
* next release. Please make a complete backup of your
* index before using this to fix your index!
*/
public class CheckIndex {
private PrintStream infoStream;
private Directory dir;
/**
* Returned from {@link #checkIndex()} detailing the health and status of the index.
*
* <p><b>WARNING</b>: this API is new and experimental and is
* subject to change suddenly in the next release.
**/
public static class Status {
/** True if no problems were found with the index. */
public boolean clean;
/** True if we were unable to locate and load the segments_N file. */
public boolean missingSegments;
/** True if we were unable to open the segments_N file. */
public boolean cantOpenSegments;
/** True if we were unable to read the version number from segments_N file. */
public boolean missingSegmentVersion;
/** Name of latest segments_N file in the index. */
public String segmentsFileName;
/** Number of segments in the index. */
public int numSegments;
/** String description of the version of the index. */
public String segmentFormat;
/** Empty unless you passed specific segments list to check as optional 3rd argument.
* @see CheckIndex#checkIndex(List) */
public List<String> segmentsChecked = new ArrayList<String>();
/** True if the index was created with a newer version of Lucene than the CheckIndex tool. */
public boolean toolOutOfDate;
/** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */
public List<SegmentInfoStatus> segmentInfos = new ArrayList<SegmentInfoStatus>();
/** Directory index is in. */
public Directory dir;
/**
* SegmentInfos instance containing only segments that
* had no problems (this is used with the {@link CheckIndex#fixIndex}
* method to repair the index).
*/
SegmentInfos newSegments;
/** How many documents will be lost to bad segments. */
public int totLoseDocCount;
/** How many bad segments were found. */
public int numBadSegments;
/** True if we checked only specific segments ({@link
* #checkIndex(List)} was called with a non-null
* argument). */
public boolean partial;
/** Holds the userData of the last commit in the index */
public Map<String, String> userData;
/** Holds the status of each segment in the index.
* See {@link #segmentInfos}.
*
* <p><b>WARNING</b>: this API is new and experimental and is
* subject to change suddenly in the next release.
*/
public static class SegmentInfoStatus {
/** Name of the segment. */
public String name;
/** Document count (does not take deletions into account). */
public int docCount;
/** True if segment is compound file format. */
public boolean compound;
/** Number of files referenced by this segment. */
public int numFiles;
/** Net size (MB) of the files referenced by this
* segment. */
public double sizeMB;
/** Doc store offset, if this segment shares the doc
* store files (stored fields and term vectors) with
* other segments. This is -1 if it does not share. */
public int docStoreOffset = -1;
/** String of the shared doc store segment, or null if
* this segment does not share the doc store files. */
public String docStoreSegment;
/** True if the shared doc store files are compound file
* format. */
public boolean docStoreCompoundFile;
/** True if this segment has pending deletions. */
public boolean hasDeletions;
/** Name of the current deletions file name. */
public String deletionsFileName;
/** Number of deleted documents. */
public int numDeleted;
/** True if we were able to open a SegmentReader on this
* segment. */
public boolean openReaderPassed;
/** Number of fields in this segment. */
int numFields;
/** True if at least one of the fields in this segment
* does not omitTermFreqAndPositions.
* @see AbstractField#setOmitTermFreqAndPositions */
public boolean hasProx;
/** Map that includes certain
* debugging details that IndexWriter records into
* each segment it creates */
public Map<String,String> diagnostics;
/** Status for testing of field norms (null if field norms could not be tested). */
public FieldNormStatus fieldNormStatus;
/** Status for testing of indexed terms (null if indexed terms could not be tested). */
public TermIndexStatus termIndexStatus;
/** Status for testing of stored fields (null if stored fields could not be tested). */
public StoredFieldStatus storedFieldStatus;
/** Status for testing of term vectors (null if term vectors could not be tested). */
public TermVectorStatus termVectorStatus;
}
/**
* Status from testing field norms.
*/
public static final class FieldNormStatus {
/** Number of fields successfully tested */
public long totFields = 0L;
/** Exception thrown during term index test (null on success) */
public Throwable error = null;
}
/**
* Status from testing term index.
*/
public static final class TermIndexStatus {
/** Total term count */
public long termCount = 0L;
/** Total frequency across all terms. */
public long totFreq = 0L;
/** Total number of positions. */
public long totPos = 0L;
/** Exception thrown during term index test (null on success) */
public Throwable error = null;
}
/**
* Status from testing stored fields.
*/
public static final class StoredFieldStatus {
/** Number of documents tested. */
public int docCount = 0;
/** Total number of stored fields tested. */
public long totFields = 0;
/** Exception thrown during stored fields test (null on success) */
public Throwable error = null;
}
/**
* Status from testing term vectors.
*/
public static final class TermVectorStatus {
/** Number of documents tested. */
public int docCount = 0;
/** Total number of term vectors tested. */
public long totVectors = 0;
/** Exception thrown during term vector test (null on success) */
public Throwable error = null;
}
}
/** Create a new CheckIndex on the directory. */
public CheckIndex(Directory dir) {
this.dir = dir;
infoStream = null;
}
/** Set infoStream where messages should go. If null, no
* messages are printed */
public void setInfoStream(PrintStream out) {
infoStream = out;
}
private void msg(String msg) {
if (infoStream != null)
infoStream.println(msg);
}
private static class MySegmentTermDocs extends SegmentTermDocs {
int delCount;
MySegmentTermDocs(SegmentReader p) {
super(p);
}
@Override
public void seek(Term term) throws IOException {
super.seek(term);
delCount = 0;
}
@Override
protected void skippingDoc() throws IOException {
delCount++;
}
}
/** Returns a {@link Status} instance detailing
* the state of the index.
*
* <p>As this method checks every byte in the index, on a large
* index it can take quite a long time to run.
*
* <p><b>WARNING</b>: make sure
* you only call this when the index is not opened by any
* writer. */
public Status checkIndex() throws IOException {
return checkIndex(null);
}
/** Returns a {@link Status} instance detailing
* the state of the index.
*
* @param onlySegments list of specific segment names to check
*
* <p>As this method checks every byte in the specified
* segments, on a large index it can take quite a long
* time to run.
*
* <p><b>WARNING</b>: make sure
* you only call this when the index is not opened by any
* writer. */
public Status checkIndex(List<String> onlySegments) throws IOException {
NumberFormat nf = NumberFormat.getInstance();
SegmentInfos sis = new SegmentInfos();
Status result = new Status();
result.dir = dir;
try {
sis.read(dir);
} catch (Throwable t) {
msg("ERROR: could not read any segments file in directory");
result.missingSegments = true;
if (infoStream != null)
t.printStackTrace(infoStream);
return result;
}
final int numSegments = sis.size();
final String segmentsFileName = sis.getCurrentSegmentFileName();
IndexInput input = null;
try {
input = dir.openInput(segmentsFileName);
} catch (Throwable t) {
msg("ERROR: could not open segments file in directory");
if (infoStream != null)
t.printStackTrace(infoStream);
result.cantOpenSegments = true;
return result;
}
int format = 0;
try {
format = input.readInt();
} catch (Throwable t) {
msg("ERROR: could not read segment file version in directory");
if (infoStream != null)
t.printStackTrace(infoStream);
result.missingSegmentVersion = true;
return result;
} finally {
if (input != null)
input.close();
}
String sFormat = "";
boolean skip = false;
if (format == SegmentInfos.FORMAT)
sFormat = "FORMAT [Lucene Pre-2.1]";
if (format == SegmentInfos.FORMAT_LOCKLESS)
sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
else {
if (format == SegmentInfos.FORMAT_CHECKSUM)
sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
else if (format == SegmentInfos.FORMAT_DEL_COUNT)
sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
else if (format == SegmentInfos.FORMAT_HAS_PROX)
sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
else if (format == SegmentInfos.FORMAT_USER_DATA)
sFormat = "FORMAT_USER_DATA [Lucene 2.9]";
else if (format == SegmentInfos.FORMAT_DIAGNOSTICS)
sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
else if (format < SegmentInfos.CURRENT_FORMAT) {
sFormat = "int=" + format + " [newer version of Lucene than this tool]";
skip = true;
} else {
sFormat = format + " [Lucene 1.3 or prior]";
}
}
result.segmentsFileName = segmentsFileName;
result.numSegments = numSegments;
result.segmentFormat = sFormat;
result.userData = sis.getUserData();
String userDataString;
if (sis.getUserData().size() > 0) {
userDataString = " userData=" + sis.getUserData();
} else {
userDataString = "";
}
msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat + userDataString);
if (onlySegments != null) {
result.partial = true;
if (infoStream != null)
infoStream.print("\nChecking only these segments:");
for (String s : onlySegments) {
if (infoStream != null)
infoStream.print(" " + s);
}
result.segmentsChecked.addAll(onlySegments);
msg(":");
}
if (skip) {
msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
result.toolOutOfDate = true;
return result;
}
result.newSegments = (SegmentInfos) sis.clone();
result.newSegments.clear();
for(int i=0;i<numSegments;i++) {
final SegmentInfo info = sis.info(i);
if (onlySegments != null && !onlySegments.contains(info.name))
continue;
Status.SegmentInfoStatus segInfoStat = new Status.SegmentInfoStatus();
result.segmentInfos.add(segInfoStat);
msg(" " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
segInfoStat.name = info.name;
segInfoStat.docCount = info.docCount;
int toLoseDocCount = info.docCount;
SegmentReader reader = null;
try {
msg(" compound=" + info.getUseCompoundFile());
segInfoStat.compound = info.getUseCompoundFile();
msg(" hasProx=" + info.getHasProx());
segInfoStat.hasProx = info.getHasProx();
msg(" numFiles=" + info.files().size());
segInfoStat.numFiles = info.files().size();
msg(" size (MB)=" + nf.format(info.sizeInBytes()/(1024.*1024.)));
segInfoStat.sizeMB = info.sizeInBytes()/(1024.*1024.);
Map<String,String> diagnostics = info.getDiagnostics();
segInfoStat.diagnostics = diagnostics;
if (diagnostics.size() > 0) {
msg(" diagnostics = " + diagnostics);
}
final int docStoreOffset = info.getDocStoreOffset();
if (docStoreOffset != -1) {
msg(" docStoreOffset=" + docStoreOffset);
segInfoStat.docStoreOffset = docStoreOffset;
msg(" docStoreSegment=" + info.getDocStoreSegment());
segInfoStat.docStoreSegment = info.getDocStoreSegment();
msg(" docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile();
}
final String delFileName = info.getDelFileName();
if (delFileName == null){
msg(" no deletions");
segInfoStat.hasDeletions = false;
}
else{
msg(" has deletions [delFileName=" + delFileName + "]");
segInfoStat.hasDeletions = true;
segInfoStat.deletionsFileName = delFileName;
}
if (infoStream != null)
infoStream.print(" test: open reader.........");
reader = SegmentReader.get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR);
segInfoStat.openReaderPassed = true;
final int numDocs = reader.numDocs();
toLoseDocCount = numDocs;
if (reader.hasDeletions()) {
if (reader.deletedDocs.count() != info.getDelCount()) {
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
}
if (reader.deletedDocs.count() > reader.maxDoc()) {
throw new RuntimeException("too many deleted docs: maxDoc()=" + reader.maxDoc() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
}
if (info.docCount - numDocs != info.getDelCount()){
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
}
segInfoStat.numDeleted = info.docCount - numDocs;
msg("OK [" + (segInfoStat.numDeleted) + " deleted docs]");
} else {
if (info.getDelCount() != 0) {
throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
}
msg("OK");
}
if (reader.maxDoc() != info.docCount)
throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);
// Test getFieldNames()
if (infoStream != null) {
infoStream.print(" test: fields..............");
}
Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
msg("OK [" + fieldNames.size() + " fields]");
segInfoStat.numFields = fieldNames.size();
// Test Field Norms
segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);
// Test the Term Index
segInfoStat.termIndexStatus = testTermIndex(info, reader);
// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
// Test Term Vectors
segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);
// Rethrow the first exception we encountered
// This will cause stats for failed segments to be incremented properly
if (segInfoStat.fieldNormStatus.error != null) {
throw new RuntimeException("Field Norm test failed");
} else if (segInfoStat.termIndexStatus.error != null) {
throw new RuntimeException("Term Index test failed");
} else if (segInfoStat.storedFieldStatus.error != null) {
throw new RuntimeException("Stored Field test failed");
} else if (segInfoStat.termVectorStatus.error != null) {
throw new RuntimeException("Term Vector test failed");
}
msg("");
} catch (Throwable t) {
msg("FAILED");
String comment;
comment = "fixIndex() would remove reference to this segment";
msg(" WARNING: " + comment + "; full exception:");
if (infoStream != null)
t.printStackTrace(infoStream);
msg("");
result.totLoseDocCount += toLoseDocCount;
result.numBadSegments++;
continue;
} finally {
if (reader != null)
reader.close();
}
// Keeper
result.newSegments.add((SegmentInfo) info.clone());
}
if (0 == result.numBadSegments) {
result.clean = true;
msg("No problems were detected with this index.\n");
} else
msg("WARNING: " + result.numBadSegments + " broken segments (containing " + result.totLoseDocCount + " documents) detected");
return result;
}
/**
* Test field norms.
*/
private Status.FieldNormStatus testFieldNorms(Collection<String> fieldNames, SegmentReader reader) {
final Status.FieldNormStatus status = new Status.FieldNormStatus();
try {
// Test Field Norms
if (infoStream != null) {
infoStream.print(" test: field norms.........");
}
final byte[] b = new byte[reader.maxDoc()];
for (final String fieldName : fieldNames) {
reader.norms(fieldName, b, 0);
++status.totFields;
}
msg("OK [" + status.totFields + " fields]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test the term index.
*/
private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
final Status.TermIndexStatus status = new Status.TermIndexStatus();
try {
if (infoStream != null) {
infoStream.print(" test: terms, freq, prox...");
}
final TermEnum termEnum = reader.terms();
final TermPositions termPositions = reader.termPositions();
// Used only to count up # deleted docs for this term
final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);
final int maxDoc = reader.maxDoc();
while (termEnum.next()) {
status.termCount++;
final Term term = termEnum.term();
final int docFreq = termEnum.docFreq();
termPositions.seek(term);
int lastDoc = -1;
int freq0 = 0;
status.totFreq += docFreq;
while (termPositions.next()) {
freq0++;
final int doc = termPositions.doc();
final int freq = termPositions.freq();
if (doc <= lastDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
if (doc >= maxDoc)
throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
lastDoc = doc;
if (freq <= 0)
throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
int lastPos = -1;
status.totPos += freq;
for(int j=0;j<freq;j++) {
final int pos = termPositions.nextPosition();
if (pos < -1)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
if (pos < lastPos)
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
lastPos = pos;
}
}
// Now count how many deleted docs occurred in
// this term:
final int delCount;
if (reader.hasDeletions()) {
myTermDocs.seek(term);
while(myTermDocs.next()) { }
delCount = myTermDocs.delCount;
} else {
delCount = 0;
}
if (freq0 + delCount != docFreq) {
throw new RuntimeException("term " + term + " docFreq=" +
docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
}
}
msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test stored fields for a segment.
*/
private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
try {
if (infoStream != null) {
infoStream.print(" test: stored fields.......");
}
// Scan stored fields for all documents
for (int j = 0; j < info.docCount; ++j) {
if (!reader.isDeleted(j)) {
status.docCount++;
Document doc = reader.document(j);
status.totFields += doc.getFields().size();
}
}
// Validate docCount
if (status.docCount != reader.numDocs()) {
throw new RuntimeException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
}
msg("OK [" + status.totFields + " total field count; avg " +
format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/**
* Test term vectors for a segment.
*/
private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
final Status.TermVectorStatus status = new Status.TermVectorStatus();
try {
if (infoStream != null) {
infoStream.print(" test: term vectors........");
}
for (int j = 0; j < info.docCount; ++j) {
if (!reader.isDeleted(j)) {
status.docCount++;
TermFreqVector[] tfv = reader.getTermFreqVectors(j);
if (tfv != null) {
status.totVectors += tfv.length;
}
}
}
msg("OK [" + status.totVectors + " total vector count; avg " +
format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
} catch (Throwable e) {
msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
}
}
return status;
}
/** Repairs the index using previously returned result
* from {@link #checkIndex}. Note that this does not
* remove any of the unreferenced files after it's done;
* you must separately open an {@link IndexWriter}, which
* deletes unreferenced files when it's created.
*
* <p><b>WARNING</b>: this writes a
* new segments file into the index, effectively removing
* all documents in broken segments from the index.
* BE CAREFUL.
*
* <p><b>WARNING</b>: Make sure you only call this when the
* index is not opened by any writer. */
public void fixIndex(Status result) throws IOException {
if (result.partial)
throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)");
result.newSegments.commit(result.dir);
}
private static boolean assertsOn;
private static boolean testAsserts() {
assertsOn = true;
return true;
}
private static boolean assertsOn() {
assert testAsserts();
return assertsOn;
}
/** Command-line interface to check and fix an index.
<p>
Run it like this:
<pre>
java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
</pre>
<ul>
<li><code>-fix</code>: actually write a new segments_N file, removing any problematic segments
<li><code>-segment X</code>: only check the specified
segment(s). This can be specified multiple times,
to check more than one segment, eg <code>-segment _2
-segment _a</code>. You can't use this with the -fix
option.
</ul>
<p><b>WARNING</b>: <code>-fix</code> should only be used on an emergency basis as it will cause
documents (perhaps many) to be permanently removed from the index. Always make
a backup copy of your index before running this! Do not run this tool on an index
that is actively being written to. You have been warned!
<p> Run without -fix, this tool will open the index, report version information
and report any exceptions it hits and what action it would take if -fix were
specified. With -fix, this tool will remove any segments that have issues and
write a new segments_N file. This means all documents contained in the affected
segments will be removed.
<p>
This tool exits with exit code 1 if the index cannot be opened or has any
corruption, else 0.
*/
public static void main(String[] args) throws IOException, InterruptedException {
boolean doFix = false;
List<String> onlySegments = new ArrayList<String>();
String indexPath = null;
int i = 0;
while(i < args.length) {
if (args[i].equals("-fix")) {
doFix = true;
i++;
} else if (args[i].equals("-segment")) {
if (i == args.length-1) {
System.out.println("ERROR: missing name for -segment option");
System.exit(1);
}
onlySegments.add(args[i+1]);
i += 2;
} else {
if (indexPath != null) {
System.out.println("ERROR: unexpected extra argument '" + args[i] + "'");
System.exit(1);
}
indexPath = args[i];
i++;
}
}
if (indexPath == null) {
System.out.println("\nERROR: index path not specified");
System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" +
"\n" +
" -fix: actually write a new segments_N file, removing any problematic segments\n" +
" -segment X: only check the specified segments. This can be specified multiple\n" +
" times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
" You can't use this with the -fix option\n" +
"\n" +
"**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
"documents (perhaps many) to be permanently removed from the index. Always make\n" +
"a backup copy of your index before running this! Do not run this tool on an index\n" +
"that is actively being written to. You have been warned!\n" +
"\n" +
"Run without -fix, this tool will open the index, report version information\n" +
"and report any exceptions it hits and what action it would take if -fix were\n" +
"specified. With -fix, this tool will remove any segments that have issues and\n" +
"write a new segments_N file. This means all documents contained in the affected\n" +
"segments will be removed.\n" +
"\n" +
"This tool exits with exit code 1 if the index cannot be opened or has any\n" +
"corruption, else 0.\n");
System.exit(1);
}
if (!assertsOn())
System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
if (onlySegments.size() == 0)
onlySegments = null;
else if (doFix) {
System.out.println("ERROR: cannot specify both -fix and -segment");
System.exit(1);
}
System.out.println("\nOpening index @ " + indexPath + "\n");
Directory dir = null;
try {
dir = FSDirectory.open(new File(indexPath));
} catch (Throwable t) {
System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting");
t.printStackTrace(System.out);
System.exit(1);
}
CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(System.out);
Status result = checker.checkIndex(onlySegments);
if (result.missingSegments) {
System.exit(1);
}
if (!result.clean) {
if (!doFix) {
System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n");
} else {
System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n");
System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
for(int s=0;s<5;s++) {
Thread.sleep(1000);
System.out.println(" " + (5-s) + "...");
}
System.out.println("Writing...");
checker.fixIndex(result);
System.out.println("OK");
System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\"");
}
}
System.out.println("");
final int exitCode;
if (result != null && result.clean == true)
exitCode = 0;
else
exitCode = 1;
System.exit(exitCode);
}
}
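As a rough usage sketch (not part of this commit), the checks above can also be driven programmatically through the public API shown in this file. The class name CheckIndexDemo and the index path are placeholders.

import java.io.File;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

// Hypothetical driver for CheckIndex; "/path/to/index" is a placeholder.
public class CheckIndexDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index"));
    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(System.out);              // print per-segment details
    CheckIndex.Status status = checker.checkIndex();
    if (!status.clean) {
      // fixIndex() writes a new segments_N that drops broken segments; back up first!
      checker.fixIndex(status);
    }
    dir.close();
  }
}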


@ -0,0 +1,281 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import java.util.HashMap;
import java.io.IOException;
/**
* Class for accessing a compound stream.
* This class implements a directory, but is limited to only read operations.
* Directory methods that would normally modify data throw an exception.
*/
class CompoundFileReader extends Directory {
private int readBufferSize;
private static final class FileEntry {
long offset;
long length;
}
// Base info
private Directory directory;
private String fileName;
private IndexInput stream;
private HashMap<String,FileEntry> entries = new HashMap<String,FileEntry>();
public CompoundFileReader(Directory dir, String name) throws IOException {
this(dir, name, BufferedIndexInput.BUFFER_SIZE);
}
public CompoundFileReader(Directory dir, String name, int readBufferSize)
throws IOException
{
directory = dir;
fileName = name;
this.readBufferSize = readBufferSize;
boolean success = false;
try {
stream = dir.openInput(name, readBufferSize);
// read the directory and init files
int count = stream.readVInt();
FileEntry entry = null;
for (int i=0; i<count; i++) {
long offset = stream.readLong();
String id = stream.readString();
if (entry != null) {
// set length of the previous entry
entry.length = offset - entry.offset;
}
entry = new FileEntry();
entry.offset = offset;
entries.put(id, entry);
}
// set the length of the final entry
if (entry != null) {
entry.length = stream.length() - entry.offset;
}
success = true;
} finally {
if (! success && (stream != null)) {
try {
stream.close();
} catch (IOException e) { }
}
}
}
public Directory getDirectory() {
return directory;
}
public String getName() {
return fileName;
}
@Override
public synchronized void close() throws IOException {
if (stream == null)
throw new IOException("Already closed");
entries.clear();
stream.close();
stream = null;
}
@Override
public synchronized IndexInput openInput(String id)
throws IOException
{
// Default to readBufferSize passed in when we were opened
return openInput(id, readBufferSize);
}
@Override
public synchronized IndexInput openInput(String id, int readBufferSize)
throws IOException
{
if (stream == null)
throw new IOException("Stream closed");
FileEntry entry = entries.get(id);
if (entry == null)
throw new IOException("No sub-file with id " + id + " found");
return new CSIndexInput(stream, entry.offset, entry.length, readBufferSize);
}
/** Returns an array of strings, one for each file in the directory. */
@Override
public String[] listAll() {
String res[] = new String[entries.size()];
return entries.keySet().toArray(res);
}
/** Returns true iff a file with the given name exists. */
@Override
public boolean fileExists(String name) {
return entries.containsKey(name);
}
/** Returns the time the compound file was last modified. */
@Override
public long fileModified(String name) throws IOException {
return directory.fileModified(fileName);
}
/** Set the modified time of the compound file to now. */
@Override
public void touchFile(String name) throws IOException {
directory.touchFile(fileName);
}
/** Not implemented
* @throws UnsupportedOperationException */
@Override
public void deleteFile(String name)
{
throw new UnsupportedOperationException();
}
/** Not implemented
* @throws UnsupportedOperationException */
public void renameFile(String from, String to)
{
throw new UnsupportedOperationException();
}
/** Returns the length of a file in the directory.
* @throws IOException if the file does not exist */
@Override
public long fileLength(String name)
throws IOException
{
FileEntry e = entries.get(name);
if (e == null)
throw new IOException("File " + name + " does not exist");
return e.length;
}
/** Not implemented
* @throws UnsupportedOperationException */
@Override
public IndexOutput createOutput(String name)
{
throw new UnsupportedOperationException();
}
/** Not implemented
* @throws UnsupportedOperationException */
@Override
public Lock makeLock(String name)
{
throw new UnsupportedOperationException();
}
/** Implementation of an IndexInput that reads from a portion of the
* compound file. The visibility is left as "package" *only* because
* this helps with testing since JUnit test cases in a different class
* can then access package fields of this class.
*/
static final class CSIndexInput extends BufferedIndexInput {
IndexInput base;
long fileOffset;
long length;
CSIndexInput(final IndexInput base, final long fileOffset, final long length)
{
this(base, fileOffset, length, BufferedIndexInput.BUFFER_SIZE);
}
CSIndexInput(final IndexInput base, final long fileOffset, final long length, int readBufferSize)
{
super(readBufferSize);
this.base = (IndexInput)base.clone();
this.fileOffset = fileOffset;
this.length = length;
}
@Override
public Object clone() {
CSIndexInput clone = (CSIndexInput)super.clone();
clone.base = (IndexInput)base.clone();
clone.fileOffset = fileOffset;
clone.length = length;
return clone;
}
/** Expert: implements buffer refill. Reads bytes from the current
* position in the input.
* @param b the array to read bytes into
* @param offset the offset in the array to start storing bytes
* @param len the number of bytes to read
*/
@Override
protected void readInternal(byte[] b, int offset, int len)
throws IOException
{
long start = getFilePointer();
if(start + len > length)
throw new IOException("read past EOF");
base.seek(fileOffset + start);
base.readBytes(b, offset, len, false);
}
/** Expert: implements seek. Sets current position in this file, where
* the next {@link #readInternal(byte[],int,int)} will occur.
* @see #readInternal(byte[],int,int)
*/
@Override
protected void seekInternal(long pos) {}
/** Closes the stream to further operations. */
@Override
public void close() throws IOException {
base.close();
}
@Override
public long length() {
return length;
}
}
}
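For illustration, a package-private caller could enumerate and read the sub-files of a compound file as sketched below. The class name, index path, and file names ("_0.cfs", "_0.fnm") are made up; inside Lucene this class is normally used by SegmentReader rather than directly.

package org.apache.lucene.index;

import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;

// Hypothetical sketch: list and read the sub-files of a compound file.
// CompoundFileReader is package-private, so this lives in org.apache.lucene.index.
class CompoundFileReaderDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
    CompoundFileReader cfs = new CompoundFileReader(dir, "_0.cfs"); // placeholder name
    for (String sub : cfs.listAll()) {
      System.out.println(sub + " (" + cfs.fileLength(sub) + " bytes)");
    }
    IndexInput in = cfs.openInput("_0.fnm"); // read one sub-file; name is illustrative
    byte[] first = new byte[(int) Math.min(16, in.length())];
    in.readBytes(first, 0, first.length);
    in.close();
    cfs.close();
    dir.close();
  }
}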


@ -0,0 +1,247 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import java.util.LinkedList;
import java.util.HashSet;
import java.io.IOException;
/**
* Combines multiple files into a single compound file.
* The file format:<br>
* <ul>
* <li>VInt fileCount</li>
* <li>{Directory}
* fileCount entries with the following structure:</li>
* <ul>
* <li>long dataOffset</li>
* <li>String fileName</li>
* </ul>
* <li>{File Data}
* fileCount entries with the raw data of the corresponding file</li>
* </ul>
*
* The fileCount integer indicates how many files are contained in this compound
* file. The {directory} that follows has that many entries. Each directory entry
* contains a long pointer to the start of this file's data section, and a String
* with that file's name.
*/
final class CompoundFileWriter {
private static final class FileEntry {
/** source file */
String file;
/** temporary holder for the start of directory entry for this file */
long directoryOffset;
/** temporary holder for the start of this file's data section */
long dataOffset;
}
private Directory directory;
private String fileName;
private HashSet<String> ids;
private LinkedList<FileEntry> entries;
private boolean merged = false;
private SegmentMerger.CheckAbort checkAbort;
/** Create the compound stream in the specified file. The file name is the
* entire name (no extensions are added).
* @throws NullPointerException if <code>dir</code> or <code>name</code> is null
*/
public CompoundFileWriter(Directory dir, String name) {
this(dir, name, null);
}
CompoundFileWriter(Directory dir, String name, SegmentMerger.CheckAbort checkAbort) {
if (dir == null)
throw new NullPointerException("directory cannot be null");
if (name == null)
throw new NullPointerException("name cannot be null");
this.checkAbort = checkAbort;
directory = dir;
fileName = name;
ids = new HashSet<String>();
entries = new LinkedList<FileEntry>();
}
/** Returns the directory of the compound file. */
public Directory getDirectory() {
return directory;
}
/** Returns the name of the compound file. */
public String getName() {
return fileName;
}
/** Add a source stream. <code>file</code> is the string by which the
* sub-stream will be known in the compound stream.
*
* @throws IllegalStateException if this writer is closed
* @throws NullPointerException if <code>file</code> is null
* @throws IllegalArgumentException if a file with the same name
* has been added already
*/
public void addFile(String file) {
if (merged)
throw new IllegalStateException(
"Can't add extensions after merge has been called");
if (file == null)
throw new NullPointerException(
"file cannot be null");
if (! ids.add(file))
throw new IllegalArgumentException(
"File " + file + " already added");
FileEntry entry = new FileEntry();
entry.file = file;
entries.add(entry);
}
/** Merge files with the extensions added up to now.
* All files with these extensions are combined sequentially into the
* compound stream. After successful merge, the source files
* are deleted.
* @throws IllegalStateException if close() had been called before or
* if no file has been added to this object
*/
public void close() throws IOException {
if (merged)
throw new IllegalStateException(
"Merge already performed");
if (entries.isEmpty())
throw new IllegalStateException(
"No entries to merge have been defined");
merged = true;
// open the compound stream
IndexOutput os = null;
try {
os = directory.createOutput(fileName);
// Write the number of entries
os.writeVInt(entries.size());
// Write the directory with all offsets at 0.
// Remember the positions of directory entries so that we can
// adjust the offsets later
long totalSize = 0;
for (FileEntry fe : entries) {
fe.directoryOffset = os.getFilePointer();
os.writeLong(0); // for now
os.writeString(fe.file);
totalSize += directory.fileLength(fe.file);
}
// Pre-allocate size of file as optimization --
// this can potentially help IO performance as
// we write the file and also later during
// searching. It also uncovers a disk-full
// situation earlier and hopefully without
// actually filling disk to 100%:
final long finalLength = totalSize+os.getFilePointer();
os.setLength(finalLength);
// Open the files and copy their data into the stream.
// Remember the locations of each file's data section.
byte buffer[] = new byte[16384];
for (FileEntry fe : entries) {
fe.dataOffset = os.getFilePointer();
copyFile(fe, os, buffer);
}
// Write the data offsets into the directory of the compound stream
for (FileEntry fe : entries) {
os.seek(fe.directoryOffset);
os.writeLong(fe.dataOffset);
}
assert finalLength == os.length();
// Close the output stream. Set the os to null before trying to
// close so that if an exception occurs during the close, the
// finally clause below will not attempt to close the stream
// the second time.
IndexOutput tmp = os;
os = null;
tmp.close();
} finally {
if (os != null) try { os.close(); } catch (IOException e) { }
}
}
/** Copy the contents of the file with specified extension into the
* provided output stream. Use the provided buffer for moving data
* to reduce memory allocation.
*/
private void copyFile(FileEntry source, IndexOutput os, byte buffer[])
throws IOException
{
IndexInput is = null;
try {
long startPtr = os.getFilePointer();
is = directory.openInput(source.file);
long length = is.length();
long remainder = length;
int chunk = buffer.length;
while(remainder > 0) {
int len = (int) Math.min(chunk, remainder);
is.readBytes(buffer, 0, len, false);
os.writeBytes(buffer, len);
remainder -= len;
if (checkAbort != null)
// Roughly every 2 MB we will check if
// it's time to abort
checkAbort.work(80);
}
// Verify that remainder is 0
if (remainder != 0)
throw new IOException(
"Non-zero remainder length after copying: " + remainder
+ " (id: " + source.file + ", length: " + length
+ ", buffer size: " + chunk + ")");
// Verify that the output length diff is equal to original file
long endPtr = os.getFilePointer();
long diff = endPtr - startPtr;
if (diff != length)
throw new IOException(
"Difference in the output file offsets " + diff
+ " does not match the original file length " + length);
} finally {
if (is != null) is.close();
}
}
}
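A minimal sketch of how a caller drives this writer, matching the file-format description above. The directory path, compound-file name, and sub-file names are illustrative; inside Lucene this is done by SegmentMerger.

package org.apache.lucene.index;

import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

// Hypothetical sketch: pack a few per-segment files into one compound file.
// CompoundFileWriter is package-private, so this sits in org.apache.lucene.index.
class CompoundFileWriterDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
    CompoundFileWriter cfw = new CompoundFileWriter(dir, "_0.cfs"); // placeholder name
    cfw.addFile("_0.fnm"); // the added files must already exist in dir
    cfw.addFile("_0.frq");
    cfw.addFile("_0.prx");
    cfw.close(); // writes the directory table, copies the data, verifies lengths
    dir.close();
  }
}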


@ -0,0 +1,409 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ThreadInterruptedException;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
/** A {@link MergeScheduler} that runs each merge using a
* separate thread, up until a maximum number of threads
* ({@link #setMaxThreadCount}); once that many merges are
* running, the thread(s) that are updating the index will
* pause until one or more merges completes. This is a
* simple way to use concurrency in the indexing process
* without having to create and manage application level
* threads. */
public class ConcurrentMergeScheduler extends MergeScheduler {
private int mergeThreadPriority = -1;
protected List<MergeThread> mergeThreads = new ArrayList<MergeThread>();
// Max number of threads allowed to be merging at once
private int maxThreadCount = 1;
protected Directory dir;
private boolean closed;
protected IndexWriter writer;
protected int mergeThreadCount;
public ConcurrentMergeScheduler() {
if (allInstances != null) {
// Only for testing
addMyself();
}
}
/** Sets the max # simultaneous threads that may be
* running. If a merge is necessary yet we already have
* this many threads running, the incoming thread (that
* is calling add/updateDocument) will block until
* a merge thread has completed. */
public void setMaxThreadCount(int count) {
if (count < 1)
throw new IllegalArgumentException("count should be at least 1");
maxThreadCount = count;
}
/** Get the max # simultaneous threads that may be
* running. @see #setMaxThreadCount. */
public int getMaxThreadCount() {
return maxThreadCount;
}
/** Return the priority that merge threads run at. By
* default the priority is 1 plus the priority of (ie,
* slightly higher priority than) the first thread that
* calls merge. */
public synchronized int getMergeThreadPriority() {
initMergeThreadPriority();
return mergeThreadPriority;
}
/** Set the priority that merge threads run at. */
public synchronized void setMergeThreadPriority(int pri) {
if (pri > Thread.MAX_PRIORITY || pri < Thread.MIN_PRIORITY)
throw new IllegalArgumentException("priority must be in range " + Thread.MIN_PRIORITY + " .. " + Thread.MAX_PRIORITY + " inclusive");
mergeThreadPriority = pri;
final int numThreads = mergeThreadCount();
for(int i=0;i<numThreads;i++) {
MergeThread merge = mergeThreads.get(i);
merge.setThreadPriority(pri);
}
}
private boolean verbose() {
return writer != null && writer.verbose();
}
private void message(String message) {
if (verbose())
writer.message("CMS: " + message);
}
private synchronized void initMergeThreadPriority() {
if (mergeThreadPriority == -1) {
// Default to slightly higher priority than our
// calling thread
mergeThreadPriority = 1+Thread.currentThread().getPriority();
if (mergeThreadPriority > Thread.MAX_PRIORITY)
mergeThreadPriority = Thread.MAX_PRIORITY;
}
}
@Override
public void close() {
closed = true;
}
public synchronized void sync() {
while(mergeThreadCount() > 0) {
if (verbose())
message("now wait for threads; currently " + mergeThreads.size() + " still running");
final int count = mergeThreads.size();
if (verbose()) {
for(int i=0;i<count;i++)
message(" " + i + ": " + mergeThreads.get(i));
}
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
}
}
private synchronized int mergeThreadCount() {
int count = 0;
final int numThreads = mergeThreads.size();
for(int i=0;i<numThreads;i++)
if (mergeThreads.get(i).isAlive())
count++;
return count;
}
@Override
public void merge(IndexWriter writer)
throws CorruptIndexException, IOException {
assert !Thread.holdsLock(writer);
this.writer = writer;
initMergeThreadPriority();
dir = writer.getDirectory();
// First, quickly run through the newly proposed merges
// and add any orthogonal merges (ie a merge not
// involving segments already pending to be merged) to
// the queue. If we are way behind on merging, many of
// these newly proposed merges will likely already be
// registered.
if (verbose()) {
message("now merge");
message(" index: " + writer.segString());
}
// Iterate, pulling from the IndexWriter's queue of
// pending merges, until it's empty:
while(true) {
// TODO: we could be careful about which merges to do in
// the BG (eg maybe the "biggest" ones) vs FG, which
// merges to do first (the easiest ones?), etc.
MergePolicy.OneMerge merge = writer.getNextMerge();
if (merge == null) {
if (verbose())
message(" no more merges pending; now return");
return;
}
// We do this w/ the primary thread to keep
// deterministic assignment of segment names
writer.mergeInit(merge);
boolean success = false;
try {
synchronized(this) {
final MergeThread merger;
while (mergeThreadCount() >= maxThreadCount) {
if (verbose())
message(" too many merge threads running; stalling...");
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
}
if (verbose())
message(" consider merge " + merge.segString(dir));
assert mergeThreadCount() < maxThreadCount;
// OK to spawn a new merge thread to handle this
// merge:
merger = getMergeThread(writer, merge);
mergeThreads.add(merger);
if (verbose())
message(" launch new thread [" + merger.getName() + "]");
merger.start();
success = true;
}
} finally {
if (!success) {
writer.mergeFinish(merge);
}
}
}
}
/** Does the actual merge, by calling {@link IndexWriter#merge} */
protected void doMerge(MergePolicy.OneMerge merge)
throws IOException {
writer.merge(merge);
}
/** Create and return a new MergeThread */
protected synchronized MergeThread getMergeThread(IndexWriter writer, MergePolicy.OneMerge merge) throws IOException {
final MergeThread thread = new MergeThread(writer, merge);
thread.setThreadPriority(mergeThreadPriority);
thread.setDaemon(true);
thread.setName("Lucene Merge Thread #" + mergeThreadCount++);
return thread;
}
protected class MergeThread extends Thread {
IndexWriter writer;
MergePolicy.OneMerge startMerge;
MergePolicy.OneMerge runningMerge;
public MergeThread(IndexWriter writer, MergePolicy.OneMerge startMerge) throws IOException {
this.writer = writer;
this.startMerge = startMerge;
}
public synchronized void setRunningMerge(MergePolicy.OneMerge merge) {
runningMerge = merge;
}
public synchronized MergePolicy.OneMerge getRunningMerge() {
return runningMerge;
}
public void setThreadPriority(int pri) {
try {
setPriority(pri);
} catch (NullPointerException npe) {
// Strangely, Sun's JDK 1.5 on Linux sometimes
// throws NPE out of here...
} catch (SecurityException se) {
// Ignore this because we will still run fine with
// normal thread priority
}
}
@Override
public void run() {
// First time through the while loop we do the merge
// that we were started with:
MergePolicy.OneMerge merge = this.startMerge;
try {
if (verbose())
message(" merge thread: start");
while(true) {
setRunningMerge(merge);
doMerge(merge);
// Subsequent times through the loop we do any new
// merge that writer says is necessary:
merge = writer.getNextMerge();
if (merge != null) {
writer.mergeInit(merge);
if (verbose())
message(" merge thread: do another merge " + merge.segString(dir));
} else
break;
}
if (verbose())
message(" merge thread: done");
} catch (Throwable exc) {
// Ignore the exception if it was due to abort:
if (!(exc instanceof MergePolicy.MergeAbortedException)) {
if (!suppressExceptions) {
// suppressExceptions is normally only set during
// testing.
anyExceptions = true;
handleMergeException(exc);
}
}
} finally {
synchronized(ConcurrentMergeScheduler.this) {
ConcurrentMergeScheduler.this.notifyAll();
boolean removed = mergeThreads.remove(this);
assert removed;
}
}
}
@Override
public String toString() {
MergePolicy.OneMerge merge = getRunningMerge();
if (merge == null)
merge = startMerge;
return "merge thread: " + merge.segString(dir);
}
}
/** Called when an exception is hit in a background merge
* thread */
protected void handleMergeException(Throwable exc) {
try {
// When an exception is hit during merge, IndexWriter
// removes any partial files and then allows another
// merge to run. If whatever caused the error is not
// transient then the exception will keep happening,
// so, we sleep here to avoid saturating CPU in such
// cases:
Thread.sleep(250);
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
throw new MergePolicy.MergeException(exc, dir);
}
static boolean anyExceptions = false;
/** Used for testing */
public static boolean anyUnhandledExceptions() {
if (allInstances == null) {
throw new RuntimeException("setTestMode() was not called; often this is because your test case's setUp method fails to call super.setUp in LuceneTestCase");
}
synchronized(allInstances) {
final int count = allInstances.size();
// Make sure all outstanding threads are done so we see
// any exceptions they may produce:
for(int i=0;i<count;i++)
allInstances.get(i).sync();
boolean v = anyExceptions;
anyExceptions = false;
return v;
}
}
public static void clearUnhandledExceptions() {
synchronized(allInstances) {
anyExceptions = false;
}
}
/** Used for testing */
private void addMyself() {
synchronized(allInstances) {
final int size = allInstances.size();
int upto = 0;
for(int i=0;i<size;i++) {
final ConcurrentMergeScheduler other = allInstances.get(i);
if (!(other.closed && 0 == other.mergeThreadCount()))
// Keep this one for now: it still has threads or
// may spawn new threads
allInstances.set(upto++, other);
}
allInstances.subList(upto, allInstances.size()).clear();
allInstances.add(this);
}
}
private boolean suppressExceptions;
/** Used for testing */
void setSuppressExceptions() {
suppressExceptions = true;
}
/** Used for testing */
void clearSuppressExceptions() {
suppressExceptions = false;
}
/** Used for testing */
private static List<ConcurrentMergeScheduler> allInstances;
public static void setTestMode() {
allInstances = new ArrayList<ConcurrentMergeScheduler>();
}
}
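A brief configuration sketch using only the setters shown in this file. The class name MergeSchedulerDemo is a placeholder, and attaching the scheduler to a writer via setMergeScheduler is an assumption about the IndexWriter API of this era, so it is left commented out.

import org.apache.lucene.index.ConcurrentMergeScheduler;

// Hypothetical configuration sketch for ConcurrentMergeScheduler.
public class MergeSchedulerDemo {
  public static void main(String[] args) {
    ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
    cms.setMaxThreadCount(3);                              // allow up to 3 concurrent merge threads
    cms.setMergeThreadPriority(Thread.NORM_PRIORITY + 2);  // run merges slightly above indexing threads
    System.out.println("max merge threads: " + cms.getMaxThreadCount());
    // writer.setMergeScheduler(cms); // assumed writer hook, not shown in this commit
  }
}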

Some files were not shown because too many files have changed in this diff.