mirror of https://github.com/apache/lucene.git
Initial revision
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149570 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
749b4aaf7e
commit
bd3948c539
|
@ -0,0 +1,87 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import com.lucene.store.Directory;
|
||||
import com.lucene.store.FSDirectory;
|
||||
import com.lucene.index.IndexReader;
|
||||
import com.lucene.index.Term;
|
||||
|
||||
class DeleteFiles {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Directory directory = FSDirectory.getDirectory("demo index", false);
|
||||
IndexReader reader = IndexReader.open(directory);
|
||||
|
||||
// Term term = new Term("path", "pizza");
|
||||
// int deleted = reader.delete(term);
|
||||
|
||||
// System.out.println("deleted " + deleted +
|
||||
// " documents containing " + term);
|
||||
|
||||
for (int i = 0; i < reader.maxDoc(); i++)
|
||||
reader.delete(i);
|
||||
|
||||
reader.close();
|
||||
directory.close();
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import com.lucene.document.Document;
|
||||
import com.lucene.document.Field;
|
||||
import com.lucene.document.DateField;
|
||||
|
||||
/** A utility for making Lucene Documents from a File. */
|
||||
|
||||
public class FileDocument {
|
||||
/** Makes a document for a File.
|
||||
<p>
|
||||
The document has three fields:
|
||||
<ul>
|
||||
<li><code>path</code>--containing the pathname of the file, as a stored,
|
||||
tokenized field;
|
||||
<li><code>modified</code>--containing the last modified date of the file as
|
||||
a keyword field as encoded by <a
|
||||
href="lucene.document.DateField.html">DateField</a>; and
|
||||
<li><code>contents</code>--containing the full contents of the file, as a
|
||||
Reader field;
|
||||
*/
|
||||
public static Document Document(File f)
|
||||
throws java.io.FileNotFoundException {
|
||||
|
||||
// make a new, empty document
|
||||
Document doc = new Document();
|
||||
|
||||
// Add the path of the file as a field named "path". Use a Text field, so
|
||||
// that the index stores the path, and so that the path is searchable
|
||||
doc.add(Field.Text("path", f.getPath()));
|
||||
|
||||
// Add the last modified date of the file a field named "modified". Use a
|
||||
// Keyword field, so that it's searchable, but so that no attempt is made
|
||||
// to tokenize the field into words.
|
||||
doc.add(Field.Keyword("modified",
|
||||
DateField.timeToString(f.lastModified())));
|
||||
|
||||
// Add the contents of the file a field named "contents". Use a Text
|
||||
// field, specifying a Reader, so that the text of the file is tokenized.
|
||||
// ?? why doesn't FileReader work here ??
|
||||
FileInputStream is = new FileInputStream(f);
|
||||
Reader reader = new BufferedReader(new InputStreamReader(is));
|
||||
doc.add(Field.Text("contents", reader));
|
||||
|
||||
// return the document
|
||||
return doc;
|
||||
}
|
||||
|
||||
private FileDocument() {}
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
import com.lucene.document.*;
|
||||
import demo.HTMLParser.HTMLParser;
|
||||
|
||||
/** A utility for making Lucene Documents for HTML documents. */
|
||||
|
||||
public class HTMLDocument {
|
||||
static char dirSep = System.getProperty("file.separator").charAt(0);
|
||||
|
||||
public static String uid(File f) {
|
||||
// Append path and date into a string in such a way that lexicographic
|
||||
// sorting gives the same results as a walk of the file hierarchy. Thus
|
||||
// null (\u0000) is used both to separate directory components and to
|
||||
// separate the path from the date.
|
||||
return f.getPath().replace(dirSep, '\u0000') +
|
||||
"\u0000" +
|
||||
DateField.timeToString(f.lastModified());
|
||||
}
|
||||
|
||||
public static String uid2url(String uid) {
|
||||
String url = uid.replace('\u0000', '/'); // replace nulls with slashes
|
||||
return url.substring(0, url.lastIndexOf('/')); // remove date from end
|
||||
}
|
||||
|
||||
public static Document Document(File f)
|
||||
throws IOException, InterruptedException {
|
||||
// make a new, empty document
|
||||
Document doc = new Document();
|
||||
|
||||
// Add the url as a field named "url". Use an UnIndexed field, so
|
||||
// that the url is just stored with the document, but is not searchable.
|
||||
doc.add(Field.UnIndexed("url", f.getPath().replace(dirSep, '/')));
|
||||
|
||||
// Add the last modified date of the file a field named "modified". Use a
|
||||
// Keyword field, so that it's searchable, but so that no attempt is made
|
||||
// to tokenize the field into words.
|
||||
doc.add(Field.Keyword("modified",
|
||||
DateField.timeToString(f.lastModified())));
|
||||
|
||||
// Add the uid as a field, so that index can be incrementally maintained.
|
||||
// This field is not stored with document, it is indexed, but it is not
|
||||
// tokenized prior to indexing.
|
||||
doc.add(new Field("uid", uid(f), false, true, false));
|
||||
|
||||
HTMLParser parser = new HTMLParser(f);
|
||||
|
||||
// Add the tag-stripped contents as a Reader-valued Text field so it will
|
||||
// get tokenized and indexed.
|
||||
doc.add(Field.Text("contents", parser.getReader()));
|
||||
|
||||
// Add the summary as an UnIndexed field, so that it is stored and returned
|
||||
// with hit documents for display.
|
||||
doc.add(Field.UnIndexed("summary", parser.getSummary()));
|
||||
|
||||
// Add the title as a separate Text field, so that it can be searched
|
||||
// separately.
|
||||
doc.add(Field.Text("title", parser.getTitle()));
|
||||
|
||||
// return the document
|
||||
return doc;
|
||||
}
|
||||
|
||||
private HTMLDocument() {}
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
HTMLParser.java
|
||||
HTMLParserTokenManager.java
|
||||
TokenMgrError.java
|
||||
ParseException.java
|
||||
Token.java
|
||||
ASCII_CharStream.java
|
||||
HTMLParserConstants.java
|
|
@ -0,0 +1,365 @@
|
|||
package demo.HTMLParser;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Translation between characters and HTML character entity references.
 * Entity names are stored with their leading ampersand and without the
 * trailing semicolon (for example <code>&amp;nbsp</code>).
 */
public class Entities {
  /** Maps an entity name such as "&amp;amp" to its one-character string. */
  static final Hashtable decoder = new Hashtable(300);
  /** Entity name for each character below 0x100, or null when it has none. */
  static final String[] encoder = new String[0x100];

  /**
   * Decodes one entity reference.
   *
   * @param entity the reference including the leading ampersand, with or
   *        without the trailing semicolon; either a name
   *        (<code>&amp;amp;</code>) or a decimal or hexadecimal numeric
   *        reference (<code>&amp;#65;</code>, <code>&amp;#x41;</code>)
   * @return the decoded text, or the empty string when the reference is
   *         unknown, malformed or outside the Unicode range
   */
  static final String decode(String entity) {
    if (entity.length() == 0)
      return "";
    if (entity.charAt(entity.length()-1) == ';')  // remove trailing semicolon
      entity = entity.substring(0, entity.length()-1);

    if (entity.length() > 1 && entity.charAt(1) == '#') {
      int start = 2;
      int radix = 10;
      if (entity.length() > 2
          && (entity.charAt(2) == 'X' || entity.charAt(2) == 'x')) {
        start++;
        radix = 16;
      }
      int code;
      try {
        code = Integer.parseInt(entity.substring(start), radix);
      } catch (NumberFormatException e) {
        return "";                                // no digits, or not a number
      }
      if (code < 0 || code > 0x10FFFF)
        return "";                                // not a Unicode code point
      if (code < 0x10000)
        return String.valueOf((char)code);
      // Code points beyond the BMP need a surrogate pair; a plain (char)
      // cast would silently truncate them to an unrelated character.
      int offset = code - 0x10000;
      char[] pair = {
        (char)(0xD800 + (offset >> 10)),
        (char)(0xDC00 + (offset & 0x3FF))
      };
      return new String(pair);
    }

    String s = (String)decoder.get(entity);
    return (s != null) ? s : "";
  }

  /**
   * Encodes a string for inclusion in HTML.  Characters below 0x100 that
   * have a named entity are written as that entity, other ASCII characters
   * are copied unchanged, and everything else is written as a decimal
   * numeric reference.
   *
   * @param s the text to encode
   * @return the encoded text
   */
  static final public String encode(String s) {
    int length = s.length();
    StringBuffer buffer = new StringBuffer(length * 2);
    for (int i = 0; i < length; i++) {
      char c = s.charAt(i);
      int j = (int)c;
      if (j < 0x100 && encoder[j] != null) {
        buffer.append(encoder[j]);                // have a named encoding
        buffer.append(';');
      } else if (j < 0x80) {
        buffer.append(c);                         // use ASCII value
      } else {
        buffer.append("&#");                      // use numeric encoding
        buffer.append((int)c);
        buffer.append(';');
      }
    }
    return buffer.toString();
  }

  /**
   * Registers an entity in the decoder table and, for values below 0x100,
   * in the encoder table as well.
   *
   * @param entity the entity name with leading ampersand, no semicolon
   * @param value the character value the entity stands for
   */
  static final void add(String entity, int value) {
    decoder.put(entity, String.valueOf((char)value));
    if (value < 0x100)
      encoder[value] = entity;
  }

  static {
    // Latin-1 entities
    add("&nbsp",   160);
    add("&iexcl",  161);
    add("&cent",   162);
    add("&pound",  163);
    add("&curren", 164);
    add("&yen",    165);
    add("&brvbar", 166);
    add("&sect",   167);
    add("&uml",    168);
    add("&copy",   169);
    add("&ordf",   170);
    add("&laquo",  171);
    add("&not",    172);
    add("&shy",    173);
    add("&reg",    174);
    add("&macr",   175);
    add("&deg",    176);
    add("&plusmn", 177);
    add("&sup2",   178);
    add("&sup3",   179);
    add("&acute",  180);
    add("&micro",  181);
    add("&para",   182);
    add("&middot", 183);
    add("&cedil",  184);
    add("&sup1",   185);
    add("&ordm",   186);
    add("&raquo",  187);
    add("&frac14", 188);
    add("&frac12", 189);
    add("&frac34", 190);
    add("&iquest", 191);
    add("&Agrave", 192);
    add("&Aacute", 193);
    add("&Acirc",  194);
    add("&Atilde", 195);
    add("&Auml",   196);
    add("&Aring",  197);
    add("&AElig",  198);
    add("&Ccedil", 199);
    add("&Egrave", 200);
    add("&Eacute", 201);
    add("&Ecirc",  202);
    add("&Euml",   203);
    add("&Igrave", 204);
    add("&Iacute", 205);
    add("&Icirc",  206);
    add("&Iuml",   207);
    add("&ETH",    208);
    add("&Ntilde", 209);
    add("&Ograve", 210);
    add("&Oacute", 211);
    add("&Ocirc",  212);
    add("&Otilde", 213);
    add("&Ouml",   214);
    add("&times",  215);
    add("&Oslash", 216);
    add("&Ugrave", 217);
    add("&Uacute", 218);
    add("&Ucirc",  219);
    add("&Uuml",   220);
    add("&Yacute", 221);
    add("&THORN",  222);
    add("&szlig",  223);
    add("&agrave", 224);
    add("&aacute", 225);
    add("&acirc",  226);
    add("&atilde", 227);
    add("&auml",   228);
    add("&aring",  229);
    add("&aelig",  230);
    add("&ccedil", 231);
    add("&egrave", 232);
    add("&eacute", 233);
    add("&ecirc",  234);
    add("&euml",   235);
    add("&igrave", 236);
    add("&iacute", 237);
    add("&icirc",  238);
    add("&iuml",   239);
    add("&eth",    240);
    add("&ntilde", 241);
    add("&ograve", 242);
    add("&oacute", 243);
    add("&ocirc",  244);
    add("&otilde", 245);
    add("&ouml",   246);
    add("&divide", 247);
    add("&oslash", 248);
    add("&ugrave", 249);
    add("&uacute", 250);
    add("&ucirc",  251);
    add("&uuml",   252);
    add("&yacute", 253);
    add("&thorn",  254);
    add("&yuml",   255);

    // Symbols, mathematical symbols and Greek letters
    add("&fnof",   402);
    add("&Alpha",  913);
    add("&Beta",   914);
    add("&Gamma",  915);
    add("&Delta",  916);
    add("&Epsilon",917);
    add("&Zeta",   918);
    add("&Eta",    919);
    add("&Theta",  920);
    add("&Iota",   921);
    add("&Kappa",  922);
    add("&Lambda", 923);
    add("&Mu",     924);
    add("&Nu",     925);
    add("&Xi",     926);
    add("&Omicron",927);
    add("&Pi",     928);
    add("&Rho",    929);
    add("&Sigma",  931);
    add("&Tau",    932);
    add("&Upsilon",933);
    add("&Phi",    934);
    add("&Chi",    935);
    add("&Psi",    936);
    add("&Omega",  937);
    add("&alpha",  945);
    add("&beta",   946);
    add("&gamma",  947);
    add("&delta",  948);
    add("&epsilon",949);
    add("&zeta",   950);
    add("&eta",    951);
    add("&theta",  952);
    add("&iota",   953);
    add("&kappa",  954);
    add("&lambda", 955);
    add("&mu",     956);
    add("&nu",     957);
    add("&xi",     958);
    add("&omicron",959);
    add("&pi",     960);
    add("&rho",    961);
    add("&sigmaf", 962);
    add("&sigma",  963);
    add("&tau",    964);
    add("&upsilon",965);
    add("&phi",    966);
    add("&chi",    967);
    add("&psi",    968);
    add("&omega",  969);
    add("&thetasym",977);
    add("&upsih",  978);
    add("&piv",    982);
    add("&bull",   8226);
    add("&hellip", 8230);
    add("&prime",  8242);
    add("&Prime",  8243);
    add("&oline",  8254);
    add("&frasl",  8260);
    add("&weierp", 8472);
    add("&image",  8465);
    add("&real",   8476);
    add("&trade",  8482);
    add("&alefsym",8501);
    add("&larr",   8592);
    add("&uarr",   8593);
    add("&rarr",   8594);
    add("&darr",   8595);
    add("&harr",   8596);
    add("&crarr",  8629);
    add("&lArr",   8656);
    add("&uArr",   8657);
    add("&rArr",   8658);
    add("&dArr",   8659);
    add("&hArr",   8660);
    add("&forall", 8704);
    add("&part",   8706);
    add("&exist",  8707);
    add("&empty",  8709);
    add("&nabla",  8711);
    add("&isin",   8712);
    add("&notin",  8713);
    add("&ni",     8715);
    add("&prod",   8719);
    add("&sum",    8721);
    add("&minus",  8722);
    add("&lowast", 8727);
    add("&radic",  8730);
    add("&prop",   8733);
    add("&infin",  8734);
    add("&ang",    8736);
    add("&and",    8743);
    add("&or",     8744);
    add("&cap",    8745);
    add("&cup",    8746);
    add("&int",    8747);
    add("&there4", 8756);
    add("&sim",    8764);
    add("&cong",   8773);
    add("&asymp",  8776);
    add("&ne",     8800);
    add("&equiv",  8801);
    add("&le",     8804);
    add("&ge",     8805);
    add("&sub",    8834);
    add("&sup",    8835);
    add("&nsub",   8836);
    add("&sube",   8838);
    add("&supe",   8839);
    add("&oplus",  8853);
    add("&otimes", 8855);
    add("&perp",   8869);
    add("&sdot",   8901);
    add("&lceil",  8968);
    add("&rceil",  8969);
    add("&lfloor", 8970);
    add("&rfloor", 8971);
    add("&lang",   9001);
    add("&rang",   9002);
    add("&loz",    9674);
    add("&spades", 9824);
    add("&clubs",  9827);
    add("&hearts", 9829);
    add("&diams",  9830);

    // Markup-significant and internationalization entities
    add("&quot",   34);
    add("&amp",    38);
    add("&lt",     60);
    add("&gt",     62);
    add("&OElig",  338);
    add("&oelig",  339);
    add("&Scaron", 352);
    add("&scaron", 353);
    add("&Yuml",   376);
    add("&circ",   710);
    add("&tilde",  732);
    add("&ensp",   8194);
    add("&emsp",   8195);
    add("&thinsp", 8201);
    add("&zwnj",   8204);
    add("&zwj",    8205);
    add("&lrm",    8206);
    add("&rlm",    8207);
    add("&ndash",  8211);
    add("&mdash",  8212);
    add("&lsquo",  8216);
    add("&rsquo",  8217);
    add("&sbquo",  8218);
    add("&ldquo",  8220);
    add("&rdquo",  8221);
    add("&bdquo",  8222);
    add("&dagger", 8224);
    add("&Dagger", 8225);
    add("&permil", 8240);
    add("&lsaquo", 8249);
    add("&rsaquo", 8250);
    add("&euro",   8364);
  }
}
|
|
@ -0,0 +1,347 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
// HTMLParser.jj
|
||||
|
||||
options {
|
||||
STATIC = false;
|
||||
OPTIMIZE_TOKEN_MANAGER = true;
|
||||
//DEBUG_LOOKAHEAD = true;
|
||||
//DEBUG_TOKEN_MANAGER = true;
|
||||
}
|
||||
|
||||
PARSER_BEGIN(HTMLParser)
|
||||
|
||||
package org.apache.lucene.HTMLParser;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
public class HTMLParser {
|
||||
public static int SUMMARY_LENGTH = 200;
|
||||
|
||||
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
|
||||
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
|
||||
int length = 0;
|
||||
boolean titleComplete = false;
|
||||
boolean inTitle = false;
|
||||
boolean inScript = false;
|
||||
boolean afterTag = false;
|
||||
boolean afterSpace = false;
|
||||
String eol = System.getProperty("line.separator");
|
||||
PipedReader pipeIn = null;
|
||||
PipedWriter pipeOut;
|
||||
|
||||
public HTMLParser(File file) throws FileNotFoundException {
|
||||
this(new FileInputStream(file));
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || (length > SUMMARY_LENGTH))
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
public String getSummary() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (summary.length() >= SUMMARY_LENGTH)
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
if (summary.length() > SUMMARY_LENGTH)
|
||||
summary.setLength(SUMMARY_LENGTH);
|
||||
|
||||
String sum = summary.toString().trim();
|
||||
String tit = getTitle();
|
||||
if (sum.startsWith(tit))
|
||||
return sum.substring(tit.length());
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
|
||||
public Reader getReader() throws IOException {
|
||||
if (pipeIn == null) {
|
||||
pipeIn = new PipedReader();
|
||||
pipeOut = new PipedWriter(pipeIn);
|
||||
|
||||
Thread thread = new ParserThread(this);
|
||||
thread.start(); // start parsing
|
||||
}
|
||||
|
||||
return pipeIn;
|
||||
}
|
||||
|
||||
void addToSummary(String text) {
|
||||
if (summary.length() < SUMMARY_LENGTH) {
|
||||
summary.append(text);
|
||||
if (summary.length() >= SUMMARY_LENGTH) {
|
||||
synchronized(this) {
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void addText(String text) throws IOException {
|
||||
if (inScript)
|
||||
return;
|
||||
if (inTitle)
|
||||
title.append(text);
|
||||
else {
|
||||
addToSummary(text);
|
||||
if (!titleComplete && !title.equals("")) { // finished title
|
||||
synchronized(this) {
|
||||
titleComplete = true; // tell waiting threads
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
length += text.length();
|
||||
pipeOut.write(text);
|
||||
|
||||
afterSpace = false;
|
||||
}
|
||||
|
||||
void addSpace() throws IOException {
|
||||
if (inScript)
|
||||
return;
|
||||
if (!afterSpace) {
|
||||
if (inTitle)
|
||||
title.append(" ");
|
||||
else
|
||||
addToSummary(" ");
|
||||
|
||||
String space = afterTag ? eol : " ";
|
||||
length += space.length();
|
||||
pipeOut.write(space);
|
||||
afterSpace = true;
|
||||
}
|
||||
}
|
||||
|
||||
// void handleException(Exception e) {
|
||||
// System.out.println(e.toString()); // print the error message
|
||||
// System.out.println("Skipping...");
|
||||
// Token t;
|
||||
// do {
|
||||
// t = getNextToken();
|
||||
// } while (t.kind != TagEnd);
|
||||
// }
|
||||
}
|
||||
|
||||
PARSER_END(HTMLParser)
|
||||
|
||||
|
||||
// Top-level production: consumes the whole input, up to <EOF>, as a sequence
// of tags, declarations, comments, words, entities, punctuation and
// whitespace.
//
// Text-like tokens are forwarded to addText() (entities are decoded first),
// whitespace is forwarded to addSpace(). The afterTag flag is set after every
// tag-like construct and cleared after every text or whitespace token; the
// action for <Space> calls addSpace() before clearing it.
//
// Throws IOException because addText()/addSpace() declare it.
void HTMLDocument() throws IOException :
|
||||
{
|
||||
Token t;
|
||||
}
|
||||
{
|
||||
// try {
|
||||
( Tag() { afterTag = true; }
|
||||
| t=Decl() { afterTag = true; }
|
||||
| CommentTag() { afterTag = true; }
|
||||
| t=<Word> { addText(t.image); afterTag = false; }
|
||||
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
|
||||
| t=<Punct> { addText(t.image); afterTag = false; }
|
||||
| <Space> { addSpace(); afterTag = false; }
|
||||
)* <EOF>
|
||||
// } catch (ParseException e) {
|
||||
// handleException(e);
|
||||
// }
|
||||
}
|
||||
|
||||
// Parses one tag: a <TagName>, any number of attributes (a name, optionally
// followed by '=' and optionally a value) and the closing <TagEnd>.
//
// Side effects on parser state, all derived from the tag name:
//   inTitle  - true only when the tag name is "<title" (case-insensitive)
//   inScript - turned on by "<script"; once on, stays on until "</script"
// For an IMG tag, the value of an ALT attribute is passed to addText()
// wrapped in square brackets.
//
// Note that t1 is reused: it first holds the tag name and afterwards the most
// recently read attribute name.
void Tag() throws IOException :
|
||||
{
|
||||
Token t1, t2;
|
||||
boolean inImg = false;
|
||||
}
|
||||
{
|
||||
t1=<TagName> {
|
||||
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
if (inScript) { // keep track if in <SCRIPT>
|
||||
inScript = !t1.image.equalsIgnoreCase("</script");
|
||||
} else {
|
||||
inScript = t1.image.equalsIgnoreCase("<script");
|
||||
}
|
||||
}
|
||||
(t1=<ArgName>
|
||||
(<ArgEquals>
|
||||
(t2=ArgValue() // save ALT text in IMG tag
|
||||
{
|
||||
// t2 is null when the attribute value is an empty quoted string
|
||||
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
|
||||
addText("[" + t2.image + "]");
|
||||
}
|
||||
)?
|
||||
)?
|
||||
)*
|
||||
<TagEnd>
|
||||
}
|
||||
|
||||
// Parses an attribute value and returns the token holding its text.
//
// Accepted forms: an unquoted <ArgValue>, a single-quoted value, or a
// double-quoted value. For an empty quoted value ('' or "") no text token is
// read, so the initial null is returned. LOOKAHEAD(2) is what lets the parser
// tell the empty quoted form from the non-empty one, since both start with
// the same quote token.
Token ArgValue() :
|
||||
{
|
||||
Token t = null;
|
||||
}
|
||||
{
|
||||
t=<ArgValue> { return t; }
|
||||
| LOOKAHEAD(2)
|
||||
<ArgQuote1> <CloseQuote1> { return t; }
|
||||
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
|
||||
| LOOKAHEAD(2)
|
||||
<ArgQuote2> <CloseQuote2> { return t; }
|
||||
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
|
||||
}
|
||||
|
||||
|
||||
// Parses a declaration: a <DeclName> followed by any mix of attribute names,
// attribute values and '=' signs, closed by <TagEnd>. Returns the <DeclName>
// token; the remaining tokens are consumed and discarded.
Token Decl() :
|
||||
{
|
||||
Token t;
|
||||
}
|
||||
{
|
||||
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
|
||||
{ return t; }
|
||||
}
|
||||
|
||||
|
||||
// Parses and discards a comment. Two forms are accepted:
//   <Comment1> ... <CommentEnd1>   opened by "<!--", closed by "-->"
//   <Comment2> ... <CommentEnd2>   opened by "<!",   closed by ">"
// No action is attached, so comment text never reaches addText().
void CommentTag() :
|
||||
{}
|
||||
{
|
||||
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
||||
|
|
||||
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
|
||||
}
|
||||
|
||||
|
||||
// Tokens of the DEFAULT lexical state, i.e. document content outside of tags
// and comments.
//
// TagName, DeclName, Comment1 and Comment2 switch the lexer into the state
// named after the ':' so that what follows is tokenized with that state's
// rules. Names starting with '#' (LET, NUM, SP) are private helper
// expressions that can only be used inside other token definitions.
//
// Word groups letters/digits together with a few embedded punctuation
// patterns (e.g. letter-hyphen-letter, digit-comma-digit) into one token.
// Entity matches "&name" and "&#digits", each with an optional ';'.
// Punct matches any single character and therefore has to stay last.
TOKEN :
|
||||
{
|
||||
< TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
|
||||
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
|
||||
|
||||
| < Comment1: "<!--" > : WithinComment1
|
||||
| < Comment2: "<!" > : WithinComment2
|
||||
|
||||
| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
|
||||
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
|
||||
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
|
||||
| < #NUM: ["0"-"9"] >
|
||||
|
||||
| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? ) >
|
||||
|
||||
| < Space: (<SP>)+ >
|
||||
| < #SP: [" ","\t","\r","\n"] >
|
||||
|
||||
| < Punct: ~[] > // Keep this last. It is a catch-all.
|
||||
}
|
||||
|
||||
|
||||
// Tokens of the WithinTag state (between a tag name and the closing '>').
//   ArgName   - an attribute name: starts with a character that is not
//               whitespace, '=', '>' or a quote; continues up to whitespace,
//               '=' or '>'
//   ArgEquals - '=', switches to AfterEquals so the value is read next
//   TagEnd    - '>' or '=>', switches back to DEFAULT
<WithinTag> TOKEN:
|
||||
{
|
||||
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
|
||||
(~[" ","\t","\r","\n","=",">"])* >
|
||||
| < ArgEquals: "=" > : AfterEquals
|
||||
| < TagEnd: ">" | "=>" > : DEFAULT
|
||||
}
|
||||
|
||||
// Token of the AfterEquals state: an unquoted attribute value. It starts with
// a character that is not whitespace, '=', '>' or a quote and runs up to the
// next whitespace or '>'. Afterwards the lexer returns to WithinTag.
<AfterEquals> TOKEN:
|
||||
{
|
||||
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
|
||||
(~[" ","\t","\r","\n",">"])* > : WithinTag
|
||||
}
|
||||
|
||||
// Opening quotes, recognized both in WithinTag and in AfterEquals. A single
// quote switches to WithinQuote1, a double quote to WithinQuote2.
<WithinTag, AfterEquals> TOKEN:
|
||||
{
|
||||
< ArgQuote1: "'" > : WithinQuote1
|
||||
| < ArgQuote2: "\"" > : WithinQuote2
|
||||
}
|
||||
|
||||
// Whitespace inside a tag (states WithinTag and AfterEquals) is skipped: it
// is matched by the lexer but never handed to the parser.
<WithinTag, AfterEquals> SKIP:
|
||||
{
|
||||
< <Space> >
|
||||
}
|
||||
|
||||
// Tokens of the WithinQuote1 state (inside a single-quoted value):
// Quote1Text is everything up to the next single quote, CloseQuote1 is that
// closing quote and returns the lexer to WithinTag.
<WithinQuote1> TOKEN:
|
||||
{
|
||||
< Quote1Text: (~["'"])+ >
|
||||
| < CloseQuote1: <ArgQuote1> > : WithinTag
|
||||
}
|
||||
|
||||
// Tokens of the WithinQuote2 state (inside a double-quoted value):
// Quote2Text is everything up to the next double quote, CloseQuote2 is that
// closing quote and returns the lexer to WithinTag.
<WithinQuote2> TOKEN:
|
||||
{
|
||||
< Quote2Text: (~["\""])+ >
|
||||
| < CloseQuote2: <ArgQuote2> > : WithinTag
|
||||
}
|
||||
|
||||
|
||||
// Tokens of the WithinComment1 state (inside "<!-- ... -->"):
// CommentText1 is either a run of characters other than '-' or one single
// '-'; CommentEnd1 is the terminator "-->" and returns the lexer to DEFAULT.
<WithinComment1> TOKEN :
|
||||
{
|
||||
< CommentText1: (~["-"])+ | "-" >
|
||||
| < CommentEnd1: "-->" > : DEFAULT
|
||||
}
|
||||
|
||||
// Tokens of the WithinComment2 state (inside "<! ... >"):
// CommentText2 is everything up to the next '>', CommentEnd2 is that '>' and
// returns the lexer to DEFAULT.
<WithinComment2> TOKEN :
|
||||
{
|
||||
< CommentText2: (~[">"])+ >
|
||||
| < CommentEnd2: ">" > : DEFAULT
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
# sub-directory makefile for lucene
|
||||
ROOT = ../..
|
||||
include ../../com/lucene/rules.mk
|
|
@ -0,0 +1,86 @@
|
|||
package org.apache.lucene.HTMLParser;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
class ParserThread extends Thread {
|
||||
HTMLParser parser;
|
||||
|
||||
ParserThread(HTMLParser p) {
|
||||
parser = p;
|
||||
}
|
||||
|
||||
public void run() { // convert pipeOut to pipeIn
|
||||
try {
|
||||
try { // parse document to pipeOut
|
||||
parser.HTMLDocument();
|
||||
} catch (ParseException e) {
|
||||
System.out.println("Parse Aborted: " + e.getMessage());
|
||||
} catch (TokenMgrError e) {
|
||||
System.out.println("Parse Aborted: " + e.getMessage());
|
||||
} finally {
|
||||
parser.pipeOut.close();
|
||||
synchronized (parser) {
|
||||
parser.summary.setLength(parser.SUMMARY_LENGTH);
|
||||
parser.titleComplete = true;
|
||||
parser.notifyAll();
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package org.apache.lucene.HTMLParser;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
class Test {
|
||||
public static void main(String[] argv) throws Exception {
|
||||
if ("-dir".equals(argv[0])) {
|
||||
String[] files = new File(argv[1]).list();
|
||||
java.util.Arrays.sort(files);
|
||||
for (int i = 0; i < files.length; i++) {
|
||||
System.err.println(files[i]);
|
||||
File file = new File(argv[1], files[i]);
|
||||
parse(file);
|
||||
}
|
||||
} else
|
||||
parse(new File(argv[0]));
|
||||
}
|
||||
|
||||
public static void parse(File file) throws Exception {
|
||||
HTMLParser parser = new HTMLParser(file);
|
||||
System.out.println("Title: " + Entities.encode(parser.getTitle()));
|
||||
System.out.println("Summary: " + Entities.encode(parser.getSummary()));
|
||||
LineNumberReader reader = new LineNumberReader(parser.getReader());
|
||||
for (String l = reader.readLine(); l != null; l = reader.readLine())
|
||||
System.out.println(l);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.lucene.analysis.StopAnalyzer;
|
||||
import com.lucene.index.IndexWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
|
||||
class IndexFiles {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Date start = new Date();
|
||||
|
||||
IndexWriter writer = new IndexWriter("index", new StopAnalyzer(), true);
|
||||
writer.mergeFactor = 20;
|
||||
|
||||
indexDocs(writer, new File(args[0]));
|
||||
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
Date end = new Date();
|
||||
|
||||
System.out.print(end.getTime() - start.getTime());
|
||||
System.out.println(" total milliseconds");
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public static void indexDocs(IndexWriter writer, File file)
|
||||
throws Exception {
|
||||
if (file.isDirectory()) {
|
||||
String[] files = file.list();
|
||||
for (int i = 0; i < files.length; i++)
|
||||
indexDocs(writer, new File(file, files[i]));
|
||||
} else {
|
||||
System.out.println("adding " + file);
|
||||
writer.addDocument(FileDocument.Document(file));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,195 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.lucene.analysis.StopAnalyzer;
|
||||
import com.lucene.index.*;
|
||||
import com.lucene.document.Document;
|
||||
import com.lucene.util.Arrays;
|
||||
import demo.HTMLParser.HTMLParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
|
||||
class IndexHTML {
|
||||
private static boolean deleting = false; // true during deletion pass
|
||||
private static IndexReader reader; // existing index
|
||||
private static IndexWriter writer; // new index being built
|
||||
private static TermEnum uidIter; // document id iterator
|
||||
|
||||
public static void main(String[] argv) {
|
||||
try {
|
||||
String index = "index";
|
||||
boolean create = false;
|
||||
File root = null;
|
||||
|
||||
String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
|
||||
|
||||
if (argv.length == 0) {
|
||||
System.err.println("Usage: " + usage);
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < argv.length; i++) {
|
||||
if (argv[i].equals("-index")) { // parse -index option
|
||||
index = argv[++i];
|
||||
} else if (argv[i].equals("-create")) { // parse -create option
|
||||
create = true;
|
||||
} else if (i != argv.length-1) {
|
||||
System.err.println("Usage: " + usage);
|
||||
return;
|
||||
} else
|
||||
root = new File(argv[i]);
|
||||
}
|
||||
|
||||
Date start = new Date();
|
||||
|
||||
if (!create) { // delete stale docs
|
||||
deleting = true;
|
||||
indexDocs(root, index, create);
|
||||
}
|
||||
|
||||
writer = new IndexWriter(index, new StopAnalyzer(), create);
|
||||
writer.mergeFactor = 20;
|
||||
writer.maxFieldLength = 1000000;
|
||||
|
||||
indexDocs(root, index, create); // add new docs
|
||||
|
||||
System.out.println("Optimizing index...");
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
Date end = new Date();
|
||||
|
||||
System.out.print(end.getTime() - start.getTime());
|
||||
System.out.println(" total milliseconds");
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/* Walk directory hierarchy in uid order, while keeping uid iterator from
|
||||
/* existing index in sync. Mismatches indicate one of: (a) old documents to
|
||||
/* be deleted; (b) unchanged documents, to be left alone; or (c) new
|
||||
/* documents, to be indexed.
|
||||
*/
|
||||
|
||||
private static void indexDocs(File file, String index, boolean create)
|
||||
throws Exception {
|
||||
if (!create) { // incrementally update
|
||||
|
||||
reader = IndexReader.open(index); // open existing index
|
||||
uidIter = reader.terms(new Term("uid", "")); // init uid iterator
|
||||
|
||||
indexDocs(file);
|
||||
|
||||
if (deleting) { // delete rest of stale docs
|
||||
while (uidIter.term() != null && uidIter.term().field() == "uid") {
|
||||
System.out.println("deleting " +
|
||||
HTMLDocument.uid2url(uidIter.term().text()));
|
||||
reader.delete(uidIter.term());
|
||||
uidIter.next();
|
||||
}
|
||||
deleting = false;
|
||||
}
|
||||
|
||||
uidIter.close(); // close uid iterator
|
||||
reader.close(); // close existing index
|
||||
|
||||
} else // don't have exisiting
|
||||
indexDocs(file);
|
||||
}
|
||||
|
||||
private static void indexDocs(File file) throws Exception {
|
||||
if (file.isDirectory()) { // if a directory
|
||||
String[] files = file.list(); // list its files
|
||||
Arrays.sort(files); // sort the files
|
||||
for (int i = 0; i < files.length; i++) // recursively index them
|
||||
indexDocs(new File(file, files[i]));
|
||||
|
||||
} else if (file.getPath().endsWith(".html") || // index .html files
|
||||
file.getPath().endsWith(".htm") || // index .htm files
|
||||
file.getPath().endsWith(".txt")) { // index .txt files
|
||||
|
||||
if (uidIter != null) {
|
||||
String uid = HTMLDocument.uid(file); // construct uid for doc
|
||||
|
||||
while (uidIter.term() != null && uidIter.term().field() == "uid" &&
|
||||
uidIter.term().text().compareTo(uid) < 0) {
|
||||
if (deleting) { // delete stale docs
|
||||
System.out.println("deleting " +
|
||||
HTMLDocument.uid2url(uidIter.term().text()));
|
||||
reader.delete(uidIter.term());
|
||||
}
|
||||
uidIter.next();
|
||||
}
|
||||
if (uidIter.term() != null && uidIter.term().field() == "uid" &&
|
||||
uidIter.term().text().compareTo(uid) == 0) {
|
||||
uidIter.next(); // keep matching docs
|
||||
} else if (!deleting) { // add new docs
|
||||
Document doc = HTMLDocument.Document(file);
|
||||
System.out.println("adding " + doc.get("url"));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
} else { // creating a new index
|
||||
Document doc = HTMLDocument.Document(file);
|
||||
System.out.println("adding " + doc.get("url"));
|
||||
writer.addDocument(doc); // add docs unconditionally
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
# sub-directory makefile for lucene
|
||||
ROOT = ..
|
||||
include ../com/lucene/rules.mk
|
|
@ -0,0 +1,17 @@
|
|||
<HTML>
|
||||
<HEAD>
|
||||
<TITLE>Lucene Search Demo</TITLE>
|
||||
</HEAD>
|
||||
<BODY>
|
||||
|
||||
<CENTER>
|
||||
<H1>
|
||||
Lucene Search Demo</H1>
|
||||
|
||||
<form name=search action=http://localhost:8080/Search.jhtml method=get>
|
||||
<input name=query size=44> <input type=submit value=Search></form>
|
||||
|
||||
</CENTER>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
|
@ -0,0 +1,166 @@
|
|||
<HTML><!-- -*-java-*- -->
|
||||
<!-- Lucene Search Demo via CompiledPageServlet -->
|
||||
<!-- Copyright (c) 1998,2000 Douglass R. Cutting. -->
|
||||
|
||||
<java type=import>
|
||||
javax.servlet.*
|
||||
javax.servlet.http.*
|
||||
java.io.*
|
||||
com.lucene.analysis.*
|
||||
com.lucene.document.*
|
||||
com.lucene.index.*
|
||||
com.lucene.search.*
|
||||
com.lucene.queryParser.*
|
||||
demo.HTMLParser.Entities
|
||||
</java>
|
||||
|
||||
<java>
  // Request handler of the search page: reads the "index", "query", "start",
  // "hitsPerPage" and "showSummaries" request parameters, runs the query and
  // renders one page of hits plus a "Next" form when more hits remain.

  // get index from request
  String indexName = request.getParameter("index");
  if (indexName == null)                          // default to "index"
    indexName = "index";
  Searcher searcher =                             // make searcher
    new IndexSearcher(getReader(indexName));

  // get query from request
  String queryString = request.getParameter("query");
  if (queryString == null)
    throw new ServletException("no query specified");

  int start = 0;                                  // first hit to display
  String startString = request.getParameter("start");
  if (startString != null)
    start = Integer.parseInt(startString);
  if (start < 0)                                  // never ask for a negative hit
    start = 0;

  int hitsPerPage = 10;                           // number of hits to display
  String hitsString = request.getParameter("hitsPerPage");
  if (hitsString != null)
    hitsPerPage = Integer.parseInt(hitsString);

  boolean showSummaries = true;                   // show summaries?
  if ("false".equals(request.getParameter("showSummaries")))
    showSummaries = false;

  // Request parameters are untrusted, so they are HTML-encoded before being
  // echoed into the page, the same way titles and summaries already are.
  // NOTE(review): the double-quoted attributes below rely on Entities.encode
  // escaping '"' -- confirm against Entities.
  String queryHtml = Entities.encode(queryString);
  String indexHtml = Entities.encode(indexName);

  Query query = null;
  try {                                           // parse query
    query = QueryParser.parse(queryString, "contents", analyzer);
  } catch (ParseException e) {                    // error parsing query
    String errorHtml = "";
    if (e.getMessage() != null)
      errorHtml = Entities.encode(e.getMessage());
</java>
    <HEAD><TITLE>Error Parsing Query</TITLE></HEAD><BODY>
    <p>While parsing `queryHtml`: `errorHtml`
<java>
    return;
  }

  String servletPath = request.getRequestURI();   // getServletPath should work
  int j = servletPath.indexOf('?');               // here but doesn't, so we
  if (j != -1)                                    // remove query by hand...
    servletPath = servletPath.substring(0, j);

</java>

<head><title>Lucene Search Results</title></head><body>

<center>
 <form name=search action=`servletPath` method=get>
  <input name=query size=44 value="`queryHtml`">
  <input type=hidden name=index value="`indexHtml`">
  <input type=hidden name=hitsPerPage value=`hitsPerPage`>
  <input type=hidden name=showSummaries value=`showSummaries`>
  <input type=submit value=Search>
 </form>
</center>
<java>
  Hits hits = searcher.search(query);             // perform query
  int end = Math.min(hits.length(), start + hitsPerPage);
</java>

<p>Hits <b><java type=print>start+1</java>-<java type=print>end</java></b>
(out of <java type=print>hits.length()</java> total matching documents):

<ul>
<java>
  for (int i = start; i < end; i++) {             // display the hits
    Document doc = hits.doc(i);
    String title = doc.get("title");
    if (title == null || title.equals(""))        // use url for docs w/o title
      title = doc.get("url");
</java>
  <p><b><java type=print>(int)(hits.score(i) * 100.0f)</java>%
  <a href="`doc.get("url")`">
   <java type=print>Entities.encode(title)</java>
  </a></b>
<java>
    if (showSummaries) {                          // maybe show summary
</java>
  <ul><i>Summary</i>:
   <java type=print>Entities.encode(doc.get("summary"))</java>
  </ul>
<java>
    }
  }
</java>
</ul>

<java>
  if (end < hits.length()) {                      // insert next page button
</java>
<center>
 <form name=search action=`servletPath` method=get>
  <input type=hidden name=query value="`queryHtml`">
  <input type=hidden name=start value=`end`>
  <input type=hidden name=index value="`indexHtml`">
  <input type=hidden name=hitsPerPage value=`hitsPerPage`>
  <input type=hidden name=showSummaries value=`showSummaries`>
  <input type=submit value=Next>
 </form>
</center>
<java>
  }
</java>

</body>
|
||||
|
||||
<java type=class>
|
||||
|
||||
Analyzer analyzer = new StopAnalyzer(); // used to tokenize queries
|
||||
|
||||
/** Keep a cache of open IndexReaders, so that an index does not have to
|
||||
be opened for each query. The cache re-opens an index when it has changed
|
||||
so that additions and deletions are visible ASAP. */
|
||||
|
||||
static Hashtable indexCache = new Hashtable(); // name->CachedIndex
|
||||
|
||||
class CachedIndex { // an entry in the cache
|
||||
IndexReader reader; // an open reader
|
||||
long modified; // reader's modified date
|
||||
|
||||
CachedIndex(String name) throws IOException {
|
||||
modified = IndexReader.lastModified(name); // get modified date
|
||||
reader = IndexReader.open(name); // open reader
|
||||
}
|
||||
}
|
||||
|
||||
IndexReader getReader(String name) throws ServletException {
|
||||
CachedIndex index = // look in cache
|
||||
(CachedIndex)indexCache.get(name);
|
||||
|
||||
try {
|
||||
if (index != null && // check up-to-date
|
||||
(index.modified == IndexReader.lastModified(name)))
|
||||
return index.reader; // cache hit
|
||||
else {
|
||||
index = new CachedIndex(name); // cache miss
|
||||
}
|
||||
} catch (IOException e) {
|
||||
StringWriter writer = new StringWriter();
|
||||
PrintWriter pw = new PrintWriter(writer);
|
||||
throw new ServletException("Could not open index " + name + ": " +
|
||||
e.getClass().getName() + "--" +
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
indexCache.put(name, index); // add to cache
|
||||
return index.reader;
|
||||
}
|
||||
</java>
|
|
@ -0,0 +1,110 @@
|
|||
package org.apache.lucene;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import com.lucene.analysis.Analyzer;
|
||||
import com.lucene.analysis.StopAnalyzer;
|
||||
import com.lucene.document.Document;
|
||||
import com.lucene.search.Searcher;
|
||||
import com.lucene.search.IndexSearcher;
|
||||
import com.lucene.search.Query;
|
||||
import com.lucene.search.Hits;
|
||||
import com.lucene.queryParser.QueryParser;
|
||||
|
||||
class SearchFiles {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Searcher searcher = new IndexSearcher("index");
|
||||
Analyzer analyzer = new StopAnalyzer();
|
||||
|
||||
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
|
||||
while (true) {
|
||||
System.out.print("Query: ");
|
||||
String line = in.readLine();
|
||||
|
||||
if (line.length() == -1)
|
||||
break;
|
||||
|
||||
Query query = QueryParser.parse(line, "contents", analyzer);
|
||||
System.out.println("Searching for: " + query.toString("contents"));
|
||||
|
||||
Hits hits = searcher.search(query);
|
||||
System.out.println(hits.length() + " total matching documents");
|
||||
|
||||
final int HITS_PER_PAGE = 10;
|
||||
for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
|
||||
int end = Math.min(hits.length(), start + HITS_PER_PAGE);
|
||||
for (int i = start; i < end; i++)
|
||||
System.out.println(i + ". " + hits.doc(i).get("path"));
|
||||
if (hits.length() > end) {
|
||||
System.out.print("more (y/n) ? ");
|
||||
line = in.readLine();
|
||||
if (line.length() == 0 || line.charAt(0) == 'n')
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
searcher.close();
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
# top-level makefile for lucene
|
||||
|
||||
all: jar doc
|
||||
|
||||
# root is two levels up
|
||||
ROOT = ../..
|
||||
|
||||
include rules.mk
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
|
||||
* policy for extracting index terms from text.
|
||||
* <p>
|
||||
* Typical implementations first build a Tokenizer, which breaks the stream of
|
||||
* characters from the Reader into raw Tokens. One or more TokenFilters may
|
||||
* then be applied to the output of the Tokenizer.
|
||||
* <p>
|
||||
* WARNING: You must override one of the methods defined by this class in your
|
||||
* subclass or the Analyzer will enter an infinite loop.
|
||||
*/
|
||||
abstract public class Analyzer {
|
||||
/** Creates a TokenStream which tokenizes all the text in the provided
|
||||
Reader. Default implementation forwards to tokenStream(Reader) for
|
||||
compatibility with older version. Override to allow Analyzer to choose
|
||||
strategy based on document and/or field. Must be able to handle null
|
||||
field name for backward compatibility. */
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
// implemented for backward compatibility
|
||||
return tokenStream(reader);
|
||||
}
|
||||
|
||||
/** Creates a TokenStream which tokenizes all the text in the provided
|
||||
* Reader. Provided for backward compatibility only.
|
||||
* @deprecated use tokenStream(String, Reader) instead.
|
||||
* @see tokenStream(String, Reader)
|
||||
*/
|
||||
public TokenStream tokenStream(Reader reader)
|
||||
{
|
||||
return tokenStream(null, reader);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
|
||||
to say, it defines tokens as maximal strings of adjacent letters, as defined
|
||||
by java.lang.Character.isLetter() predicate.
|
||||
|
||||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class LetterTokenizer extends Tokenizer {
|
||||
public LetterTokenizer(Reader in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
private final static int MAX_WORD_LEN = 255;
|
||||
private final static int IO_BUFFER_SIZE = 1024;
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
while (true) {
|
||||
final char c;
|
||||
|
||||
offset++;
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
};
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isLetter(c)) { // if it's a letter
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = c; // buffer it
|
||||
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** Normalizes token text to lower case. */
|
||||
|
||||
public final class LowerCaseFilter extends TokenFilter {
|
||||
public LowerCaseFilter(TokenStream in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
Token t = input.next();
|
||||
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
t.termText = t.termText.toLowerCase();
|
||||
|
||||
return t;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,116 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** LowerCaseTokenizer performs the function of LetterTokenizer
|
||||
and LowerCaseFilter together. It divides text at non-letters and converts
|
||||
them to lower case. While it is functionally equivalent to the combination
|
||||
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
|
||||
to doing the two tasks at once, hence this (redundent) implementation.
|
||||
|
||||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class LowerCaseTokenizer extends Tokenizer {
|
||||
public LowerCaseTokenizer(Reader in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
private final static int MAX_WORD_LEN = 255;
|
||||
private final static int IO_BUFFER_SIZE = 1024;
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
while (true) {
|
||||
final char c;
|
||||
|
||||
offset++;
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
};
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isLetter(c)) { // if it's a letter
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = Character.toLowerCase(c);
|
||||
// buffer it
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
# sub-directory makefile for lucene
|
||||
include ../rules.mk
|
|
@ -0,0 +1,98 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/** Transforms the token stream as per the Porter stemming algorithm.
|
||||
Note: the input to the stemming filter must already be in lower case,
|
||||
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
|
||||
down the Tokenizer chain in order for this to work properly!
|
||||
|
||||
To use this filter with other analyzers, you'll want to write an
|
||||
Analyzer class that sets up the TokenStream chain as you want it.
|
||||
To use this with LowerCaseTokenizer, for example, you'd write an
|
||||
analyzer like this:
|
||||
|
||||
class MyAnalyzer extends Analyzer {
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new PorterStemFilter(new LowerCaseTokenizer(reader));
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
public final class PorterStemFilter extends TokenFilter {
|
||||
private PorterStemmer stemmer;
|
||||
|
||||
public PorterStemFilter(TokenStream in) {
|
||||
stemmer = new PorterStemmer();
|
||||
input = in;
|
||||
}
|
||||
|
||||
/** Returns the next input Token, after being stemmed */
|
||||
public final Token next() throws IOException {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
else {
|
||||
String s = stemmer.stem(token.termText);
|
||||
if (s != token.termText) // Yes, I mean object reference comparison here
|
||||
token.termText = s;
|
||||
return token;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,584 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
Porter stemmer in Java. The original paper is in
|
||||
|
||||
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
||||
no. 3, pp 130-137,
|
||||
|
||||
See also http://www.muscat.com/~martin/stem.html
|
||||
|
||||
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
|
||||
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
|
||||
is then outside the bounds of b.
|
||||
|
||||
Similarly,
|
||||
|
||||
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
|
||||
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
|
||||
b[j] is then outside the bounds of b.
|
||||
|
||||
Release 3.
|
||||
|
||||
[ This version is derived from Release 3, modified by Brian Goetz to
|
||||
optimize for fewer object creations. ]
|
||||
|
||||
*/
|
||||
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
*
|
||||
* Stemmer, implementing the Porter Stemming Algorithm
|
||||
*
|
||||
* The Stemmer class transforms a word into its root form. The input
|
||||
* word can be provided a character at a time (by calling add()), or at once
|
||||
* by calling one of the various stem(something) methods.
|
||||
*/
|
||||
|
||||
class PorterStemmer
|
||||
{
|
||||
private char[] b;
|
||||
private int i, /* offset into b */
|
||||
j, k, k0;
|
||||
private boolean dirty = false;
|
||||
private static final int INC = 50; /* unit of size whereby b is increased */
|
||||
private static final int EXTRA = 1;
|
||||
|
||||
public PorterStemmer() {
|
||||
b = new char[INC];
|
||||
i = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* reset() resets the stemmer so it can stem another word. If you invoke
|
||||
* the stemmer by calling add(char) and then stem(), you must call reset()
|
||||
* before starting another word.
|
||||
*/
|
||||
public void reset() { i = 0; dirty = false; }
|
||||
|
||||
/**
|
||||
* Add a character to the word being stemmed. When you are finished
|
||||
* adding characters, you can call stem(void) to process the word.
|
||||
*/
|
||||
public void add(char ch) {
|
||||
if (b.length <= i + EXTRA) {
|
||||
char[] new_b = new char[b.length+INC];
|
||||
for (int c = 0; c < b.length; c++)
|
||||
new_b[c] = b[c];
|
||||
b = new_b;
|
||||
}
|
||||
b[i++] = ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* After a word has been stemmed, it can be retrieved by toString(),
|
||||
* or a reference to the internal buffer can be retrieved by getResultBuffer
|
||||
* and getResultLength (which is generally more efficient.)
|
||||
*/
|
||||
public String toString() { return new String(b,0,i); }
|
||||
|
||||
/**
|
||||
* Returns the length of the word resulting from the stemming process.
|
||||
*/
|
||||
public int getResultLength() { return i; }
|
||||
|
||||
/**
|
||||
* Returns a reference to a character buffer containing the results of
|
||||
* the stemming process. You also need to consult getResultLength()
|
||||
* to determine the length of the result.
|
||||
*/
|
||||
public char[] getResultBuffer() { return b; }
|
||||
|
||||
/* cons(i) is true <=> b[i] is a consonant. */
|
||||
|
||||
private final boolean cons(int i) {
|
||||
switch (b[i]) {
|
||||
case 'a': case 'e': case 'i': case 'o': case 'u':
|
||||
return false;
|
||||
case 'y':
|
||||
return (i==k0) ? true : !cons(i-1);
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* m() measures the number of consonant sequences between k0 and j. if c is
|
||||
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
|
||||
presence,
|
||||
|
||||
<c><v> gives 0
|
||||
<c>vc<v> gives 1
|
||||
<c>vcvc<v> gives 2
|
||||
<c>vcvcvc<v> gives 3
|
||||
....
|
||||
*/
|
||||
|
||||
private final int m() {
|
||||
int n = 0;
|
||||
int i = k0;
|
||||
while(true) {
|
||||
if (i > j)
|
||||
return n;
|
||||
if (! cons(i))
|
||||
break;
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
while(true) {
|
||||
while(true) {
|
||||
if (i > j)
|
||||
return n;
|
||||
if (cons(i))
|
||||
break;
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
n++;
|
||||
while(true) {
|
||||
if (i > j)
|
||||
return n;
|
||||
if (! cons(i))
|
||||
break;
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
/* vowelinstem() is true <=> k0,...j contains a vowel */
|
||||
|
||||
private final boolean vowelinstem() {
|
||||
int i;
|
||||
for (i = k0; i <= j; i++)
|
||||
if (! cons(i))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
|
||||
|
||||
private final boolean doublec(int j) {
|
||||
if (j < k0+1)
|
||||
return false;
|
||||
if (b[j] != b[j-1])
|
||||
return false;
|
||||
return cons(j);
|
||||
}
|
||||
|
||||
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
|
||||
and also if the second c is not w,x or y. this is used when trying to
|
||||
restore an e at the end of a short word. e.g.
|
||||
|
||||
cav(e), lov(e), hop(e), crim(e), but
|
||||
snow, box, tray.
|
||||
|
||||
*/
|
||||
|
||||
private final boolean cvc(int i) {
|
||||
if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
|
||||
return false;
|
||||
else {
|
||||
int ch = b[i];
|
||||
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private final boolean ends(String s) {
|
||||
int l = s.length();
|
||||
int o = k-l+1;
|
||||
if (o < k0)
|
||||
return false;
|
||||
for (int i = 0; i < l; i++)
|
||||
if (b[o+i] != s.charAt(i))
|
||||
return false;
|
||||
j = k-l;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
|
||||
k. */
|
||||
|
||||
void setto(String s) {
|
||||
int l = s.length();
|
||||
int o = j+1;
|
||||
for (int i = 0; i < l; i++)
|
||||
b[o+i] = s.charAt(i);
|
||||
k = j+l;
|
||||
dirty = true;
|
||||
}
|
||||
|
||||
/* r(s) is used further down. */
|
||||
|
||||
void r(String s) { if (m() > 0) setto(s); }
|
||||
|
||||
/* step1() gets rid of plurals and -ed or -ing. e.g.
|
||||
|
||||
caresses -> caress
|
||||
ponies -> poni
|
||||
ties -> ti
|
||||
caress -> caress
|
||||
cats -> cat
|
||||
|
||||
feed -> feed
|
||||
agreed -> agree
|
||||
disabled -> disable
|
||||
|
||||
matting -> mat
|
||||
mating -> mate
|
||||
meeting -> meet
|
||||
milling -> mill
|
||||
messing -> mess
|
||||
|
||||
meetings -> meet
|
||||
|
||||
*/
|
||||
|
||||
private final void step1() {
|
||||
if (b[k] == 's') {
|
||||
if (ends("sses")) k -= 2;
|
||||
else if (ends("ies")) setto("i");
|
||||
else if (b[k-1] != 's') k--;
|
||||
}
|
||||
if (ends("eed")) {
|
||||
if (m() > 0)
|
||||
k--;
|
||||
}
|
||||
else if ((ends("ed") || ends("ing")) && vowelinstem()) {
|
||||
k = j;
|
||||
if (ends("at")) setto("ate");
|
||||
else if (ends("bl")) setto("ble");
|
||||
else if (ends("iz")) setto("ize");
|
||||
else if (doublec(k)) {
|
||||
int ch = b[k--];
|
||||
if (ch == 'l' || ch == 's' || ch == 'z')
|
||||
k++;
|
||||
}
|
||||
else if (m() == 1 && cvc(k))
|
||||
setto("e");
|
||||
}
|
||||
}
|
||||
|
||||
/* step2() turns terminal y to i when there is another vowel in the stem. */
|
||||
|
||||
private final void step2() {
|
||||
if (ends("y") && vowelinstem()) {
|
||||
b[k] = 'i';
|
||||
dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
|
||||
-ation) maps to -ize etc. note that the string before the suffix must give
|
||||
m() > 0. */
|
||||
|
||||
private final void step3() {
|
||||
if (k == k0) return; /* For Bug 1 */
|
||||
switch (b[k-1]) {
|
||||
case 'a':
|
||||
if (ends("ational")) { r("ate"); break; }
|
||||
if (ends("tional")) { r("tion"); break; }
|
||||
break;
|
||||
case 'c':
|
||||
if (ends("enci")) { r("ence"); break; }
|
||||
if (ends("anci")) { r("ance"); break; }
|
||||
break;
|
||||
case 'e':
|
||||
if (ends("izer")) { r("ize"); break; }
|
||||
break;
|
||||
case 'l':
|
||||
if (ends("bli")) { r("ble"); break; }
|
||||
if (ends("alli")) { r("al"); break; }
|
||||
if (ends("entli")) { r("ent"); break; }
|
||||
if (ends("eli")) { r("e"); break; }
|
||||
if (ends("ousli")) { r("ous"); break; }
|
||||
break;
|
||||
case 'o':
|
||||
if (ends("ization")) { r("ize"); break; }
|
||||
if (ends("ation")) { r("ate"); break; }
|
||||
if (ends("ator")) { r("ate"); break; }
|
||||
break;
|
||||
case 's':
|
||||
if (ends("alism")) { r("al"); break; }
|
||||
if (ends("iveness")) { r("ive"); break; }
|
||||
if (ends("fulness")) { r("ful"); break; }
|
||||
if (ends("ousness")) { r("ous"); break; }
|
||||
break;
|
||||
case 't':
|
||||
if (ends("aliti")) { r("al"); break; }
|
||||
if (ends("iviti")) { r("ive"); break; }
|
||||
if (ends("biliti")) { r("ble"); break; }
|
||||
break;
|
||||
case 'g':
|
||||
if (ends("logi")) { r("log"); break; }
|
||||
}
|
||||
}
|
||||
|
||||
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
|
||||
|
||||
private final void step4() {
|
||||
switch (b[k]) {
|
||||
case 'e':
|
||||
if (ends("icate")) { r("ic"); break; }
|
||||
if (ends("ative")) { r(""); break; }
|
||||
if (ends("alize")) { r("al"); break; }
|
||||
break;
|
||||
case 'i':
|
||||
if (ends("iciti")) { r("ic"); break; }
|
||||
break;
|
||||
case 'l':
|
||||
if (ends("ical")) { r("ic"); break; }
|
||||
if (ends("ful")) { r(""); break; }
|
||||
break;
|
||||
case 's':
|
||||
if (ends("ness")) { r(""); break; }
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
|
||||
|
||||
private final void step5() {
|
||||
if (k == k0) return; /* for Bug 1 */
|
||||
switch (b[k-1]) {
|
||||
case 'a':
|
||||
if (ends("al")) break;
|
||||
return;
|
||||
case 'c':
|
||||
if (ends("ance")) break;
|
||||
if (ends("ence")) break;
|
||||
return;
|
||||
case 'e':
|
||||
if (ends("er")) break; return;
|
||||
case 'i':
|
||||
if (ends("ic")) break; return;
|
||||
case 'l':
|
||||
if (ends("able")) break;
|
||||
if (ends("ible")) break; return;
|
||||
case 'n':
|
||||
if (ends("ant")) break;
|
||||
if (ends("ement")) break;
|
||||
if (ends("ment")) break;
|
||||
/* element etc. not stripped before the m */
|
||||
if (ends("ent")) break;
|
||||
return;
|
||||
case 'o':
|
||||
if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
|
||||
/* j >= 0 fixes Bug 2 */
|
||||
if (ends("ou")) break;
|
||||
return;
|
||||
/* takes care of -ous */
|
||||
case 's':
|
||||
if (ends("ism")) break;
|
||||
return;
|
||||
case 't':
|
||||
if (ends("ate")) break;
|
||||
if (ends("iti")) break;
|
||||
return;
|
||||
case 'u':
|
||||
if (ends("ous")) break;
|
||||
return;
|
||||
case 'v':
|
||||
if (ends("ive")) break;
|
||||
return;
|
||||
case 'z':
|
||||
if (ends("ize")) break;
|
||||
return;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
if (m() > 1)
|
||||
k = j;
|
||||
}
|
||||
|
||||
/* step6() removes a final -e if m() > 1. */
|
||||
|
||||
private final void step6() {
|
||||
j = k;
|
||||
if (b[k] == 'e') {
|
||||
int a = m();
|
||||
if (a > 1 || a == 1 && !cvc(k-1))
|
||||
k--;
|
||||
}
|
||||
if (b[k] == 'l' && doublec(k) && m() > 1)
|
||||
k--;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Stem a word provided as a String. Returns the result as a String.
|
||||
*/
|
||||
public String stem(String s) {
|
||||
if (stem(s.toCharArray(), s.length()))
|
||||
return toString();
|
||||
else
|
||||
return s;
|
||||
}
|
||||
|
||||
/** Stem a word contained in a char[]. Returns true if the stemming process
|
||||
* resulted in a word different from the input. You can retrieve the
|
||||
* result with getResultLength()/getResultBuffer() or toString().
|
||||
*/
|
||||
public boolean stem(char[] word) {
|
||||
return stem(word, word.length);
|
||||
}
|
||||
|
||||
/** Stem a word contained in a portion of a char[] array. Returns
|
||||
* true if the stemming process resulted in a word different from
|
||||
* the input. You can retrieve the result with
|
||||
* getResultLength()/getResultBuffer() or toString().
|
||||
*/
|
||||
public boolean stem(char[] wordBuffer, int offset, int wordLen) {
|
||||
reset();
|
||||
if (b.length < wordLen) {
|
||||
char[] new_b = new char[wordLen + EXTRA];
|
||||
b = new_b;
|
||||
}
|
||||
for (int j=0; j<wordLen; j++)
|
||||
b[j] = wordBuffer[offset+j];
|
||||
i = wordLen;
|
||||
return stem(0);
|
||||
}
|
||||
|
||||
/** Stem a word contained in a leading portion of a char[] array.
|
||||
* Returns true if the stemming process resulted in a word different
|
||||
* from the input. You can retrieve the result with
|
||||
* getResultLength()/getResultBuffer() or toString().
|
||||
*/
|
||||
public boolean stem(char[] word, int wordLen) {
|
||||
return stem(word, 0, wordLen);
|
||||
}
|
||||
|
||||
/** Stem the word placed into the Stemmer buffer through calls to add().
|
||||
* Returns true if the stemming process resulted in a word different
|
||||
* from the input. You can retrieve the result with
|
||||
* getResultLength()/getResultBuffer() or toString().
|
||||
*/
|
||||
public boolean stem() {
|
||||
return stem(0);
|
||||
}
|
||||
|
||||
public boolean stem(int i0) {
|
||||
k = i - 1;
|
||||
k0 = i0;
|
||||
if (k > k0+1) {
|
||||
step1(); step2(); step3(); step4(); step5(); step6();
|
||||
}
|
||||
// Also, a word is considered dirty if we lopped off letters
|
||||
// Thanks to Ifigenia Vairelles for pointing this out.
|
||||
if (i != k+1)
|
||||
dirty = true;
|
||||
i = k+1;
|
||||
return dirty;
|
||||
}
|
||||
|
||||
/** Test program for demonstrating the Stemmer. It reads a file and
|
||||
* stems each word, writing the result to standard out.
|
||||
* Usage: Stemmer file-name
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
PorterStemmer s = new PorterStemmer();
|
||||
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
try {
|
||||
InputStream in = new FileInputStream(args[i]);
|
||||
byte[] buffer = new byte[1024];
|
||||
int bufferLen, offset, ch;
|
||||
|
||||
bufferLen = in.read(buffer);
|
||||
offset = 0;
|
||||
s.reset();
|
||||
|
||||
while(true) {
|
||||
if (offset < bufferLen)
|
||||
ch = buffer[offset++];
|
||||
else {
|
||||
bufferLen = in.read(buffer);
|
||||
offset = 0;
|
||||
if (bufferLen < 0)
|
||||
ch = -1;
|
||||
else
|
||||
ch = buffer[offset++];
|
||||
}
|
||||
|
||||
if (Character.isLetter((char) ch)) {
|
||||
s.add(Character.toLowerCase((char) ch));
|
||||
}
|
||||
else {
|
||||
s.stem();
|
||||
System.out.print(s.toString());
|
||||
s.reset();
|
||||
if (ch < 0)
|
||||
break;
|
||||
else {
|
||||
System.out.print((char) ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
in.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
System.out.println("error reading " + args[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
|
||||
|
||||
public final class SimpleAnalyzer extends Analyzer {
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new LowerCaseTokenizer(reader);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
|
||||
|
||||
public final class StopAnalyzer extends Analyzer {
|
||||
private Hashtable stopTable;
|
||||
|
||||
/** An array containing some common English words that are not usually useful
|
||||
for searching. */
|
||||
public static final String[] ENGLISH_STOP_WORDS = {
|
||||
"a", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "s", "such",
|
||||
"t", "that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
|
||||
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
|
||||
public StopAnalyzer() {
|
||||
stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
|
||||
}
|
||||
|
||||
/** Builds an analyzer which removes words in the provided array. */
|
||||
public StopAnalyzer(String[] stopWords) {
|
||||
stopTable = StopFilter.makeStopTable(stopWords);
|
||||
}
|
||||
|
||||
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/** Removes stop words from a token stream. */
|
||||
|
||||
public final class StopFilter extends TokenFilter {
|
||||
|
||||
private Hashtable table;
|
||||
|
||||
/** Constructs a filter which removes words from the input
|
||||
TokenStream that are named in the array of words. */
|
||||
public StopFilter(TokenStream in, String[] stopWords) {
|
||||
input = in;
|
||||
table = makeStopTable(stopWords);
|
||||
}
|
||||
|
||||
/** Constructs a filter which removes words from the input
|
||||
TokenStream that are named in the Hashtable. */
|
||||
public StopFilter(TokenStream in, Hashtable stopTable) {
|
||||
input = in;
|
||||
table = stopTable;
|
||||
}
|
||||
|
||||
/** Builds a Hashtable from an array of stop words, appropriate for passing
|
||||
into the StopFilter constructor. This permits this table construction to
|
||||
be cached once when an Analyzer is constructed. */
|
||||
public final static Hashtable makeStopTable(String[] stopWords) {
|
||||
Hashtable stopTable = new Hashtable(stopWords.length);
|
||||
for (int i = 0; i < stopWords.length; i++)
|
||||
stopTable.put(stopWords[i], stopWords[i]);
|
||||
return stopTable;
|
||||
}
|
||||
|
||||
/** Returns the next input Token whose termText() is not a stop word. */
|
||||
public final Token next() throws IOException {
|
||||
// return the first non-stop word found
|
||||
for (Token token = input.next(); token != null; token = input.next())
|
||||
if (table.get(token.termText) == null)
|
||||
return token;
|
||||
// reached EOS -- return null
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** A Token is an occurrence of a term from the text of a field.  It consists of
  a term's text, the start and end offset of the term in the text of the field,
  and a type string.

  The start and end offsets permit applications to re-associate a token with
  its source text, e.g., to display highlighted query terms in a document
  browser, or to show matching text fragments in a KWIC (KeyWord In Context)
  display, etc.

  The type is a string, assigned by a lexical analyzer (a.k.a. tokenizer),
  naming the lexical or syntactic class that the token belongs to.  For
  example an end of sentence marker token might be implemented with type
  "eos".  The default token type is "word". */

public final class Token {
  String termText;          // the text of the term
  int startOffset;          // start in source text
  int endOffset;            // end in source text
  String type = "word";     // lexical type

  /** Constructs a Token with the given term text, and start & end offsets.
      The type defaults to "word." */
  public Token(String text, int start, int end) {
    this(text, start, end, "word");
  }

  /** Constructs a Token with the given text, start and end offsets, & type. */
  public Token(String text, int start, int end, String typ) {
    this.termText = text;
    this.startOffset = start;
    this.endOffset = end;
    this.type = typ;
  }

  /** Returns the Token's term text. */
  public final String termText() {
    return this.termText;
  }

  /** Returns this Token's starting offset, the position of the first character
    corresponding to this token in the source text.

    Note that the difference between endOffset() and startOffset() may not be
    equal to termText.length(), as the term text may have been altered by a
    stemmer or some other filter. */
  public final int startOffset() {
    return this.startOffset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
    last character corresponding to this token in the source text. */
  public final int endOffset() {
    return this.endOffset;
  }

  /** Returns this Token's lexical type.  Defaults to "word". */
  public final String type() {
    return this.type;
  }

}
|
|
@ -0,0 +1,74 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** A TokenFilter is a TokenStream whose input is another token stream.
|
||||
<p>
|
||||
This is an abstract class.
|
||||
*/
|
||||
|
||||
abstract public class TokenFilter extends TokenStream {
|
||||
/** The source of tokens for this filter. */
|
||||
protected TokenStream input;
|
||||
|
||||
/** Close the input TokenStream. */
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** A TokenStream enumerates the sequence of tokens, either from
|
||||
fields of a document or from query text.
|
||||
<p>
|
||||
This is an abstract class. Concrete subclasses are:
|
||||
<ul>
|
||||
<li>{@link Tokenizer}, a TokenStream
|
||||
whose input is a Reader; and
|
||||
<li>{@link TokenFilter}, a TokenStream
|
||||
whose input is another TokenStream.
|
||||
</ul>
|
||||
*/
|
||||
|
||||
abstract public class TokenStream {
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
abstract public Token next() throws IOException;
|
||||
|
||||
/** Releases resources associated with this stream. */
|
||||
public void close() throws IOException {}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
/** A Tokenizer is a TokenStream whose input is a Reader.
|
||||
<p>
|
||||
This is an abstract class.
|
||||
*/
|
||||
|
||||
abstract public class Tokenizer extends TokenStream {
|
||||
/** The text source for this Tokenizer. */
|
||||
protected Reader input;
|
||||
|
||||
/** By default, closes the input Reader. */
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="Author" content="Doug Cutting">
|
||||
</head>
|
||||
<body>
|
||||
API and code to convert text into indexable tokens.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,6 @@
|
|||
Token.java
|
||||
StandardTokenizer.java
|
||||
StandardTokenizerTokenManager.java
|
||||
TokenMgrError.java
|
||||
CharStream.java
|
||||
StandardTokenizerConstants.java
|
|
@ -0,0 +1,159 @@
|
|||
// FastCharStream.java
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
|
||||
* this does not do line-number counting, but instead keeps track of the
|
||||
* character position of the token in the input, as required by Lucene's {@link
|
||||
* org.apache.lucene.analysis.Token} API. */
|
||||
public final class FastCharStream implements CharStream {
|
||||
char[] buffer = null;
|
||||
|
||||
int bufferLength = 0; // end of valid chars
|
||||
int bufferPosition = 0; // next char to read
|
||||
|
||||
int tokenStart = 0; // offset in buffer
|
||||
int bufferStart = 0; // position in file of buffer
|
||||
|
||||
Reader input; // source of chars
|
||||
|
||||
/** Constructs from a Reader. */
|
||||
public FastCharStream(Reader r) {
|
||||
input = r;
|
||||
}
|
||||
|
||||
public final char readChar() throws IOException {
|
||||
if (bufferPosition >= bufferLength)
|
||||
refill();
|
||||
return buffer[bufferPosition++];
|
||||
}
|
||||
|
||||
private final void refill() throws IOException {
|
||||
int newPosition = bufferLength - tokenStart;
|
||||
|
||||
if (tokenStart == 0) { // token won't fit in buffer
|
||||
if (buffer == null) { // first time: alloc buffer
|
||||
buffer = new char[2048];
|
||||
} else if (bufferLength == buffer.length) { // grow buffer
|
||||
char[] newBuffer = new char[buffer.length*2];
|
||||
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
|
||||
buffer = newBuffer;
|
||||
}
|
||||
} else { // shift token to front
|
||||
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
|
||||
}
|
||||
|
||||
bufferLength = newPosition; // update state
|
||||
bufferPosition = newPosition;
|
||||
bufferStart += tokenStart;
|
||||
tokenStart = 0;
|
||||
|
||||
int charsRead = // fill space in buffer
|
||||
input.read(buffer, newPosition, buffer.length-newPosition);
|
||||
if (charsRead == -1)
|
||||
throw new IOException("read past eof");
|
||||
else
|
||||
bufferLength += charsRead;
|
||||
}
|
||||
|
||||
public final char BeginToken() throws IOException {
|
||||
tokenStart = bufferPosition;
|
||||
return readChar();
|
||||
}
|
||||
|
||||
public final void backup(int amount) {
|
||||
bufferPosition -= amount;
|
||||
}
|
||||
|
||||
public final String GetImage() {
|
||||
return new String(buffer, tokenStart, bufferPosition - tokenStart);
|
||||
}
|
||||
|
||||
public final char[] GetSuffix(int len) {
|
||||
char[] value = new char[len];
|
||||
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
|
||||
return value;
|
||||
}
|
||||
|
||||
public final void Done() {
|
||||
try {
|
||||
input.close();
|
||||
} catch (IOException e) {
|
||||
System.err.println("Caught: " + e + "; ignoring.");
|
||||
}
|
||||
}
|
||||
|
||||
public final int getColumn() {
|
||||
return bufferStart + bufferPosition;
|
||||
}
|
||||
public final int getLine() {
|
||||
return 1;
|
||||
}
|
||||
public final int getEndColumn() {
|
||||
return bufferStart + bufferPosition;
|
||||
}
|
||||
public final int getEndLine() {
|
||||
return 1;
|
||||
}
|
||||
public final int getBeginColumn() {
|
||||
return bufferStart + tokenStart;
|
||||
}
|
||||
public final int getBeginLine() {
|
||||
return 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
ROOT = ../../../..
|
||||
|
||||
include ../../rules.mk
|
||||
|
||||
# Don't delete ParseException.java -- we've changed it by hand.
|
||||
DIRT := $(patsubst ParseException.java,,${DIRT})
|
||||
|
|
@ -0,0 +1,191 @@
|
|||
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 0.7pre6 */
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/**
|
||||
* This exception is thrown when parse errors are encountered.
|
||||
* You can explicitly create objects of this exception type by
|
||||
* calling the method generateParseException in the generated
|
||||
* parser.
|
||||
*
|
||||
* You can modify this class to customize your error reporting
|
||||
* mechanisms so long as you retain the public fields.
|
||||
*/
|
||||
public class ParseException extends java.io.IOException {
|
||||
|
||||
/**
|
||||
* This constructor is used by the method "generateParseException"
|
||||
* in the generated parser. Calling this constructor generates
|
||||
* a new object of this type with the fields "currentToken",
|
||||
* "expectedTokenSequences", and "tokenImage" set. The boolean
|
||||
* flag "specialConstructor" is also set to true to indicate that
|
||||
* this constructor was used to create this object.
|
||||
* This constructor calls its super class with the empty string
|
||||
* to force the "toString" method of parent class "Throwable" to
|
||||
* print the error message in the form:
|
||||
* ParseException: <result of getMessage>
|
||||
*/
|
||||
public ParseException(Token currentTokenVal,
|
||||
int[][] expectedTokenSequencesVal,
|
||||
String[] tokenImageVal
|
||||
)
|
||||
{
|
||||
super("");
|
||||
specialConstructor = true;
|
||||
currentToken = currentTokenVal;
|
||||
expectedTokenSequences = expectedTokenSequencesVal;
|
||||
tokenImage = tokenImageVal;
|
||||
}
|
||||
|
||||
/**
|
||||
* The following constructors are for use by you for whatever
|
||||
* purpose you can think of. Constructing the exception in this
|
||||
* manner makes the exception behave in the normal way - i.e., as
|
||||
* documented in the class "Throwable". The fields "errorToken",
|
||||
* "expectedTokenSequences", and "tokenImage" do not contain
|
||||
* relevant information. The JavaCC generated code does not use
|
||||
* these constructors.
|
||||
*/
|
||||
|
||||
public ParseException() {
|
||||
super();
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
public ParseException(String message) {
|
||||
super(message);
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* This variable determines which constructor was used to create
|
||||
* this object and thereby affects the semantics of the
|
||||
* "getMessage" method (see below).
|
||||
*/
|
||||
protected boolean specialConstructor;
|
||||
|
||||
/**
|
||||
* This is the last token that has been consumed successfully. If
|
||||
* this object has been created due to a parse error, the token
|
||||
* followng this token will (therefore) be the first error token.
|
||||
*/
|
||||
public Token currentToken;
|
||||
|
||||
/**
|
||||
* Each entry in this array is an array of integers. Each array
|
||||
* of integers represents a sequence of tokens (by their ordinal
|
||||
* values) that is expected at this point of the parse.
|
||||
*/
|
||||
public int[][] expectedTokenSequences;
|
||||
|
||||
/**
|
||||
* This is a reference to the "tokenImage" array of the generated
|
||||
* parser within which the parse error occurred. This array is
|
||||
* defined in the generated ...Constants interface.
|
||||
*/
|
||||
public String[] tokenImage;
|
||||
|
||||
/**
|
||||
* This method has the standard behavior when this object has been
|
||||
* created using the standard constructors. Otherwise, it uses
|
||||
* "currentToken" and "expectedTokenSequences" to generate a parse
|
||||
* error message and returns it. If this object has been created
|
||||
* due to a parse error, and you do not catch it (it gets thrown
|
||||
* from the parser), then this method is called during the printing
|
||||
* of the final stack trace, and hence the correct error message
|
||||
* gets displayed.
|
||||
*/
|
||||
public String getMessage() {
|
||||
if (!specialConstructor) {
|
||||
return super.getMessage();
|
||||
}
|
||||
String expected = "";
|
||||
int maxSize = 0;
|
||||
for (int i = 0; i < expectedTokenSequences.length; i++) {
|
||||
if (maxSize < expectedTokenSequences[i].length) {
|
||||
maxSize = expectedTokenSequences[i].length;
|
||||
}
|
||||
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
|
||||
expected += tokenImage[expectedTokenSequences[i][j]] + " ";
|
||||
}
|
||||
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
|
||||
expected += "...";
|
||||
}
|
||||
expected += eol + " ";
|
||||
}
|
||||
String retval = "Encountered \"";
|
||||
Token tok = currentToken.next;
|
||||
for (int i = 0; i < maxSize; i++) {
|
||||
if (i != 0) retval += " ";
|
||||
if (tok.kind == 0) {
|
||||
retval += tokenImage[0];
|
||||
break;
|
||||
}
|
||||
retval += add_escapes(tok.image);
|
||||
tok = tok.next;
|
||||
}
|
||||
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn + "." + eol;
|
||||
if (expectedTokenSequences.length == 1) {
|
||||
retval += "Was expecting:" + eol + " ";
|
||||
} else {
|
||||
retval += "Was expecting one of:" + eol + " ";
|
||||
}
|
||||
retval += expected;
|
||||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
* The end of line string for this machine.
|
||||
*/
|
||||
protected String eol = System.getProperty("line.separator", "\n");
|
||||
|
||||
/**
|
||||
* Used to convert raw characters to their escaped version
|
||||
* when these raw version cannot be used as part of an ASCII
|
||||
* string literal.
|
||||
*/
|
||||
protected String add_escapes(String str) {
|
||||
StringBuffer retval = new StringBuffer();
|
||||
char ch;
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
switch (str.charAt(i))
|
||||
{
|
||||
case 0 :
|
||||
continue;
|
||||
case '\b':
|
||||
retval.append("\\b");
|
||||
continue;
|
||||
case '\t':
|
||||
retval.append("\\t");
|
||||
continue;
|
||||
case '\n':
|
||||
retval.append("\\n");
|
||||
continue;
|
||||
case '\f':
|
||||
retval.append("\\f");
|
||||
continue;
|
||||
case '\r':
|
||||
retval.append("\\r");
|
||||
continue;
|
||||
case '\"':
|
||||
retval.append("\\\"");
|
||||
continue;
|
||||
case '\'':
|
||||
retval.append("\\\'");
|
||||
continue;
|
||||
case '\\':
|
||||
retval.append("\\\\");
|
||||
continue;
|
||||
default:
|
||||
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
|
||||
String s = "0000" + Integer.toString(ch, 16);
|
||||
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
|
||||
} else {
|
||||
retval.append(ch);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return retval.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||
* LowerCaseFilter} and {@link StopFilter}. */
|
||||
public final class StandardAnalyzer extends Analyzer {
|
||||
private Hashtable stopTable;
|
||||
|
||||
/** An array containing some common English words that are not usually useful
|
||||
for searching. */
|
||||
public static final String[] STOP_WORDS = {
|
||||
"a", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "s", "such",
|
||||
"t", "that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
|
||||
/** Builds an analyzer. */
|
||||
public StandardAnalyzer() {
|
||||
this(STOP_WORDS);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the given stop words. */
|
||||
public StandardAnalyzer(String[] stopWords) {
|
||||
stopTable = StopFilter.makeStopTable(stopWords);
|
||||
}
|
||||
|
||||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||
* StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new StopFilter(result, stopTable);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
|
||||
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
||||
|
||||
public final class StandardFilter extends TokenFilter
|
||||
implements StandardTokenizerConstants {
|
||||
|
||||
|
||||
/** Construct filtering <i>in</i>. */
|
||||
public StandardFilter(TokenStream in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];
|
||||
private static final String ACRONYM_TYPE = tokenImage[ACRONYM];
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS.
|
||||
* <p>Removes <tt>'s</tt> from the end of words.
|
||||
* <p>Removes dots from acronyms.
|
||||
*/
|
||||
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
|
||||
org.apache.lucene.analysis.Token t = input.next();
|
||||
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
String text = t.termText();
|
||||
String type = t.type();
|
||||
|
||||
if (type == APOSTROPHE_TYPE && // remove 's
|
||||
(text.endsWith("'s") || text.endsWith("'S"))) {
|
||||
return new org.apache.lucene.analysis.Token
|
||||
(text.substring(0,text.length()-2),
|
||||
t.startOffset(), t.endOffset(), type);
|
||||
|
||||
} else if (type == ACRONYM_TYPE) { // remove dots
|
||||
StringBuffer trimmed = new StringBuffer();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
if (c != '.')
|
||||
trimmed.append(c);
|
||||
}
|
||||
return new org.apache.lucene.analysis.Token
|
||||
(trimmed.toString(), t.startOffset(), t.endOffset(), type);
|
||||
|
||||
} else {
|
||||
return t;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,197 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
// JavaCC generator options for the StandardTokenizer grammar.
// Entries that are commented out are options left disabled.
options {
|
||||
STATIC = false;
|
||||
//IGNORE_CASE = true;
|
||||
//BUILD_PARSER = false;
|
||||
//UNICODE_INPUT = true;
|
||||
// The StandardTokenizer constructor in this grammar wraps its Reader in a
// FastCharStream, i.e. the character stream is supplied by this package.
USER_CHAR_STREAM = true;
|
||||
OPTIMIZE_TOKEN_MANAGER = true;
|
||||
//DEBUG_TOKEN_MANAGER = true;
|
||||
}
|
||||
PARSER_BEGIN(StandardTokenizer)
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/** A grammar-based tokenizer constructed with JavaCC.
|
||||
*
|
||||
* <p> This should be a good tokenizer for most European-language documents.
|
||||
*
|
||||
* <p>Many applications have specific tokenizer needs. If this tokenizer does
|
||||
* not suit your application, please consider copying this source code
|
||||
* directory to your project and maintaining your own grammar-based tokenizer.
|
||||
*/
|
||||
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
|
||||
|
||||
/** Constructs a tokenizer for this Reader. */
|
||||
public StandardTokenizer(Reader reader) {
|
||||
this(new FastCharStream(reader));
|
||||
this.input = reader;
|
||||
}
|
||||
}
|
||||
|
||||
PARSER_END(StandardTokenizer)
|
||||
|
||||
// Lexical specification. Expressions whose name starts with # (P, HAS_DIGIT,
// ALPHA, LETTER, DIGIT) are helper expressions referenced by the named token
// patterns; the next() production only ever consumes the non-# kinds.
TOKEN : { // token patterns
|
||||
|
||||
// basic word: a sequence of one or more digits and letters
|
||||
<ALPHANUM: (<LETTER>|<DIGIT>)+ >
|
||||
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
// use a post-filter to remove possessives
|
||||
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
|
||||
|
||||
// acronyms: U.S.A., I.B.M., etc. (every letter run is followed by a dot)
|
||||
// use a post-filter to remove dots
|
||||
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
|
||||
|
||||
// company names like AT&T and Excite@Home: two letter runs joined by one & or @
|
||||
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
|
||||
|
||||
// email addresses: local part, @, then at least two dot-separated segments
|
||||
| <EMAIL: <ALPHANUM> "@" <ALPHANUM> ("." <ALPHANUM>)+ >
|
||||
|
||||
// hostname: at least two dot-separated alphanumeric segments
|
||||
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
|
||||
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// segments are joined by a P separator and
// every other segment must have at least one digit
|
||||
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
|
||||
| <HAS_DIGIT> <P> <ALPHANUM>
|
||||
| <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
|
||||
| <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
|
||||
| <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
|
||||
| <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
|
||||
)
|
||||
>
|
||||
// separator characters allowed between the segments of a NUM
| <#P: ("_"|"-"|"/"|"."|",") >
|
||||
| <#HAS_DIGIT: // letters and digits containing at least one digit
|
||||
(<LETTER>|<DIGIT>)*
|
||||
<DIGIT>
|
||||
(<LETTER>|<DIGIT>)*
|
||||
>
|
||||
|
||||
// a run of one or more letters
| < #ALPHA: (<LETTER>)+>
|
||||
| < #LETTER: // unicode letters, listed as character ranges
|
||||
[
|
||||
"\u0041"-"\u005a",
|
||||
"\u0061"-"\u007a",
|
||||
"\u00c0"-"\u00d6",
|
||||
"\u00d8"-"\u00f6",
|
||||
"\u00f8"-"\u00ff",
|
||||
"\u0100"-"\u1fff",
|
||||
"\u3040"-"\u318f",
|
||||
"\u3300"-"\u337f",
|
||||
"\u3400"-"\u3d2d",
|
||||
"\u4e00"-"\u9fff",
|
||||
"\uf900"-"\ufaff"
|
||||
]
|
||||
>
|
||||
| < #DIGIT: // unicode digits, listed as character ranges
|
||||
[
|
||||
"\u0030"-"\u0039",
|
||||
"\u0660"-"\u0669",
|
||||
"\u06f0"-"\u06f9",
|
||||
"\u0966"-"\u096f",
|
||||
"\u09e6"-"\u09ef",
|
||||
"\u0a66"-"\u0a6f",
|
||||
"\u0ae6"-"\u0aef",
|
||||
"\u0b66"-"\u0b6f",
|
||||
"\u0be7"-"\u0bef",
|
||||
"\u0c66"-"\u0c6f",
|
||||
"\u0ce6"-"\u0cef",
|
||||
"\u0d66"-"\u0d6f",
|
||||
"\u0e50"-"\u0e59",
|
||||
"\u0ed0"-"\u0ed9",
|
||||
"\u1040"-"\u1049"
|
||||
]
|
||||
>
|
||||
}
|
||||
|
||||
// NOISE matches any single character; input that no TOKEN pattern accepts
// is consumed here and never reaches the next() production.
SKIP : { // skip unrecognized chars
|
||||
<NOISE: ~[] >
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS.
 * <p>The returned token's type is set to an element of {@link
 * StandardTokenizerConstants#tokenImage}.
 */
org.apache.lucene.analysis.Token next() throws IOException :
{
  Token token = null;
}
{
  ( token = <ALPHANUM> |
    token = <APOSTROPHE> |
    token = <ACRONYM> |
    token = <COMPANY> |
    token = <EMAIL> |
    token = <HOST> |
    token = <NUM> |
    token = <EOF>
  )
    {
      // End of input is reported to the caller as null.
      if (token.kind == EOF) {
        return null;
      }

      // Wrap the scanner token: its image becomes the term text, its begin
      // and end columns become the offsets, and the tokenImage entry for its
      // kind becomes the type.
      return
        new org.apache.lucene.analysis.Token(token.image,
                                             token.beginColumn, token.endColumn,
                                             tokenImage[token.kind]);
    }
}
|
|
@ -0,0 +1,15 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="Author" content="Doug Cutting">
|
||||
</head>
|
||||
<body>
|
||||
A grammar-based tokenizer constructed with JavaCC.
|
||||
<p>Note that JavaCC defines lots of public classes, methods and fields
|
||||
that do not need to be public. These clutter the documentation.
|
||||
Sorry.
|
||||
<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>org.apache.lucene.analysis.Token</tt>
|
||||
must always be fully qualified in source code in this package.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,109 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
/** Provides support for converting dates to strings and vice-versa.  The
 * strings are structured so that lexicographic sorting orders by date.  This
 * makes them suitable for use as field values and search terms.
 *
 * <p>A time is encoded as its non-negative millisecond value written in
 * radix {@link Character#MAX_RADIX} and left-padded with zeros to a fixed
 * width, which is what makes string order agree with time order.
 */
public class DateField {
  private DateField() {}

  // make date strings long enough to last a millennium
  private static final int DATE_LEN =
    Long.toString(1000L*365*24*60*60*1000, Character.MAX_RADIX).length();

  /** Returns the encoding of time zero, the smallest time that can be
   * encoded. */
  public static String MIN_DATE_STRING() {
    return timeToString(0);
  }

  /** Returns the largest possible encoded value: the highest digit of the
   * radix repeated over the full fixed width. */
  public static String MAX_DATE_STRING() {
    char[] buffer = new char[DATE_LEN];
    char c = Character.forDigit(Character.MAX_RADIX-1, Character.MAX_RADIX);
    for (int i = 0 ; i < DATE_LEN; i++)
      buffer[i] = c;
    return new String(buffer);
  }

  /** Converts a Date to a string suitable for indexing.
   *
   * @throws IllegalArgumentException under the same conditions as
   *         {@link #timeToString(long)}
   */
  public static String dateToString(Date date) {
    return timeToString(date.getTime());
  }

  /** Converts a millisecond time to a string suitable for indexing.
   *
   * @param time milliseconds; must not be negative
   * @return the fixed-width, zero-padded encoding of <code>time</code>
   * @throws IllegalArgumentException if <code>time</code> is negative, or so
   *         large that its encoding does not fit in the fixed width
   */
  public static String timeToString(long time) {
    if (time < 0)
      throw new IllegalArgumentException("time too early");

    String s = Long.toString(time, Character.MAX_RADIX);

    if (s.length() > DATE_LEN)
      throw new IllegalArgumentException("time too late");

    // pad with leading zeros in a single buffer instead of re-allocating a
    // new String for every added digit
    StringBuffer padded = new StringBuffer(DATE_LEN);
    for (int i = s.length(); i < DATE_LEN; i++)
      padded.append('0');
    padded.append(s);

    return padded.toString();
  }

  /** Converts a string-encoded date into a millisecond time.
   *
   * @throws NumberFormatException if <code>s</code> is not a number in radix
   *         {@link Character#MAX_RADIX}
   */
  public static long stringToTime(String s) {
    return Long.parseLong(s, Character.MAX_RADIX);
  }

  /** Converts a string-encoded date into a Date object. */
  public static Date stringToDate(String s) {
    return new Date(stringToTime(s));
  }
}
|
|
@ -0,0 +1,145 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Enumeration;
|
||||
|
||||
/** Documents are the unit of indexing and search.
|
||||
*
|
||||
* A Document is a set of fields. Each field has a name and a textual value.
|
||||
* A field may be stored with the document, in which case it is returned with
|
||||
* search hits on the document. Thus each document should typically contain
|
||||
* stored fields which uniquely identify it.
|
||||
* */
|
||||
|
||||
public final class Document {
|
||||
DocumentFieldList fieldList = null;
|
||||
|
||||
/** Constructs a new document with no fields. */
|
||||
public Document() {}
|
||||
|
||||
/** Adds a field to a document. Several fields may be added with
|
||||
* the same name. In this case, if the fields are indexed, their text is
|
||||
* treated as though appended for the purposes of search. */
|
||||
public final void add(Field field) {
|
||||
fieldList = new DocumentFieldList(field, fieldList);
|
||||
}
|
||||
|
||||
/** Returns a field with the given name if any exist in this document, or
|
||||
null. If multiple fields may exist with this name, this method returns the
|
||||
last added such added. */
|
||||
public final Field getField(String name) {
|
||||
for (DocumentFieldList list = fieldList; list != null; list = list.next)
|
||||
if (list.field.name().equals(name))
|
||||
return list.field;
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns the string value of the field with the given name if any exist in
|
||||
this document, or null. If multiple fields may exist with this name, this
|
||||
method returns the last added such added. */
|
||||
public final String get(String name) {
|
||||
Field field = getField(name);
|
||||
if (field != null)
|
||||
return field.stringValue();
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns an Enumeration of all the fields in a document. */
|
||||
public final Enumeration fields() {
|
||||
return new DocumentFieldEnumeration(this);
|
||||
}
|
||||
|
||||
/** Prints the fields of a document for human consumption. */
|
||||
public final String toString() {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
buffer.append("Document<");
|
||||
for (DocumentFieldList list = fieldList; list != null; list = list.next) {
|
||||
buffer.append(list.field.toString());
|
||||
if (list.next != null)
|
||||
buffer.append(" ");
|
||||
}
|
||||
buffer.append(">");
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
final class DocumentFieldList {
|
||||
DocumentFieldList(Field f, DocumentFieldList n) {
|
||||
field = f;
|
||||
next = n;
|
||||
}
|
||||
Field field;
|
||||
DocumentFieldList next;
|
||||
}
|
||||
|
||||
final class DocumentFieldEnumeration implements Enumeration {
|
||||
DocumentFieldList fields;
|
||||
DocumentFieldEnumeration(Document d) {
|
||||
fields = d.fieldList;
|
||||
}
|
||||
|
||||
public final boolean hasMoreElements() {
|
||||
return fields == null ? false : true;
|
||||
}
|
||||
|
||||
public final Object nextElement() {
|
||||
Field result = fields.field;
|
||||
fields = fields.next;
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/**
  A field is a section of a Document.  Each field has two parts, a name and a
  value.  Values may be free text, provided as a String or as a Reader, or they
  may be atomic keywords, which are not further processed.  Such keywords may
  be used to represent dates, urls, etc.  Fields are optionally stored in the
  index, so that they may be returned with hits on the document.

  <p>A Field cannot be modified after construction.
  */
public final class Field {
  private final String name;
  private final String stringValue;
  private final Reader readerValue;
  private final boolean isStored;
  private final boolean isIndexed;
  private final boolean isTokenized;

  /** Constructs a String-valued Field that is not tokenized, but is indexed
    and stored.  Useful for non-text fields, e.g. date or url.  */
  public static final Field Keyword(String name, String value) {
    return new Field(name, value, true, true, false);
  }

  /** Constructs a String-valued Field that is not tokenized or indexed,
    but is stored in the index, for return with hits. */
  public static final Field UnIndexed(String name, String value) {
    return new Field(name, value, true, false, false);
  }

  /** Constructs a String-valued Field that is tokenized and indexed,
    and is stored in the index, for return with hits.  Useful for short text
    fields, like "title" or "subject". */
  public static final Field Text(String name, String value) {
    return new Field(name, value, true, true, true);
  }

  /** Constructs a String-valued Field that is tokenized and indexed,
    but that is not stored in the index. */
  public static final Field UnStored(String name, String value) {
    return new Field(name, value, false, true, true);
  }

  /** Constructs a Reader-valued Field that is tokenized and indexed, but is
    not stored in the index verbatim.  Useful for longer text fields, like
    "body". */
  public static final Field Text(String name, Reader value) {
    return new Field(name, value);
  }

  /** The name of the field (e.g., "date", "subject", "title", "body", etc.)
    as an interned string. */
  public String name()          { return name; }

  /** The value of the field as a String, or null.  If null, the Reader value
    is used.  Exactly one of stringValue() and readerValue() must be set. */
  public String stringValue()           { return stringValue; }

  /** The value of the field as a Reader, or null.  If null, the String value
    is used.  Exactly one of stringValue() and readerValue() must be set. */
  public Reader readerValue()   { return readerValue; }

  /** Constructs a String-valued Field with explicitly chosen flags.
   *
   * @param name   the field name; interned before it is kept
   * @param string the field value
   * @param store  whether the value is stored in the index
   * @param index  whether the value is indexed
   * @param token  whether the value is tokenized before indexing
   * @throws IllegalArgumentException if <code>name</code> or
   *         <code>string</code> is null
   */
  public Field(String name, String string,
               boolean store, boolean index, boolean token) {
    if (name == null)
      throw new IllegalArgumentException("name cannot be null");
    if (string == null)
      throw new IllegalArgumentException("value cannot be null");

    this.name = name.intern();                    // field names are interned
    this.stringValue = string;
    this.readerValue = null;
    this.isStored = store;
    this.isIndexed = index;
    this.isTokenized = token;
  }

  /** Constructs a Reader-valued Field; such a field is always tokenized and
   * indexed, and never stored.
   *
   * @throws IllegalArgumentException if <code>name</code> or
   *         <code>reader</code> is null
   */
  Field(String name, Reader reader) {
    if (name == null)
      throw new IllegalArgumentException("name cannot be null");
    if (reader == null)
      throw new IllegalArgumentException("value cannot be null");

    this.name = name.intern();                    // field names are interned
    this.stringValue = null;
    this.readerValue = reader;
    this.isStored = false;
    this.isIndexed = true;
    this.isTokenized = true;
  }

  /** True iff the value of the field is to be stored in the index for return
    with search hits.  It is an error for this to be true if a field is
    Reader-valued. */
  public final boolean  isStored()      { return isStored; }

  /** True iff the value of the field is to be indexed, so that it may be
    searched on. */
  public final boolean  isIndexed()     { return isIndexed; }

  /** True iff the value of the field should be tokenized as text prior to
    indexing.  Un-tokenized fields are indexed as a single word and may not be
    Reader-valued. */
  public final boolean  isTokenized()   { return isTokenized; }

  /** Prints a Field for human consumption.  The prefix names the factory
   * method whose flag combination the field has; combinations that no
   * factory produces fall back to the default Object representation. */
  public final String toString() {
    if (isStored && isIndexed && !isTokenized)
      return "Keyword<" + name + ":" + stringValue + ">";
    else if (isStored && !isIndexed && !isTokenized)
      return "Unindexed<" + name + ":" + stringValue + ">";
    else if (isStored && isIndexed && isTokenized && stringValue!=null)
      return "Text<" + name + ":" + stringValue + ">";
    else if (!isStored && isIndexed && isTokenized && readerValue!=null)
      return "Text<" + name + ":" + readerValue + ">";
    else if (!isStored && isIndexed && isTokenized && stringValue!=null)
      // String-valued, tokenized and indexed but not stored: UnStored(...)
      return "UnStored<" + name + ":" + stringValue + ">";
    else
      return super.toString();
  }

}
|
|
@ -0,0 +1,2 @@
|
|||
# sub-directory makefile for lucene
|
||||
include ../rules.mk
|
|
@ -0,0 +1,10 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="Author" content="Doug Cutting">
|
||||
</head>
|
||||
<body>
|
||||
The Document abstraction.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,336 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Enumeration;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
|
||||
final class DocumentWriter {
|
||||
private Analyzer analyzer;
|
||||
private Directory directory;
|
||||
private FieldInfos fieldInfos;
|
||||
private int maxFieldLength;
|
||||
|
||||
DocumentWriter(Directory d, Analyzer a, int mfl) {
|
||||
directory = d;
|
||||
analyzer = a;
|
||||
maxFieldLength = mfl;
|
||||
}
|
||||
|
||||
final void addDocument(String segment, Document doc)
|
||||
throws IOException {
|
||||
// write field names
|
||||
fieldInfos = new FieldInfos();
|
||||
fieldInfos.add(doc);
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
||||
// write field values
|
||||
FieldsWriter fieldsWriter =
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
try {
|
||||
fieldsWriter.addDocument(doc);
|
||||
} finally {
|
||||
fieldsWriter.close();
|
||||
}
|
||||
|
||||
// invert doc into postingTable
|
||||
postingTable.clear(); // clear postingTable
|
||||
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
|
||||
invertDocument(doc);
|
||||
|
||||
// sort postingTable into an array
|
||||
Posting[] postings = sortPostingTable();
|
||||
|
||||
/*
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
Posting posting = postings[i];
|
||||
System.out.print(posting.term);
|
||||
System.out.print(" freq=" + posting.freq);
|
||||
System.out.print(" pos=");
|
||||
System.out.print(posting.positions[0]);
|
||||
for (int j = 1; j < posting.freq; j++)
|
||||
System.out.print("," + posting.positions[j]);
|
||||
System.out.println("");
|
||||
}
|
||||
*/
|
||||
|
||||
// write postings
|
||||
writePostings(postings, segment);
|
||||
|
||||
// write norms of indexed fields
|
||||
writeNorms(doc, segment);
|
||||
|
||||
}
|
||||
|
||||
// Keys are Terms, values are Postings.
|
||||
// Used to buffer a document before it is written to the index.
|
||||
private final Hashtable postingTable = new Hashtable();
|
||||
private int[] fieldLengths;
|
||||
|
||||
// Tokenizes the fields of a document into Postings.
|
||||
private final void invertDocument(Document doc)
|
||||
throws IOException {
|
||||
Enumeration fields = doc.fields();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field)fields.nextElement();
|
||||
String fieldName = field.name();
|
||||
int fieldNumber = fieldInfos.fieldNumber(fieldName);
|
||||
|
||||
int position = fieldLengths[fieldNumber]; // position in field
|
||||
|
||||
if (field.isIndexed()) {
|
||||
if (!field.isTokenized()) { // un-tokenized field
|
||||
addPosition(fieldName, field.stringValue(), position++);
|
||||
} else {
|
||||
Reader reader; // find or make Reader
|
||||
if (field.readerValue() != null)
|
||||
reader = field.readerValue();
|
||||
else if (field.stringValue() != null)
|
||||
reader = new StringReader(field.stringValue());
|
||||
else
|
||||
throw new IllegalArgumentException
|
||||
("field must have either String or Reader value");
|
||||
|
||||
// Tokenize field and add to postingTable
|
||||
TokenStream stream = analyzer.tokenStream(fieldName, reader);
|
||||
try {
|
||||
for (Token t = stream.next(); t != null; t = stream.next()) {
|
||||
addPosition(fieldName, t.termText(), position++);
|
||||
if (position > maxFieldLength) break;
|
||||
}
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
fieldLengths[fieldNumber] = position; // save field length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Term termBuffer = new Term("", ""); // avoid consing
|
||||
|
||||
private final void addPosition(String field, String text, int position) {
|
||||
termBuffer.set(field, text);
|
||||
Posting ti = (Posting)postingTable.get(termBuffer);
|
||||
if (ti != null) { // word seen before
|
||||
int freq = ti.freq;
|
||||
if (ti.positions.length == freq) { // positions array is full
|
||||
int[] newPositions = new int[freq * 2]; // double size
|
||||
int[] positions = ti.positions;
|
||||
for (int i = 0; i < freq; i++) // copy old positions to new
|
||||
newPositions[i] = positions[i];
|
||||
ti.positions = newPositions;
|
||||
}
|
||||
ti.positions[freq] = position; // add new position
|
||||
ti.freq = freq + 1; // update frequency
|
||||
}
|
||||
else { // word not seen before
|
||||
Term term = new Term(field, text, false);
|
||||
postingTable.put(term, new Posting(term, position));
|
||||
}
|
||||
}
|
||||
|
||||
private final Posting[] sortPostingTable() {
|
||||
// copy postingTable into an array
|
||||
Posting[] array = new Posting[postingTable.size()];
|
||||
Enumeration postings = postingTable.elements();
|
||||
for (int i = 0; postings.hasMoreElements(); i++)
|
||||
array[i] = (Posting)postings.nextElement();
|
||||
|
||||
// sort the array
|
||||
quickSort(array, 0, array.length - 1);
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
static private final void quickSort(Posting[] postings, int lo, int hi) {
|
||||
if(lo >= hi)
|
||||
return;
|
||||
|
||||
int mid = (lo + hi) / 2;
|
||||
|
||||
if(postings[lo].term.compareTo(postings[mid].term) > 0) {
|
||||
Posting tmp = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp;
|
||||
}
|
||||
|
||||
if(postings[mid].term.compareTo(postings[hi].term) > 0) {
|
||||
Posting tmp = postings[mid];
|
||||
postings[mid] = postings[hi];
|
||||
postings[hi] = tmp;
|
||||
|
||||
if(postings[lo].term.compareTo(postings[mid].term) > 0) {
|
||||
Posting tmp2 = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
int left = lo + 1;
|
||||
int right = hi - 1;
|
||||
|
||||
if (left >= right)
|
||||
return;
|
||||
|
||||
Term partition = postings[mid].term;
|
||||
|
||||
for( ;; ) {
|
||||
while(postings[right].term.compareTo(partition) > 0)
|
||||
--right;
|
||||
|
||||
while(left < right && postings[left].term.compareTo(partition) <= 0)
|
||||
++left;
|
||||
|
||||
if(left < right) {
|
||||
Posting tmp = postings[left];
|
||||
postings[left] = postings[right];
|
||||
postings[right] = tmp;
|
||||
--right;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
quickSort(postings, lo, left);
|
||||
quickSort(postings, left + 1, hi);
|
||||
}
|
||||
|
||||
private final void writePostings(Posting[] postings, String segment)
|
||||
throws IOException {
|
||||
OutputStream freq = null, prox = null;
|
||||
TermInfosWriter tis = null;
|
||||
|
||||
try {
|
||||
freq = directory.createFile(segment + ".frq");
|
||||
prox = directory.createFile(segment + ".prx");
|
||||
tis = new TermInfosWriter(directory, segment, fieldInfos);
|
||||
TermInfo ti = new TermInfo();
|
||||
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
Posting posting = postings[i];
|
||||
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
ti.set(1, freq.getFilePointer(), prox.getFilePointer());
|
||||
tis.add(posting.term, ti);
|
||||
|
||||
// add an entry to the freq file
|
||||
int f = posting.freq;
|
||||
if (f == 1) // optimize freq=1
|
||||
freq.writeVInt(1); // set low bit of doc num.
|
||||
else {
|
||||
freq.writeVInt(0); // the document number
|
||||
freq.writeVInt(f); // frequency in doc
|
||||
}
|
||||
|
||||
int lastPosition = 0; // write positions
|
||||
int[] positions = posting.positions;
|
||||
for (int j = 0; j < f; j++) { // use delta-encoding
|
||||
int position = positions[j];
|
||||
prox.writeVInt(position - lastPosition);
|
||||
lastPosition = position;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
if (freq != null) freq.close();
|
||||
if (prox != null) prox.close();
|
||||
if (tis != null) tis.close();
|
||||
}
|
||||
}
|
||||
|
||||
private final void writeNorms(Document doc, String segment)
|
||||
throws IOException {
|
||||
Enumeration fields = doc.fields();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field)fields.nextElement();
|
||||
if (field.isIndexed()) {
|
||||
int fieldNumber = fieldInfos.fieldNumber(field.name());
|
||||
OutputStream norm = directory.createFile(segment + ".f" + fieldNumber);
|
||||
try {
|
||||
norm.writeByte(Similarity.norm(fieldLengths[fieldNumber]));
|
||||
} finally {
|
||||
norm.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final class Posting { // info about a Term in a doc
|
||||
Term term; // the Term
|
||||
int freq; // its frequency in doc
|
||||
int[] positions; // positions it occurs at
|
||||
|
||||
Posting(Term t, int position) {
|
||||
term = t;
|
||||
freq = 1;
|
||||
positions = new int[1];
|
||||
positions[0] = position;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** Describes a single field: its name, whether it is indexed, and its number. */
final class FieldInfo {
  String name;          // field name
  boolean isIndexed;    // true when the field is indexed
  int number;           // number identifying the field

  /**
   * @param fieldName   name of the field
   * @param indexed     whether the field is indexed
   * @param fieldNumber number assigned to the field
   */
  FieldInfo(String fieldName, boolean indexed, int fieldNumber) {
    this.name = fieldName;
    this.isIndexed = indexed;
    this.number = fieldNumber;
  }
}
|
|
@ -0,0 +1,167 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Hashtable;
|
||||
import java.util.Vector;
|
||||
import java.util.Enumeration;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
|
||||
final class FieldInfos {
|
||||
private Vector byNumber = new Vector();
|
||||
private Hashtable byName = new Hashtable();
|
||||
|
||||
FieldInfos() {
|
||||
add("", false);
|
||||
}
|
||||
|
||||
FieldInfos(Directory d, String name) throws IOException {
|
||||
InputStream input = d.openFile(name);
|
||||
try {
|
||||
read(input);
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Adds field info for a Document. */
|
||||
final void add(Document doc) {
|
||||
Enumeration fields = doc.fields();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field)fields.nextElement();
|
||||
add(field.name(), field.isIndexed());
|
||||
}
|
||||
}
|
||||
|
||||
/** Merges in information from another FieldInfos. */
|
||||
final void add(FieldInfos other) {
|
||||
for (int i = 0; i < other.size(); i++) {
|
||||
FieldInfo fi = other.fieldInfo(i);
|
||||
add(fi.name, fi.isIndexed);
|
||||
}
|
||||
}
|
||||
|
||||
private final void add(String name, boolean isIndexed) {
|
||||
FieldInfo fi = fieldInfo(name);
|
||||
if (fi == null)
|
||||
addInternal(name, isIndexed);
|
||||
else if (fi.isIndexed != isIndexed)
|
||||
throw new IllegalStateException("field " + name +
|
||||
(fi.isIndexed ? " must" : " cannot") +
|
||||
" be an indexed field.");
|
||||
}
|
||||
|
||||
private final void addInternal(String name, boolean isIndexed) {
|
||||
FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size());
|
||||
byNumber.addElement(fi);
|
||||
byName.put(name, fi);
|
||||
}
|
||||
|
||||
final int fieldNumber(String fieldName) {
|
||||
FieldInfo fi = fieldInfo(fieldName);
|
||||
if (fi != null)
|
||||
return fi.number;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
final FieldInfo fieldInfo(String fieldName) {
|
||||
return (FieldInfo)byName.get(fieldName);
|
||||
}
|
||||
|
||||
final String fieldName(int fieldNumber) {
|
||||
return fieldInfo(fieldNumber).name;
|
||||
}
|
||||
|
||||
final FieldInfo fieldInfo(int fieldNumber) {
|
||||
return (FieldInfo)byNumber.elementAt(fieldNumber);
|
||||
}
|
||||
|
||||
final int size() {
|
||||
return byNumber.size();
|
||||
}
|
||||
|
||||
final void write(Directory d, String name) throws IOException {
|
||||
OutputStream output = d.createFile(name);
|
||||
try {
|
||||
write(output);
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
|
||||
final void write(OutputStream output) throws IOException {
|
||||
output.writeVInt(size());
|
||||
for (int i = 0; i < size(); i++) {
|
||||
FieldInfo fi = fieldInfo(i);
|
||||
output.writeString(fi.name);
|
||||
output.writeByte((byte)(fi.isIndexed ? 1 : 0));
|
||||
}
|
||||
}
|
||||
|
||||
private final void read(InputStream input) throws IOException {
|
||||
int size = input.readVInt();
|
||||
for (int i = 0; i < size; i++)
|
||||
addInternal(input.readString().intern(),
|
||||
input.readByte() != 0);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.Hashtable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
final class FieldsReader {
|
||||
private FieldInfos fieldInfos;
|
||||
private InputStream fieldsStream;
|
||||
private InputStream indexStream;
|
||||
private int size;
|
||||
|
||||
FieldsReader(Directory d, String segment, FieldInfos fn)
|
||||
throws IOException {
|
||||
fieldInfos = fn;
|
||||
|
||||
fieldsStream = d.openFile(segment + ".fdt");
|
||||
indexStream = d.openFile(segment + ".fdx");
|
||||
|
||||
size = (int)indexStream.length() / 8;
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
fieldsStream.close();
|
||||
indexStream.close();
|
||||
}
|
||||
|
||||
final int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
final Document doc(int n) throws IOException {
|
||||
indexStream.seek(n * 8L);
|
||||
long position = indexStream.readLong();
|
||||
fieldsStream.seek(position);
|
||||
|
||||
Document doc = new Document();
|
||||
int numFields = fieldsStream.readVInt();
|
||||
for (int i = 0; i < numFields; i++) {
|
||||
int fieldNumber = fieldsStream.readVInt();
|
||||
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
|
||||
|
||||
byte bits = fieldsStream.readByte();
|
||||
|
||||
doc.add(new Field(fi.name, // name
|
||||
fieldsStream.readString(), // read value
|
||||
true, // stored
|
||||
fi.isIndexed, // indexed
|
||||
(bits & 1) != 0)); // tokenized
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.Hashtable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
final class FieldsWriter {
|
||||
private FieldInfos fieldInfos;
|
||||
private OutputStream fieldsStream;
|
||||
private OutputStream indexStream;
|
||||
|
||||
FieldsWriter(Directory d, String segment, FieldInfos fn)
|
||||
throws IOException {
|
||||
fieldInfos = fn;
|
||||
fieldsStream = d.createFile(segment + ".fdt");
|
||||
indexStream = d.createFile(segment + ".fdx");
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
fieldsStream.close();
|
||||
indexStream.close();
|
||||
}
|
||||
|
||||
final void addDocument(Document doc) throws IOException {
|
||||
indexStream.writeLong(fieldsStream.getFilePointer());
|
||||
|
||||
int storedCount = 0;
|
||||
Enumeration fields = doc.fields();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field)fields.nextElement();
|
||||
if (field.isStored())
|
||||
storedCount++;
|
||||
}
|
||||
fieldsStream.writeVInt(storedCount);
|
||||
|
||||
fields = doc.fields();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field)fields.nextElement();
|
||||
if (field.isStored()) {
|
||||
fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
|
||||
|
||||
byte bits = 0;
|
||||
if (field.isTokenized())
|
||||
bits |= 1;
|
||||
fieldsStream.writeByte(bits);
|
||||
|
||||
fieldsStream.writeString(field.stringValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,215 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
/** IndexReader is an abstract class, providing an interface for accessing an
|
||||
index. Search of an index is done entirely through this abstract interface,
|
||||
so that any subclass which implements it is searchable.
|
||||
|
||||
<p> Concrete subclasses of IndexReader are usually constructed with a call to
|
||||
the static method {@link #open}.
|
||||
|
||||
<p> For efficiency, in this API documents are often referred to via
|
||||
<i>document numbers</i>, non-negative integers which each name a unique
|
||||
document in the index. These document numbers are ephemeral--they may change
|
||||
as documents are added to and deleted from an index. Clients should thus not
|
||||
rely on a given document having the same number between sessions. */
|
||||
|
||||
abstract public class IndexReader {
|
||||
protected IndexReader() {};
|
||||
|
||||
/** Returns an IndexReader reading the index in an FSDirectory in the named
|
||||
path. */
|
||||
public static IndexReader open(String path) throws IOException {
|
||||
return open(FSDirectory.getDirectory(path, false));
|
||||
}
|
||||
|
||||
/** Returns an IndexReader reading the index in an FSDirectory in the named
|
||||
path. */
|
||||
public static IndexReader open(File path) throws IOException {
|
||||
return open(FSDirectory.getDirectory(path, false));
|
||||
}
|
||||
|
||||
/** Returns an IndexReader reading the index in the given Directory. */
|
||||
public static IndexReader open(Directory directory) throws IOException {
|
||||
synchronized (directory) {
|
||||
SegmentInfos infos = new SegmentInfos();
|
||||
infos.read(directory);
|
||||
if (infos.size() == 1) // index is optimized
|
||||
return new SegmentReader(infos.info(0), true);
|
||||
|
||||
SegmentReader[] readers = new SegmentReader[infos.size()];
|
||||
for (int i = 0; i < infos.size(); i++)
|
||||
readers[i] = new SegmentReader(infos.info(i), i == infos.size() - 1);
|
||||
return new SegmentsReader(readers);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the time the index in the named directory was last modified. */
|
||||
public static long lastModified(String directory) throws IOException {
|
||||
return lastModified(new File(directory));
|
||||
}
|
||||
|
||||
/** Returns the time the index in the named directory was last modified. */
|
||||
public static long lastModified(File directory) throws IOException {
|
||||
return FSDirectory.fileModified(directory, "segments");
|
||||
}
|
||||
|
||||
/** Returns the time the index in this directory was last modified. */
|
||||
public static long lastModified(Directory directory) throws IOException {
|
||||
return directory.fileModified("segments");
|
||||
}
|
||||
|
||||
/** Returns the number of documents in this index. */
|
||||
abstract public int numDocs();
|
||||
/** Returns one greater than the largest possible document number.
|
||||
This may be used to, e.g., determine how big to allocate an array which
|
||||
will have an element for every document number in an index.
|
||||
*/
|
||||
abstract public int maxDoc();
|
||||
/** Returns the stored fields of the <code>n</code><sup>th</sup>
|
||||
<code>Document</code> in this index. */
|
||||
abstract public Document document(int n) throws IOException;
|
||||
|
||||
/** Returns true if document <i>n</i> has been deleted */
|
||||
abstract public boolean isDeleted(int n);
|
||||
|
||||
/** Returns the byte-encoded normalization factor for the named field of
|
||||
every document. This is used by the search code to score documents.
|
||||
@see org.apache.lucene.search.Similarity#norm
|
||||
*/
|
||||
abstract public byte[] norms(String field) throws IOException;
|
||||
|
||||
/** Returns an enumeration of all the terms in the index.
|
||||
The enumeration is ordered by Term.compareTo(). Each term
|
||||
is greater than all that precede it in the enumeration.
|
||||
*/
|
||||
abstract public TermEnum terms() throws IOException;
|
||||
/** Returns an enumeration of all terms after a given term.
|
||||
The enumeration is ordered by Term.compareTo(). Each term
|
||||
is greater than all that precede it in the enumeration.
|
||||
*/
|
||||
abstract public TermEnum terms(Term t) throws IOException;
|
||||
|
||||
/** Returns the number of documents containing the term <code>t</code>. */
|
||||
abstract public int docFreq(Term t) throws IOException;
|
||||
|
||||
/** Returns an enumeration of all the documents which contain
|
||||
<code>Term</code>. For each document, the document number, the frequency of
|
||||
the term in that document is also provided, for use in search scoring.
|
||||
Thus, this method implements the mapping:
|
||||
<p><ul>
|
||||
Term => <docNum, freq><sup>*</sup>
|
||||
</ul>
|
||||
<p>The enumeration is ordered by document number. Each document number
|
||||
is greater than all that precede it in the enumeration. */
|
||||
abstract public TermDocs termDocs(Term t) throws IOException;
|
||||
|
||||
/** Returns an enumeration of all the documents which contain
|
||||
<code>Term</code>. For each document, in addition to the document number
|
||||
and frequency of the term in that document, a list of all of the ordinal
|
||||
positions of the term in the document is available. Thus, this method
|
||||
implements the mapping:
|
||||
|
||||
<p><ul>
|
||||
Term => <docNum, freq,
|
||||
<pos<sub>1</sub>, pos<sub>2</sub>, ...
|
||||
pos<sub>freq-1</sub>>
|
||||
><sup>*</sup>
|
||||
</ul>
|
||||
<p> This positional information faciliates phrase and proximity searching.
|
||||
<p>The enumeration is ordered by document number. Each document number is
|
||||
greater than all that precede it in the enumeration. */
|
||||
abstract public TermPositions termPositions(Term t) throws IOException;
|
||||
|
||||
/** Deletes the document numbered <code>docNum</code>. Once a document is
|
||||
deleted it will not appear in TermDocs or TermPostitions enumerations.
|
||||
Attempts to read its field with the {@link #document}
|
||||
method will result in an error. The presence of this document may still be
|
||||
reflected in the {@link #docFreq} statistic, though
|
||||
this will be corrected eventually as the index is further modified. */
|
||||
abstract public void delete(int docNum) throws IOException;
|
||||
|
||||
/** Deletes all documents containing <code>term</code>.
|
||||
This is useful if one uses a document field to hold a unique ID string for
|
||||
the document. Then to delete such a document, one merely constructs a
|
||||
term with the appropriate field and the unique ID string as its text and
|
||||
passes it to this method. Returns the number of documents deleted. */
|
||||
public final int delete(Term term) throws IOException {
|
||||
TermDocs docs = termDocs(term);
|
||||
if ( docs == null ) return 0;
|
||||
int n = 0;
|
||||
try {
|
||||
while (docs.next()) {
|
||||
delete(docs.doc());
|
||||
n++;
|
||||
}
|
||||
} finally {
|
||||
docs.close();
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/** Closes files associated with this index.
|
||||
Also saves any new deletions to disk.
|
||||
No other methods should be called after this has been called. */
|
||||
abstract public void close() throws IOException;
|
||||
}
|
|
@ -0,0 +1,385 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
||||
/**
|
||||
An IndexWriter creates and maintains an index.
|
||||
|
||||
The third argument to the <a href="#IndexWriter"><b>constructor</b></a>
|
||||
determines whether a new index is created, or whether an existing index is
|
||||
opened for the addition of new documents.
|
||||
|
||||
In either case, documents are added with the <a
|
||||
href="#addDocument"><b>addDocument</b></a> method. When finished adding
|
||||
documents, <a href="#close"><b>close</b></a> should be called.
|
||||
|
||||
If an index will not have more documents added for a while and optimal search
|
||||
performance is desired, then the <a href="#optimize"><b>optimize</b></a>
|
||||
method should be called before the index is closed.
|
||||
*/
|
||||
|
||||
public final class IndexWriter {
|
||||
private Directory directory; // where this index resides
|
||||
private Analyzer analyzer; // how to analyze text
|
||||
|
||||
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
|
||||
private final Directory ramDirectory = new RAMDirectory(); // for temp segs
|
||||
|
||||
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
|
||||
be analyzed with <code>a</code>. If <code>create</code> is true, then a
|
||||
new, empty index will be created in <code>d</code>, replacing the index
|
||||
already there, if any. */
|
||||
public IndexWriter(String path, Analyzer a, boolean create)
|
||||
throws IOException {
|
||||
this(FSDirectory.getDirectory(path, create), a, create);
|
||||
}
|
||||
|
||||
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
|
||||
be analyzed with <code>a</code>. If <code>create</code> is true, then a
|
||||
new, empty index will be created in <code>d</code>, replacing the index
|
||||
already there, if any. */
|
||||
public IndexWriter(File path, Analyzer a, boolean create)
|
||||
throws IOException {
|
||||
this(FSDirectory.getDirectory(path, create), a, create);
|
||||
}
|
||||
|
||||
/** Constructs an IndexWriter for the index in <code>d</code>. Text will be
|
||||
analyzed with <code>a</code>. If <code>create</code> is true, then a new,
|
||||
empty index will be created in <code>d</code>, replacing the index already
|
||||
there, if any. */
|
||||
public IndexWriter(Directory d, Analyzer a, boolean create)
|
||||
throws IOException {
|
||||
directory = d;
|
||||
analyzer = a;
|
||||
|
||||
synchronized (directory) {
|
||||
if (create)
|
||||
segmentInfos.write(directory);
|
||||
else
|
||||
segmentInfos.read(directory);
|
||||
}
|
||||
}
|
||||
|
||||
/** Flushes all changes to an index, closes all associated files, and closes
|
||||
the directory that the index is stored in. */
|
||||
public final synchronized void close() throws IOException {
|
||||
flushRamSegments();
|
||||
ramDirectory.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
/** Returns the number of documents currently in this index. */
|
||||
public final synchronized int docCount() {
|
||||
int count = 0;
|
||||
for (int i = 0; i < segmentInfos.size(); i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
count += si.docCount;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/** The maximum number of terms that will be indexed for a single field in a
|
||||
document. This limits the amount of memory required for indexing, so that
|
||||
collections with very large files will not crash the indexing process by
|
||||
running out of memory.
|
||||
|
||||
<p>By default, no more than 10,000 terms will be indexed for a field. */
|
||||
public int maxFieldLength = 10000;
|
||||
|
||||
/** Adds a document to this index.*/
|
||||
public final void addDocument(Document doc) throws IOException {
|
||||
DocumentWriter dw =
|
||||
new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
|
||||
String segmentName = newSegmentName();
|
||||
dw.addDocument(segmentName, doc);
|
||||
synchronized (this) {
|
||||
segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
|
||||
maybeMergeSegments();
|
||||
}
|
||||
}
|
||||
|
||||
private final synchronized String newSegmentName() {
|
||||
return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
|
||||
}
|
||||
|
||||
/** Determines how often segment indexes are merged by addDocument(). With
|
||||
* smaller values, less RAM is used while indexing, and searches on
|
||||
* unoptimized indexes are faster, but indexing speed is slower. With larger
|
||||
* values more RAM is used while indexing and searches on unoptimized indexes
|
||||
* are slower, but indexing is faster. Thus larger values (> 10) are best
|
||||
* for batched index creation, and smaller values (< 10) for indexes that are
|
||||
* interactively maintained.
|
||||
*
|
||||
* <p>This must never be less than 2. The default value is 10.*/
|
||||
public int mergeFactor = 10;
|
||||
|
||||
/** Determines the largest number of documents ever merged by addDocument().
|
||||
* Small values (e.g., less than 10,000) are best for interactive indexing,
|
||||
* as this limits the length of pauses while indexing to a few seconds.
|
||||
* Larger values are best for batched indexing and speedier searches.
|
||||
*
|
||||
* <p>The default value is {@link Integer#MAX_VALUE}. */
|
||||
public int maxMergeDocs = Integer.MAX_VALUE;
|
||||
|
||||
/** If non-null, information about merges will be printed to this. */
|
||||
public PrintStream infoStream = null;
|
||||
|
||||
/** Merges all segments together into a single segment, optimizing an index
|
||||
for search. */
|
||||
public final synchronized void optimize() throws IOException {
|
||||
flushRamSegments();
|
||||
while (segmentInfos.size() > 1 ||
|
||||
(segmentInfos.size() == 1 &&
|
||||
SegmentReader.hasDeletions(segmentInfos.info(0)))){
|
||||
int minSegment = segmentInfos.size() - mergeFactor;
|
||||
mergeSegments(minSegment < 0 ? 0 : minSegment);
|
||||
}
|
||||
}
|
||||
|
||||
/** Merges all segments from an array of indexes into this index.
|
||||
*
|
||||
* <p>This may be used to parallelize batch indexing. A large document
|
||||
* collection can be broken into sub-collections. Each sub-collection can be
|
||||
* indexed in parallel, on a different thread, process or machine. The
|
||||
* complete index can then be created by merging sub-collection indexes
|
||||
* with this method.
|
||||
*
|
||||
* <p>After this completes, the index is optimized. */
|
||||
public final synchronized void addIndexes(Directory[] dirs)
|
||||
throws IOException {
|
||||
optimize(); // start with zero or 1 seg
|
||||
int minSegment = segmentInfos.size();
|
||||
int segmentsAddedSinceMerge = 0;
|
||||
for (int i = 0; i < dirs.length; i++) {
|
||||
SegmentInfos sis = new SegmentInfos(); // read infos from dir
|
||||
sis.read(dirs[i]);
|
||||
for (int j = 0; j < sis.size(); j++) {
|
||||
segmentInfos.addElement(sis.info(j)); // add each info
|
||||
|
||||
// merge whenever mergeFactor segments have been added
|
||||
if (++segmentsAddedSinceMerge == mergeFactor) {
|
||||
mergeSegments(minSegment++, false);
|
||||
segmentsAddedSinceMerge = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
optimize(); // final cleanup
|
||||
}
|
||||
|
||||
/** Merges all RAM-resident segments. */
|
||||
private final void flushRamSegments() throws IOException {
|
||||
int minSegment = segmentInfos.size()-1;
|
||||
int docCount = 0;
|
||||
while (minSegment >= 0 &&
|
||||
(segmentInfos.info(minSegment)).dir == ramDirectory) {
|
||||
docCount += segmentInfos.info(minSegment).docCount;
|
||||
minSegment--;
|
||||
}
|
||||
if (minSegment < 0 || // add one FS segment?
|
||||
(docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
|
||||
!(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
|
||||
minSegment++;
|
||||
if (minSegment >= segmentInfos.size())
|
||||
return; // none to merge
|
||||
mergeSegments(minSegment);
|
||||
}
|
||||
|
||||
/** Incremental segment merger. */
|
||||
private final void maybeMergeSegments() throws IOException {
|
||||
long targetMergeDocs = mergeFactor;
|
||||
while (targetMergeDocs <= maxMergeDocs) {
|
||||
// find segments smaller than current target size
|
||||
int minSegment = segmentInfos.size();
|
||||
int mergeDocs = 0;
|
||||
while (--minSegment >= 0) {
|
||||
SegmentInfo si = segmentInfos.info(minSegment);
|
||||
if (si.docCount >= targetMergeDocs)
|
||||
break;
|
||||
mergeDocs += si.docCount;
|
||||
}
|
||||
|
||||
if (mergeDocs >= targetMergeDocs) // found a merge to do
|
||||
mergeSegments(minSegment+1);
|
||||
else
|
||||
break;
|
||||
|
||||
targetMergeDocs *= mergeFactor; // increase target size
|
||||
}
|
||||
}
|
||||
|
||||
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
|
||||
and pushes the merged index onto the top of the segmentInfos stack. */
|
||||
private final void mergeSegments(int minSegment) throws IOException {
|
||||
mergeSegments(minSegment, true);
|
||||
}
|
||||
|
||||
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
|
||||
and pushes the merged index onto the top of the segmentInfos stack. */
|
||||
private final void mergeSegments(int minSegment, boolean delete)
|
||||
throws IOException {
|
||||
String mergedName = newSegmentName();
|
||||
int mergedDocCount = 0;
|
||||
if (infoStream != null) infoStream.print("merging segments");
|
||||
SegmentMerger merger = new SegmentMerger(directory, mergedName);
|
||||
Vector segmentsToDelete = new Vector();
|
||||
for (int i = minSegment; i < segmentInfos.size(); i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
if (infoStream != null)
|
||||
infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
|
||||
SegmentReader reader = new SegmentReader(si);
|
||||
merger.add(reader);
|
||||
if (delete)
|
||||
segmentsToDelete.addElement(reader); // queue for deletion
|
||||
mergedDocCount += si.docCount;
|
||||
}
|
||||
if (infoStream != null) {
|
||||
infoStream.println();
|
||||
infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
|
||||
}
|
||||
merger.merge();
|
||||
|
||||
segmentInfos.setSize(minSegment); // pop old infos & add new
|
||||
segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
|
||||
directory));
|
||||
|
||||
synchronized (directory) {
|
||||
segmentInfos.write(directory); // commit before deleting
|
||||
deleteSegments(segmentsToDelete); // delete now-unused segments
|
||||
}
|
||||
}
|
||||
|
||||
/* Some operating systems (e.g. Windows) don't permit a file to be deleted
|
||||
while it is opened for read (e.g. by another process or thread). So we
|
||||
assume that when a delete fails it is because the file is open in another
|
||||
process, and queue the file for subsequent deletion. */
|
||||
|
||||
private final void deleteSegments(Vector segments) throws IOException {
|
||||
Vector deletable = new Vector();
|
||||
|
||||
deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
|
||||
|
||||
for (int i = 0; i < segments.size(); i++) {
|
||||
SegmentReader reader = (SegmentReader)segments.elementAt(i);
|
||||
if (reader.directory == this.directory)
|
||||
deleteFiles(reader.files(), deletable); // try to delete our files
|
||||
else
|
||||
deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files
|
||||
}
|
||||
|
||||
writeDeleteableFiles(deletable); // note files we can't delete
|
||||
}
|
||||
|
||||
private final void deleteFiles(Vector files, Directory directory)
|
||||
throws IOException {
|
||||
for (int i = 0; i < files.size(); i++)
|
||||
directory.deleteFile((String)files.elementAt(i));
|
||||
}
|
||||
|
||||
private final void deleteFiles(Vector files, Vector deletable)
|
||||
throws IOException {
|
||||
for (int i = 0; i < files.size(); i++) {
|
||||
String file = (String)files.elementAt(i);
|
||||
try {
|
||||
directory.deleteFile(file); // try to delete each file
|
||||
} catch (IOException e) { // if delete fails
|
||||
if (directory.fileExists(file)) {
|
||||
if (infoStream != null)
|
||||
infoStream.println(e.getMessage() + "; Will re-try later.");
|
||||
deletable.addElement(file); // add to deletable
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Vector readDeleteableFiles() throws IOException {
|
||||
Vector result = new Vector();
|
||||
if (!directory.fileExists("deletable"))
|
||||
return result;
|
||||
|
||||
InputStream input = directory.openFile("deletable");
|
||||
try {
|
||||
for (int i = input.readInt(); i > 0; i--) // read file names
|
||||
result.addElement(input.readString());
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private final void writeDeleteableFiles(Vector files) throws IOException {
|
||||
OutputStream output = directory.createFile("deleteable.new");
|
||||
try {
|
||||
output.writeInt(files.size());
|
||||
for (int i = 0; i < files.size(); i++)
|
||||
output.writeString((String)files.elementAt(i));
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
directory.renameFile("deleteable.new", "deletable");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
# sub-directory makefile for lucene
|
||||
include ../rules.mk
|
|
@ -0,0 +1,69 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
final class SegmentInfo {
|
||||
public String name; // unique name in dir
|
||||
public int docCount; // number of docs in seg
|
||||
public Directory dir; // where segment resides
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir) {
|
||||
this.name = name;
|
||||
this.docCount = docCount;
|
||||
this.dir = dir;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Vector;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
|
||||
final class SegmentInfos extends Vector {
|
||||
public int counter = 0; // used to name new segments
|
||||
|
||||
public final SegmentInfo info(int i) {
|
||||
return (SegmentInfo)elementAt(i);
|
||||
}
|
||||
|
||||
public final void read(Directory directory) throws IOException {
|
||||
InputStream input = directory.openFile("segments");
|
||||
try {
|
||||
counter = input.readInt(); // read counter
|
||||
for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
|
||||
SegmentInfo si = new SegmentInfo(input.readString(), input.readInt(),
|
||||
directory);
|
||||
addElement(si);
|
||||
}
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
public final void write(Directory directory) throws IOException {
|
||||
OutputStream output = directory.createFile("segments.new");
|
||||
try {
|
||||
output.writeInt(counter); // write counter
|
||||
output.writeInt(size()); // write infos
|
||||
for (int i = 0; i < size(); i++) {
|
||||
SegmentInfo si = info(i);
|
||||
output.writeString(si.name);
|
||||
output.writeInt(si.docCount);
|
||||
}
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
|
||||
// install new segment info
|
||||
directory.renameFile("segments.new", "segments");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
|
||||
final class SegmentMergeInfo {
|
||||
Term term;
|
||||
int base;
|
||||
SegmentTermEnum termEnum;
|
||||
SegmentReader reader;
|
||||
SegmentTermPositions postings;
|
||||
int[] docMap = null; // maps around deleted docs
|
||||
|
||||
SegmentMergeInfo(int b, SegmentTermEnum te, SegmentReader r)
|
||||
throws IOException {
|
||||
base = b;
|
||||
reader = r;
|
||||
termEnum = te;
|
||||
term = te.term();
|
||||
postings = new SegmentTermPositions(r);
|
||||
|
||||
if (reader.deletedDocs != null) {
|
||||
// build array which maps document numbers around deletions
|
||||
BitVector deletedDocs = reader.deletedDocs;
|
||||
int maxDoc = reader.maxDoc();
|
||||
docMap = new int[maxDoc];
|
||||
int j = 0;
|
||||
for (int i = 0; i < maxDoc; i++) {
|
||||
if (deletedDocs.get(i))
|
||||
docMap[i] = -1;
|
||||
else
|
||||
docMap[i] = j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final boolean next() throws IOException {
|
||||
if (termEnum.next()) {
|
||||
term = termEnum.term();
|
||||
return true;
|
||||
} else {
|
||||
term = null;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
termEnum.close();
|
||||
postings.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
final class SegmentMergeQueue extends PriorityQueue {
|
||||
SegmentMergeQueue(int size) {
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
protected final boolean lessThan(Object a, Object b) {
|
||||
SegmentMergeInfo stiA = (SegmentMergeInfo)a;
|
||||
SegmentMergeInfo stiB = (SegmentMergeInfo)b;
|
||||
int comparison = stiA.term.compareTo(stiB.term);
|
||||
if (comparison == 0)
|
||||
return stiA.base < stiB.base;
|
||||
else
|
||||
return comparison < 0;
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
while (top() != null)
|
||||
((SegmentMergeInfo)pop()).close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,275 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Vector;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
|
||||
final class SegmentMerger {
|
||||
private Directory directory;
|
||||
private String segment;
|
||||
|
||||
private Vector readers = new Vector();
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
SegmentMerger(Directory dir, String name) {
|
||||
directory = dir;
|
||||
segment = name;
|
||||
}
|
||||
|
||||
final void add(SegmentReader reader) {
|
||||
readers.addElement(reader);
|
||||
}
|
||||
|
||||
final SegmentReader segmentReader(int i) {
|
||||
return (SegmentReader)readers.elementAt(i);
|
||||
}
|
||||
|
||||
final void merge() throws IOException {
|
||||
try {
|
||||
mergeFields();
|
||||
mergeTerms();
|
||||
mergeNorms();
|
||||
|
||||
} finally {
|
||||
for (int i = 0; i < readers.size(); i++) { // close readers
|
||||
SegmentReader reader = (SegmentReader)readers.elementAt(i);
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final void mergeFields() throws IOException {
|
||||
fieldInfos = new FieldInfos(); // merge field names
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
SegmentReader reader = (SegmentReader)readers.elementAt(i);
|
||||
fieldInfos.add(reader.fieldInfos);
|
||||
}
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
||||
FieldsWriter fieldsWriter = // merge field values
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
try {
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
SegmentReader reader = (SegmentReader)readers.elementAt(i);
|
||||
BitVector deletedDocs = reader.deletedDocs;
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int j = 0; j < maxDoc; j++)
|
||||
if (deletedDocs == null || !deletedDocs.get(j)) // skip deleted docs
|
||||
fieldsWriter.addDocument(reader.document(j));
|
||||
}
|
||||
} finally {
|
||||
fieldsWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
private OutputStream freqOutput = null;
|
||||
private OutputStream proxOutput = null;
|
||||
private TermInfosWriter termInfosWriter = null;
|
||||
private SegmentMergeQueue queue = null;
|
||||
|
||||
private final void mergeTerms() throws IOException {
|
||||
try {
|
||||
freqOutput = directory.createFile(segment + ".frq");
|
||||
proxOutput = directory.createFile(segment + ".prx");
|
||||
termInfosWriter =
|
||||
new TermInfosWriter(directory, segment, fieldInfos);
|
||||
|
||||
mergeTermInfos();
|
||||
|
||||
} finally {
|
||||
if (freqOutput != null) freqOutput.close();
|
||||
if (proxOutput != null) proxOutput.close();
|
||||
if (termInfosWriter != null) termInfosWriter.close();
|
||||
if (queue != null) queue.close();
|
||||
}
|
||||
}
|
||||
|
||||
private final void mergeTermInfos() throws IOException {
|
||||
queue = new SegmentMergeQueue(readers.size());
|
||||
int base = 0;
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
SegmentReader reader = (SegmentReader)readers.elementAt(i);
|
||||
SegmentTermEnum termEnum = (SegmentTermEnum)reader.terms();
|
||||
SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
|
||||
base += reader.numDocs();
|
||||
if (smi.next())
|
||||
queue.put(smi); // initialize queue
|
||||
else
|
||||
smi.close();
|
||||
}
|
||||
|
||||
SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
|
||||
|
||||
while (queue.size() > 0) {
|
||||
int matchSize = 0; // pop matching terms
|
||||
match[matchSize++] = (SegmentMergeInfo)queue.pop();
|
||||
Term term = match[0].term;
|
||||
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
|
||||
|
||||
while (top != null && term.compareTo(top.term) == 0) {
|
||||
match[matchSize++] = (SegmentMergeInfo)queue.pop();
|
||||
top = (SegmentMergeInfo)queue.top();
|
||||
}
|
||||
|
||||
mergeTermInfo(match, matchSize); // add new TermInfo
|
||||
|
||||
while (matchSize > 0) {
|
||||
SegmentMergeInfo smi = match[--matchSize];
|
||||
if (smi.next())
|
||||
queue.put(smi); // restore queue
|
||||
else
|
||||
smi.close(); // done with a segment
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final TermInfo termInfo = new TermInfo(); // minimize consing
|
||||
|
||||
private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
|
||||
throws IOException {
|
||||
long freqPointer = freqOutput.getFilePointer();
|
||||
long proxPointer = proxOutput.getFilePointer();
|
||||
|
||||
int df = appendPostings(smis, n); // append posting data
|
||||
|
||||
if (df > 0) {
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
termInfo.set(df, freqPointer, proxPointer);
|
||||
termInfosWriter.add(smis[0].term, termInfo);
|
||||
}
|
||||
}
|
||||
|
||||
private final int appendPostings(SegmentMergeInfo[] smis, int n)
|
||||
throws IOException {
|
||||
int lastDoc = 0;
|
||||
int df = 0; // number of docs w/ term
|
||||
for (int i = 0; i < n; i++) {
|
||||
SegmentMergeInfo smi = smis[i];
|
||||
SegmentTermPositions postings = smi.postings;
|
||||
int base = smi.base;
|
||||
int[] docMap = smi.docMap;
|
||||
smi.termEnum.termInfo(termInfo);
|
||||
postings.seek(termInfo);
|
||||
while (postings.next()) {
|
||||
int doc;
|
||||
if (docMap == null)
|
||||
doc = base + postings.doc; // no deletions
|
||||
else
|
||||
doc = base + docMap[postings.doc]; // re-map around deletions
|
||||
|
||||
if (doc < lastDoc)
|
||||
throw new IllegalStateException("docs out of order");
|
||||
|
||||
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
|
||||
lastDoc = doc;
|
||||
|
||||
int freq = postings.freq;
|
||||
if (freq == 1) {
|
||||
freqOutput.writeVInt(docCode | 1); // write doc & freq=1
|
||||
} else {
|
||||
freqOutput.writeVInt(docCode); // write doc
|
||||
freqOutput.writeVInt(freq); // write frequency in doc
|
||||
}
|
||||
|
||||
int lastPosition = 0; // write position deltas
|
||||
for (int j = 0; j < freq; j++) {
|
||||
int position = postings.nextPosition();
|
||||
proxOutput.writeVInt(position - lastPosition);
|
||||
lastPosition = position;
|
||||
}
|
||||
|
||||
df++;
|
||||
}
|
||||
}
|
||||
return df;
|
||||
}
|
||||
|
||||
private final void mergeNorms() throws IOException {
|
||||
for (int i = 0; i < fieldInfos.size(); i++) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(i);
|
||||
if (fi.isIndexed) {
|
||||
OutputStream output = directory.createFile(segment + ".f" + i);
|
||||
try {
|
||||
for (int j = 0; j < readers.size(); j++) {
|
||||
SegmentReader reader = (SegmentReader)readers.elementAt(j);
|
||||
BitVector deletedDocs = reader.deletedDocs;
|
||||
InputStream input = reader.normStream(fi.name);
|
||||
int maxDoc = reader.maxDoc();
|
||||
try {
|
||||
for (int k = 0; k < maxDoc; k++) {
|
||||
byte norm = input != null ? input.readByte() : (byte)0;
|
||||
if (deletedDocs == null || !deletedDocs.get(k))
|
||||
output.writeByte(norm);
|
||||
}
|
||||
} finally {
|
||||
if (input != null)
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,284 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.util.BitVector;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
final class SegmentReader extends IndexReader {
|
||||
Directory directory;
|
||||
private boolean closeDirectory = false;
|
||||
private String segment;
|
||||
|
||||
FieldInfos fieldInfos;
|
||||
private FieldsReader fieldsReader;
|
||||
|
||||
TermInfosReader tis;
|
||||
|
||||
BitVector deletedDocs = null;
|
||||
private boolean deletedDocsDirty = false;
|
||||
|
||||
private InputStream freqStream;
|
||||
private InputStream proxStream;
|
||||
|
||||
|
||||
private static class Norm {
|
||||
public Norm(InputStream in) { this.in = in; }
|
||||
public InputStream in;
|
||||
public byte[] bytes;
|
||||
}
|
||||
private Hashtable norms = new Hashtable();
|
||||
|
||||
SegmentReader(SegmentInfo si, boolean closeDir)
|
||||
throws IOException {
|
||||
this(si);
|
||||
closeDirectory = closeDir;
|
||||
}
|
||||
|
||||
SegmentReader(SegmentInfo si)
|
||||
throws IOException {
|
||||
directory = si.dir;
|
||||
segment = si.name;
|
||||
|
||||
fieldInfos = new FieldInfos(directory, segment + ".fnm");
|
||||
fieldsReader = new FieldsReader(directory, segment, fieldInfos);
|
||||
|
||||
tis = new TermInfosReader(directory, segment, fieldInfos);
|
||||
|
||||
if (hasDeletions(si))
|
||||
deletedDocs = new BitVector(directory, segment + ".del");
|
||||
|
||||
// make sure that all index files have been read or are kept open
|
||||
// so that if an index update removes them we'll still have them
|
||||
freqStream = directory.openFile(segment + ".frq");
|
||||
proxStream = directory.openFile(segment + ".prx");
|
||||
openNorms();
|
||||
}
|
||||
|
||||
public final synchronized void close() throws IOException {
|
||||
if (deletedDocsDirty) {
|
||||
synchronized (directory) {
|
||||
deletedDocs.write(directory, segment + ".tmp");
|
||||
directory.renameFile(segment + ".tmp", segment + ".del");
|
||||
}
|
||||
deletedDocsDirty = false;
|
||||
}
|
||||
|
||||
fieldsReader.close();
|
||||
tis.close();
|
||||
|
||||
if (freqStream != null)
|
||||
freqStream.close();
|
||||
if (proxStream != null)
|
||||
proxStream.close();
|
||||
|
||||
closeNorms();
|
||||
|
||||
if (closeDirectory)
|
||||
directory.close();
|
||||
}
|
||||
|
||||
final static boolean hasDeletions(SegmentInfo si) throws IOException {
|
||||
return si.dir.fileExists(si.name + ".del");
|
||||
}
|
||||
|
||||
public final synchronized void delete(int docNum) throws IOException {
|
||||
if (deletedDocs == null)
|
||||
deletedDocs = new BitVector(maxDoc());
|
||||
deletedDocsDirty = true;
|
||||
deletedDocs.set(docNum);
|
||||
}
|
||||
|
||||
final Vector files() throws IOException {
|
||||
Vector files = new Vector(16);
|
||||
files.addElement(segment + ".fnm");
|
||||
files.addElement(segment + ".fdx");
|
||||
files.addElement(segment + ".fdt");
|
||||
files.addElement(segment + ".tii");
|
||||
files.addElement(segment + ".tis");
|
||||
files.addElement(segment + ".frq");
|
||||
files.addElement(segment + ".prx");
|
||||
|
||||
if (directory.fileExists(segment + ".del"))
|
||||
files.addElement(segment + ".del");
|
||||
|
||||
for (int i = 0; i < fieldInfos.size(); i++) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(i);
|
||||
if (fi.isIndexed)
|
||||
files.addElement(segment + ".f" + i);
|
||||
}
|
||||
return files;
|
||||
}
|
||||
|
||||
public final TermEnum terms() throws IOException {
|
||||
return tis.terms();
|
||||
}
|
||||
|
||||
public final TermEnum terms(Term t) throws IOException {
|
||||
return tis.terms(t);
|
||||
}
|
||||
|
||||
public final synchronized Document document(int n) throws IOException {
|
||||
if (isDeleted(n))
|
||||
throw new IllegalArgumentException
|
||||
("attempt to access a deleted document");
|
||||
return fieldsReader.doc(n);
|
||||
}
|
||||
|
||||
public final synchronized boolean isDeleted(int n) {
|
||||
return (deletedDocs != null && deletedDocs.get(n));
|
||||
}
|
||||
|
||||
public final TermDocs termDocs(Term t) throws IOException {
|
||||
TermInfo ti = tis.get(t);
|
||||
if (ti != null)
|
||||
return new SegmentTermDocs(this, ti);
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
final InputStream getFreqStream () {
|
||||
return (InputStream)freqStream.clone();
|
||||
}
|
||||
|
||||
public final TermPositions termPositions(Term t) throws IOException {
|
||||
TermInfo ti = tis.get(t);
|
||||
if (ti != null)
|
||||
return new SegmentTermPositions(this, ti);
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
final InputStream getProxStream () {
|
||||
return (InputStream)proxStream.clone();
|
||||
}
|
||||
|
||||
public final int docFreq(Term t) throws IOException {
|
||||
TermInfo ti = tis.get(t);
|
||||
if (ti != null)
|
||||
return ti.docFreq;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
public final int numDocs() {
|
||||
int n = maxDoc();
|
||||
if (deletedDocs != null)
|
||||
n -= deletedDocs.count();
|
||||
return n;
|
||||
}
|
||||
|
||||
public final int maxDoc() {
|
||||
return fieldsReader.size();
|
||||
}
|
||||
|
||||
public final byte[] norms(String field) throws IOException {
|
||||
Norm norm = (Norm)norms.get(field);
|
||||
if (norm == null)
|
||||
return null;
|
||||
if (norm.bytes == null) {
|
||||
byte[] bytes = new byte[maxDoc()];
|
||||
norms(field, bytes, 0);
|
||||
norm.bytes = bytes;
|
||||
}
|
||||
return norm.bytes;
|
||||
}
|
||||
|
||||
final void norms(String field, byte[] bytes, int offset) throws IOException {
|
||||
InputStream normStream = normStream(field);
|
||||
if (normStream == null)
|
||||
return; // use zeros in array
|
||||
try {
|
||||
normStream.readBytes(bytes, offset, maxDoc());
|
||||
} finally {
|
||||
normStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
final InputStream normStream(String field) throws IOException {
|
||||
Norm norm = (Norm)norms.get(field);
|
||||
if (norm == null)
|
||||
return null;
|
||||
InputStream result = (InputStream)norm.in.clone();
|
||||
result.seek(0);
|
||||
return result;
|
||||
}
|
||||
|
||||
private final void openNorms() throws IOException {
|
||||
for (int i = 0; i < fieldInfos.size(); i++) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(i);
|
||||
if (fi.isIndexed)
|
||||
norms.put(fi.name,
|
||||
new Norm(directory.openFile(segment + ".f" + fi.number)));
|
||||
}
|
||||
}
|
||||
|
||||
private final void closeNorms() throws IOException {
|
||||
synchronized (norms) {
|
||||
Enumeration enum = norms.elements();
|
||||
while (enum.hasMoreElements()) {
|
||||
Norm norm = (Norm)enum.nextElement();
|
||||
norm.in.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
|
||||
class SegmentTermDocs implements TermDocs {
|
||||
protected SegmentReader parent;
|
||||
private InputStream freqStream;
|
||||
private int freqCount;
|
||||
private BitVector deletedDocs;
|
||||
int doc = 0;
|
||||
int freq;
|
||||
|
||||
SegmentTermDocs(SegmentReader p) throws IOException {
|
||||
parent = p;
|
||||
freqStream = parent.getFreqStream();
|
||||
deletedDocs = parent.deletedDocs;
|
||||
}
|
||||
|
||||
SegmentTermDocs(SegmentReader p, TermInfo ti) throws IOException {
|
||||
this(p);
|
||||
seek(ti);
|
||||
}
|
||||
|
||||
void seek(TermInfo ti) throws IOException {
|
||||
freqCount = ti.docFreq;
|
||||
doc = 0;
|
||||
freqStream.seek(ti.freqPointer);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
freqStream.close();
|
||||
}
|
||||
|
||||
public final int doc() { return doc; }
|
||||
public final int freq() { return freq; }
|
||||
|
||||
protected void skippingDoc() throws IOException {
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
while (true) {
|
||||
if (freqCount == 0)
|
||||
return false;
|
||||
|
||||
int docCode = freqStream.readVInt();
|
||||
doc += docCode >>> 1; // shift off low bit
|
||||
if ((docCode & 1) != 0) // if low bit is set
|
||||
freq = 1; // freq is one
|
||||
else
|
||||
freq = freqStream.readVInt(); // else read freq
|
||||
|
||||
freqCount--;
|
||||
|
||||
if (deletedDocs == null || !deletedDocs.get(doc))
|
||||
break;
|
||||
skippingDoc();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Optimized implementation. */
|
||||
public int read(final int[] docs, final int[] freqs)
|
||||
throws IOException {
|
||||
final int end = docs.length;
|
||||
int i = 0;
|
||||
while (i < end && freqCount > 0) {
|
||||
|
||||
// manually inlined call to next() for speed
|
||||
final int docCode = freqStream.readVInt();
|
||||
doc += docCode >>> 1; // shift off low bit
|
||||
if ((docCode & 1) != 0) // if low bit is set
|
||||
freq = 1; // freq is one
|
||||
else
|
||||
freq = freqStream.readVInt(); // else read freq
|
||||
freqCount--;
|
||||
|
||||
if (deletedDocs == null || !deletedDocs.get(doc)) {
|
||||
docs[i] = doc;
|
||||
freqs[i] = freq;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/** As yet unoptimized implementation. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
do {
|
||||
if (!next())
|
||||
return false;
|
||||
} while (target > doc);
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,184 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
|
||||
final class SegmentTermEnum extends TermEnum implements Cloneable {
|
||||
private InputStream input;
|
||||
private FieldInfos fieldInfos;
|
||||
int size;
|
||||
int position = -1;
|
||||
|
||||
private Term term = new Term("", "");
|
||||
private TermInfo termInfo = new TermInfo();
|
||||
|
||||
boolean isIndex = false;
|
||||
long indexPointer = 0;
|
||||
Term prev;
|
||||
|
||||
private char[] buffer = {};
|
||||
|
||||
SegmentTermEnum(InputStream i, FieldInfos fis, boolean isi)
|
||||
throws IOException {
|
||||
input = i;
|
||||
fieldInfos = fis;
|
||||
size = input.readInt();
|
||||
isIndex = isi;
|
||||
}
|
||||
|
||||
protected Object clone() {
|
||||
SegmentTermEnum clone = null;
|
||||
try {
|
||||
clone = (SegmentTermEnum)super.clone();
|
||||
} catch (CloneNotSupportedException e) {}
|
||||
|
||||
clone.input = (InputStream)input.clone();
|
||||
clone.termInfo = new TermInfo(termInfo);
|
||||
clone.growBuffer(term.text.length());
|
||||
|
||||
return clone;
|
||||
}
|
||||
|
||||
final void seek(long pointer, int p, Term t, TermInfo ti)
|
||||
throws IOException {
|
||||
input.seek(pointer);
|
||||
position = p;
|
||||
term = t;
|
||||
prev = null;
|
||||
termInfo.set(ti);
|
||||
growBuffer(term.text.length()); // copy term text into buffer
|
||||
}
|
||||
|
||||
/** Increments the enumeration to the next element. True if one exists.*/
|
||||
public final boolean next() throws IOException {
|
||||
if (position++ >= size-1) {
|
||||
term = null;
|
||||
return false;
|
||||
}
|
||||
|
||||
prev = term;
|
||||
term = readTerm();
|
||||
|
||||
termInfo.docFreq = input.readVInt(); // read doc freq
|
||||
termInfo.freqPointer += input.readVLong(); // read freq pointer
|
||||
termInfo.proxPointer += input.readVLong(); // read prox pointer
|
||||
|
||||
if (isIndex)
|
||||
indexPointer += input.readVLong(); // read index pointer
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private final Term readTerm() throws IOException {
|
||||
int start = input.readVInt();
|
||||
int length = input.readVInt();
|
||||
int totalLength = start + length;
|
||||
if (buffer.length < totalLength)
|
||||
growBuffer(totalLength);
|
||||
|
||||
input.readChars(buffer, start, length);
|
||||
return new Term(fieldInfos.fieldName(input.readVInt()),
|
||||
new String(buffer, 0, totalLength), false);
|
||||
}
|
||||
|
||||
private final void growBuffer(int length) {
|
||||
buffer = new char[length];
|
||||
for (int i = 0; i < term.text.length(); i++) // copy contents
|
||||
buffer[i] = term.text.charAt(i);
|
||||
}
|
||||
|
||||
/** Returns the current Term in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
public final Term term() {
|
||||
return term;
|
||||
}
|
||||
|
||||
/** Returns the current TermInfo in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
final TermInfo termInfo() {
|
||||
return new TermInfo(termInfo);
|
||||
}
|
||||
|
||||
/** Sets the argument to the current TermInfo in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
final void termInfo(TermInfo ti) {
|
||||
ti.set(termInfo);
|
||||
}
|
||||
|
||||
/** Returns the docFreq from the current TermInfo in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
public final int docFreq() {
|
||||
return termInfo.docFreq;
|
||||
}
|
||||
|
||||
/* Returns the freqPointer from the current TermInfo in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
final long freqPointer() {
|
||||
return termInfo.freqPointer;
|
||||
}
|
||||
|
||||
/* Returns the proxPointer from the current TermInfo in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
final long proxPointer() {
|
||||
return termInfo.proxPointer;
|
||||
}
|
||||
|
||||
/** Closes the enumeration to further activity, freeing resources. */
|
||||
public final void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.InputStream;
|
||||
|
||||
final class SegmentTermPositions
|
||||
extends SegmentTermDocs implements TermPositions {
|
||||
private InputStream proxStream;
|
||||
private int proxCount;
|
||||
private int position;
|
||||
|
||||
SegmentTermPositions(SegmentReader p) throws IOException {
|
||||
super(p);
|
||||
proxStream = parent.getProxStream();
|
||||
}
|
||||
|
||||
SegmentTermPositions(SegmentReader p, TermInfo ti)
|
||||
throws IOException {
|
||||
this(p);
|
||||
seek(ti);
|
||||
}
|
||||
|
||||
final void seek(TermInfo ti) throws IOException {
|
||||
super.seek(ti);
|
||||
proxStream.seek(ti.proxPointer);
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
super.close();
|
||||
proxStream.close();
|
||||
}
|
||||
|
||||
public final int nextPosition() throws IOException {
|
||||
proxCount--;
|
||||
return position += proxStream.readVInt();
|
||||
}
|
||||
|
||||
protected final void skippingDoc() throws IOException {
|
||||
for (int f = freq; f > 0; f--) // skip all positions
|
||||
proxStream.readVInt();
|
||||
}
|
||||
|
||||
public final boolean next() throws IOException {
|
||||
for (int f = proxCount; f > 0; f--) // skip unread positions
|
||||
proxStream.readVInt();
|
||||
|
||||
if (super.next()) { // run super
|
||||
proxCount = freq; // note frequency
|
||||
position = 0; // reset position
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public final int read(final int[] docs, final int[] freqs)
|
||||
throws IOException {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,329 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
final class SegmentsReader extends IndexReader {
|
||||
protected SegmentReader[] readers;
|
||||
protected int[] starts; // 1st docno for each segment
|
||||
private Hashtable normsCache = new Hashtable();
|
||||
private int maxDoc = 0;
|
||||
private int numDocs = -1;
|
||||
|
||||
SegmentsReader(SegmentReader[] r) throws IOException {
|
||||
readers = r;
|
||||
starts = new int[readers.length + 1]; // build starts array
|
||||
for (int i = 0; i < readers.length; i++) {
|
||||
starts[i] = maxDoc;
|
||||
maxDoc += readers[i].maxDoc(); // compute maxDocs
|
||||
}
|
||||
starts[readers.length] = maxDoc;
|
||||
}
|
||||
|
||||
public final int numDocs() {
|
||||
if (numDocs == -1) { // check cache
|
||||
int n = 0; // cache miss--recompute
|
||||
for (int i = 0; i < readers.length; i++)
|
||||
n += readers[i].numDocs(); // sum from readers
|
||||
numDocs = n;
|
||||
}
|
||||
return numDocs;
|
||||
}
|
||||
|
||||
public final int maxDoc() {
|
||||
return maxDoc;
|
||||
}
|
||||
|
||||
public final Document document(int n) throws IOException {
|
||||
int i = readerIndex(n); // find segment num
|
||||
return readers[i].document(n - starts[i]); // dispatch to segment reader
|
||||
}
|
||||
|
||||
public final boolean isDeleted(int n) {
|
||||
int i = readerIndex(n); // find segment num
|
||||
return readers[i].isDeleted(n - starts[i]); // dispatch to segment reader
|
||||
}
|
||||
|
||||
public final void delete(int n) throws IOException {
|
||||
numDocs = -1; // invalidate cache
|
||||
int i = readerIndex(n); // find segment num
|
||||
readers[i].delete(n - starts[i]); // dispatch to segment reader
|
||||
}
|
||||
|
||||
private final int readerIndex(int n) { // find reader for doc n:
|
||||
int lo = 0; // search starts array
|
||||
int hi = readers.length - 1; // for first element less
|
||||
// than n, return its index
|
||||
while (hi >= lo) {
|
||||
int mid = (lo + hi) >> 1;
|
||||
int midValue = starts[mid];
|
||||
if (n < midValue)
|
||||
hi = mid - 1;
|
||||
else if (n > midValue)
|
||||
lo = mid + 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
|
||||
public final synchronized byte[] norms(String field) throws IOException {
|
||||
byte[] bytes = (byte[])normsCache.get(field);
|
||||
if (bytes != null)
|
||||
return bytes; // cache hit
|
||||
|
||||
bytes = new byte[maxDoc()];
|
||||
for (int i = 0; i < readers.length; i++)
|
||||
readers[i].norms(field, bytes, starts[i]);
|
||||
normsCache.put(field, bytes); // update cache
|
||||
return bytes;
|
||||
}
|
||||
|
||||
public final TermEnum terms() throws IOException {
|
||||
return new SegmentsTermEnum(readers, starts, null);
|
||||
}
|
||||
|
||||
public final TermEnum terms(Term term) throws IOException {
|
||||
return new SegmentsTermEnum(readers, starts, term);
|
||||
}
|
||||
|
||||
public final int docFreq(Term t) throws IOException {
|
||||
int total = 0; // sum freqs in segments
|
||||
for (int i = 0; i < readers.length; i++)
|
||||
total += readers[i].docFreq(t);
|
||||
return total;
|
||||
}
|
||||
|
||||
public final TermDocs termDocs(Term term) throws IOException {
|
||||
return new SegmentsTermDocs(readers, starts, term);
|
||||
}
|
||||
|
||||
public final TermPositions termPositions(Term term) throws IOException {
|
||||
return new SegmentsTermPositions(readers, starts, term);
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
for (int i = 0; i < readers.length; i++)
|
||||
readers[i].close();
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentsTermEnum extends TermEnum {
|
||||
private SegmentMergeQueue queue;
|
||||
|
||||
private Term term;
|
||||
private int docFreq;
|
||||
|
||||
SegmentsTermEnum(SegmentReader[] readers, int[] starts, Term t)
|
||||
throws IOException {
|
||||
queue = new SegmentMergeQueue(readers.length);
|
||||
for (int i = 0; i < readers.length; i++) {
|
||||
SegmentReader reader = readers[i];
|
||||
SegmentTermEnum termEnum;
|
||||
|
||||
if (t != null) {
|
||||
termEnum = (SegmentTermEnum)reader.terms(t);
|
||||
} else
|
||||
termEnum = (SegmentTermEnum)reader.terms();
|
||||
|
||||
SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
|
||||
if (t == null ? smi.next() : termEnum.term() != null)
|
||||
queue.put(smi); // initialize queue
|
||||
else
|
||||
smi.close();
|
||||
}
|
||||
|
||||
if (t != null && queue.size() > 0) {
|
||||
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
|
||||
term = top.termEnum.term();
|
||||
docFreq = top.termEnum.docFreq();
|
||||
}
|
||||
}
|
||||
|
||||
public final boolean next() throws IOException {
|
||||
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
|
||||
if (top == null) {
|
||||
term = null;
|
||||
return false;
|
||||
}
|
||||
|
||||
term = top.term;
|
||||
docFreq = 0;
|
||||
|
||||
while (top != null && term.compareTo(top.term) == 0) {
|
||||
queue.pop();
|
||||
docFreq += top.termEnum.docFreq(); // increment freq
|
||||
if (top.next())
|
||||
queue.put(top); // restore queue
|
||||
else
|
||||
top.close(); // done with a segment
|
||||
top = (SegmentMergeInfo)queue.top();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public final Term term() {
|
||||
return term;
|
||||
}
|
||||
|
||||
public final int docFreq() {
|
||||
return docFreq;
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
queue.close();
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentsTermDocs implements TermDocs {
|
||||
protected SegmentReader[] readers;
|
||||
protected int[] starts;
|
||||
protected Term term;
|
||||
|
||||
protected int base = 0;
|
||||
protected int pointer = 0;
|
||||
|
||||
SegmentsTermDocs(SegmentReader[] r, int[] s, Term t) {
|
||||
readers = r;
|
||||
starts = s;
|
||||
term = t;
|
||||
}
|
||||
|
||||
protected SegmentTermDocs current;
|
||||
|
||||
public final int doc() {
|
||||
return base + current.doc;
|
||||
}
|
||||
public final int freq() {
|
||||
return current.freq;
|
||||
}
|
||||
|
||||
public final boolean next() throws IOException {
|
||||
if (current != null && current.next()) {
|
||||
return true;
|
||||
} else if (pointer < readers.length) {
|
||||
if (current != null)
|
||||
current.close();
|
||||
base = starts[pointer];
|
||||
current = termDocs(readers[pointer++]);
|
||||
return next();
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Optimized implementation. */
|
||||
public final int read(final int[] docs, final int[] freqs)
|
||||
throws IOException {
|
||||
while (true) {
|
||||
while (current == null) {
|
||||
if (pointer < readers.length) { // try next segment
|
||||
base = starts[pointer];
|
||||
current = termDocs(readers[pointer++]);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
int end = current.read(docs, freqs);
|
||||
if (end == 0) { // none left in segment
|
||||
current.close();
|
||||
current = null;
|
||||
} else { // got some
|
||||
final int b = base; // adjust doc numbers
|
||||
for (int i = 0; i < end; i++)
|
||||
docs[i] += b;
|
||||
return end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** As yet unoptimized implementation. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
do {
|
||||
if (!next())
|
||||
return false;
|
||||
} while (target > doc());
|
||||
return true;
|
||||
}
|
||||
|
||||
protected SegmentTermDocs termDocs(SegmentReader reader)
|
||||
throws IOException {
|
||||
return (SegmentTermDocs)reader.termDocs(term);
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
if (current != null)
|
||||
current.close();
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentsTermPositions extends SegmentsTermDocs implements TermPositions {
|
||||
SegmentsTermPositions(SegmentReader[] r, int[] s, Term t) {
|
||||
super(r,s,t);
|
||||
}
|
||||
|
||||
protected final SegmentTermDocs termDocs(SegmentReader reader)
|
||||
throws IOException {
|
||||
return (SegmentTermDocs)reader.termPositions(term);
|
||||
}
|
||||
|
||||
public final int nextPosition() throws IOException {
|
||||
return ((SegmentTermPositions)current).nextPosition();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,122 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
A Term represents a word from text. This is the unit of search. It is
|
||||
composed of two elements, the text of the word, as a string, and the name of
|
||||
the field that the text occured in, an interned string.
|
||||
|
||||
Note that terms may represent more than words from text fields, but also
|
||||
things like dates, email addresses, urls, etc. */
|
||||
|
||||
public final class Term {
|
||||
String field;
|
||||
String text;
|
||||
|
||||
/** Constructs a Term with the given field and text. */
|
||||
public Term(String fld, String txt) {
|
||||
this(fld, txt, true);
|
||||
}
|
||||
Term(String fld, String txt, boolean intern) {
|
||||
field = intern ? fld.intern() : fld; // field names are interned
|
||||
text = txt; // unless already known to be
|
||||
}
|
||||
|
||||
/** Returns the field of this term, an interned string. The field indicates
|
||||
the part of a document which this term came from. */
|
||||
public final String field() { return field; }
|
||||
|
||||
/** Returns the text of this term. In the case of words, this is simply the
|
||||
text of the word. In the case of dates and other types, this is an
|
||||
encoding of the object as a string. */
|
||||
public final String text() { return text; }
|
||||
|
||||
/** Compares two terms, returning true iff they have the same
|
||||
field and text. */
|
||||
public final boolean equals(Object o) {
|
||||
if (o == null)
|
||||
return false;
|
||||
Term other = (Term)o;
|
||||
return field == other.field && text.equals(other.text);
|
||||
}
|
||||
|
||||
/** Combines the hashCode() of the field and the text. */
|
||||
public final int hashCode() {
|
||||
return field.hashCode() + text.hashCode();
|
||||
}
|
||||
|
||||
/** Compares two terms, returning an integer which is less than zero iff this
|
||||
term belongs after the argument, equal zero iff this term is equal to the
|
||||
argument, and greater than zero iff this term belongs after the argument.
|
||||
|
||||
The ordering of terms is first by field, then by text.*/
|
||||
public final int compareTo(Term other) {
|
||||
if (field == other.field) // fields are interned
|
||||
return text.compareTo(other.text);
|
||||
else
|
||||
return field.compareTo(other.field);
|
||||
}
|
||||
|
||||
/** Resets the field and text of a Term. */
|
||||
final void set(String fld, String txt) {
|
||||
field = fld;
|
||||
text = txt;
|
||||
}
|
||||
|
||||
public final String toString() {
|
||||
return "Term<" + field + ":" + text + ">";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
/**
 * Enumerates the &lt;document, frequency&gt; pairs recorded for a term.
 *
 * <p>The document part of each pair identifies, by number, a document that
 * contains the term.  The frequency part is the number of times the term
 * occurred in that document.
 *
 * <p>Pairs are delivered in increasing document-number order.
 *
 * @see IndexReader#termDocs
 */
public interface TermDocs {

  /**
   * Returns the current document number.
   *
   * <p>The value is invalid until {@link #next()} has been called for the
   * first time.
   */
  int doc();

  /**
   * Returns how often the term occurs in the current document.
   *
   * <p>The value is invalid until {@link #next()} has been called for the
   * first time.
   */
  int freq();

  /**
   * Advances to the next pair of the enumeration.
   *
   * @return true iff such a next pair exists
   */
  boolean next() throws IOException;

  /**
   * Bulk-reads entries from the enumeration, at most <i>docs.length</i> of
   * them.  Document numbers go into <i>docs</i> and the matching term
   * frequencies into <i>freqs</i>; <i>freqs</i> must be at least as long as
   * <i>docs</i>.
   *
   * @return the number of entries read; zero is only returned once the
   *         stream has been exhausted
   */
  int read(int[] docs, int[] freqs) throws IOException;

  /**
   * Skips ahead to the first entry beyond the current one whose document
   * number is greater than or equal to <i>target</i>.
   *
   * <p>Behaves as if written:
   * <pre>
   *   public boolean skipTo(int target) {
   *     do {
   *       if (!next())
   *         return false;
   *     } while (target &gt; doc());
   *     return true;
   *   }
   * </pre>
   * Some implementations are considerably more efficient than that.
   *
   * @return true iff there is such an entry
   */
  boolean skipTo(int target) throws IOException;

  /** Releases the resources associated with this enumeration. */
  void close() throws IOException;
}
|
||||
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Abstract class for enumerating terms.
|
||||
|
||||
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
|
||||
the enumeration is greater than all that precede it. */
|
||||
|
||||
public abstract class TermEnum {
|
||||
/** Increments the enumeration to the next element. True if one exists.*/
|
||||
abstract public boolean next() throws IOException;
|
||||
|
||||
/** Returns the current Term in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
abstract public Term term();
|
||||
|
||||
/** Returns the docFreq of the current Term in the enumeration.
|
||||
Initially invalid, valid after next() called for the first time.*/
|
||||
abstract public int docFreq();
|
||||
|
||||
/** Closes the enumeration to further activity, freeing resources. */
|
||||
abstract public void close() throws IOException;
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** A TermInfo is the record of information stored for a term.*/
|
||||
|
||||
final class TermInfo {
|
||||
/** The number of documents which contain the term. */
|
||||
int docFreq = 0;
|
||||
|
||||
long freqPointer = 0;
|
||||
long proxPointer = 0;
|
||||
|
||||
TermInfo() {}
|
||||
|
||||
TermInfo(int df, long fp, long pp) {
|
||||
docFreq = df;
|
||||
freqPointer = fp;
|
||||
proxPointer = pp;
|
||||
}
|
||||
|
||||
TermInfo(TermInfo ti) {
|
||||
docFreq = ti.docFreq;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
}
|
||||
|
||||
final void set(int df, long fp, long pp) {
|
||||
docFreq = df;
|
||||
freqPointer = fp;
|
||||
proxPointer = pp;
|
||||
}
|
||||
|
||||
final void set(TermInfo ti) {
|
||||
docFreq = ti.docFreq;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,222 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
|
||||
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
||||
* Directory. Pairs are accessed either by Term or by ordinal position the
|
||||
* set. */
|
||||
|
||||
final class TermInfosReader {
|
||||
private Directory directory;
|
||||
private String segment;
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
private SegmentTermEnum enum;
|
||||
private int size;
|
||||
|
||||
TermInfosReader(Directory dir, String seg, FieldInfos fis)
|
||||
throws IOException {
|
||||
directory = dir;
|
||||
segment = seg;
|
||||
fieldInfos = fis;
|
||||
|
||||
enum = new SegmentTermEnum(directory.openFile(segment + ".tis"),
|
||||
fieldInfos, false);
|
||||
size = enum.size;
|
||||
readIndex();
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
if (enum != null)
|
||||
enum.close();
|
||||
}
|
||||
|
||||
/** Returns the number of term/value pairs in the set. */
|
||||
final int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
Term[] indexTerms = null;
|
||||
TermInfo[] indexInfos;
|
||||
long[] indexPointers;
|
||||
|
||||
private final void readIndex() throws IOException {
|
||||
SegmentTermEnum indexEnum =
|
||||
new SegmentTermEnum(directory.openFile(segment + ".tii"),
|
||||
fieldInfos, true);
|
||||
try {
|
||||
int indexSize = indexEnum.size;
|
||||
|
||||
indexTerms = new Term[indexSize];
|
||||
indexInfos = new TermInfo[indexSize];
|
||||
indexPointers = new long[indexSize];
|
||||
|
||||
for (int i = 0; indexEnum.next(); i++) {
|
||||
indexTerms[i] = indexEnum.term();
|
||||
indexInfos[i] = indexEnum.termInfo();
|
||||
indexPointers[i] = indexEnum.indexPointer;
|
||||
}
|
||||
} finally {
|
||||
indexEnum.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the offset of the greatest index entry which is less than term.*/
|
||||
private final int getIndexOffset(Term term) throws IOException {
|
||||
int lo = 0; // binary search indexTerms[]
|
||||
int hi = indexTerms.length - 1;
|
||||
|
||||
while (hi >= lo) {
|
||||
int mid = (lo + hi) >> 1;
|
||||
int delta = term.compareTo(indexTerms[mid]);
|
||||
if (delta < 0)
|
||||
hi = mid - 1;
|
||||
else if (delta > 0)
|
||||
lo = mid + 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
|
||||
private final void seekEnum(int indexOffset) throws IOException {
|
||||
enum.seek(indexPointers[indexOffset],
|
||||
(indexOffset * TermInfosWriter.INDEX_INTERVAL) - 1,
|
||||
indexTerms[indexOffset], indexInfos[indexOffset]);
|
||||
}
|
||||
|
||||
/** Returns the TermInfo for a Term in the set, or null. */
|
||||
final synchronized TermInfo get(Term term) throws IOException {
|
||||
if (size == 0) return null;
|
||||
|
||||
// optimize sequential access: first try scanning cached enum w/o seeking
|
||||
if (enum.term() != null // term is at or past current
|
||||
&& ((enum.prev != null && term.compareTo(enum.prev) > 0)
|
||||
|| term.compareTo(enum.term()) >= 0)) {
|
||||
int enumOffset = (enum.position/TermInfosWriter.INDEX_INTERVAL)+1;
|
||||
if (indexTerms.length == enumOffset // but before end of block
|
||||
|| term.compareTo(indexTerms[enumOffset]) < 0)
|
||||
return scanEnum(term); // no need to seek
|
||||
}
|
||||
|
||||
// random-access: must seek
|
||||
seekEnum(getIndexOffset(term));
|
||||
return scanEnum(term);
|
||||
}
|
||||
|
||||
/** Scans within block for matching term. */
|
||||
private final TermInfo scanEnum(Term term) throws IOException {
|
||||
while (term.compareTo(enum.term()) > 0 && enum.next()) {}
|
||||
if (enum.term() != null && term.compareTo(enum.term()) == 0)
|
||||
return enum.termInfo();
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns the nth term in the set. */
|
||||
final synchronized Term get(int position) throws IOException {
|
||||
if (size == 0) return null;
|
||||
|
||||
if (enum != null && enum.term() != null && position >= enum.position &&
|
||||
position < (enum.position + TermInfosWriter.INDEX_INTERVAL))
|
||||
return scanEnum(position); // can avoid seek
|
||||
|
||||
seekEnum(position / TermInfosWriter.INDEX_INTERVAL); // must seek
|
||||
return scanEnum(position);
|
||||
}
|
||||
|
||||
private final Term scanEnum(int position) throws IOException {
|
||||
while(enum.position < position)
|
||||
if (!enum.next())
|
||||
return null;
|
||||
|
||||
return enum.term();
|
||||
}
|
||||
|
||||
/** Returns the position of a Term in the set or -1. */
|
||||
final synchronized int getPosition(Term term) throws IOException {
|
||||
if (size == 0) return -1;
|
||||
|
||||
int indexOffset = getIndexOffset(term);
|
||||
seekEnum(indexOffset);
|
||||
|
||||
while(term.compareTo(enum.term()) > 0 && enum.next()) {}
|
||||
|
||||
if (term.compareTo(enum.term()) == 0)
|
||||
return enum.position;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** Returns an enumeration of all the Terms and TermInfos in the set. */
|
||||
final synchronized SegmentTermEnum terms() throws IOException {
|
||||
if (enum.position != -1) // if not at start
|
||||
seekEnum(0); // reset to start
|
||||
return (SegmentTermEnum)enum.clone();
|
||||
}
|
||||
|
||||
/** Returns an enumeration of terms starting at or after the named term. */
|
||||
final synchronized SegmentTermEnum terms(Term term) throws IOException {
|
||||
get(term); // seek enum to term
|
||||
return (SegmentTermEnum)enum.clone();
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,159 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
||||
Directory. A TermInfos can be written once, in order. */
|
||||
|
||||
final class TermInfosWriter {
|
||||
private FieldInfos fieldInfos;
|
||||
private OutputStream output;
|
||||
private Term lastTerm = new Term("", "");
|
||||
private TermInfo lastTi = new TermInfo();
|
||||
private int size = 0;
|
||||
|
||||
static final int INDEX_INTERVAL = 128;
|
||||
private long lastIndexPointer = 0;
|
||||
private boolean isIndex = false;
|
||||
|
||||
private TermInfosWriter other = null;
|
||||
|
||||
TermInfosWriter(Directory directory, String segment, FieldInfos fis)
|
||||
throws IOException, SecurityException {
|
||||
initialize(directory, segment, fis, false);
|
||||
other = new TermInfosWriter(directory, segment, fis, true);
|
||||
other.other = this;
|
||||
}
|
||||
|
||||
private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
|
||||
boolean isIndex) throws IOException {
|
||||
initialize(directory, segment, fis, isIndex);
|
||||
}
|
||||
|
||||
private void initialize(Directory directory, String segment, FieldInfos fis,
|
||||
boolean isi) throws IOException {
|
||||
fieldInfos = fis;
|
||||
isIndex = isi;
|
||||
output = directory.createFile(segment + (isIndex ? ".tii" : ".tis"));
|
||||
output.writeInt(0); // leave space for size
|
||||
}
|
||||
|
||||
/** Adds a new <Term, TermInfo> pair to the set.
|
||||
Term must be lexicographically greater than all previous Terms added.
|
||||
TermInfo pointers must be positive and greater than all previous.*/
|
||||
final void add(Term term, TermInfo ti)
|
||||
throws IOException, SecurityException {
|
||||
if (!isIndex && term.compareTo(lastTerm) <= 0)
|
||||
throw new IOException("term out of order");
|
||||
if (ti.freqPointer < lastTi.freqPointer)
|
||||
throw new IOException("freqPointer out of order");
|
||||
if (ti.proxPointer < lastTi.proxPointer)
|
||||
throw new IOException("proxPointer out of order");
|
||||
|
||||
if (!isIndex && size % INDEX_INTERVAL == 0)
|
||||
other.add(lastTerm, lastTi); // add an index term
|
||||
|
||||
writeTerm(term); // write term
|
||||
output.writeVInt(ti.docFreq); // write doc freq
|
||||
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
|
||||
output.writeVLong(ti.proxPointer - lastTi.proxPointer);
|
||||
|
||||
if (isIndex) {
|
||||
output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
|
||||
lastIndexPointer = other.output.getFilePointer(); // write pointer
|
||||
}
|
||||
|
||||
lastTi.set(ti);
|
||||
size++;
|
||||
}
|
||||
|
||||
private final void writeTerm(Term term)
|
||||
throws IOException {
|
||||
int start = stringDifference(lastTerm.text, term.text);
|
||||
int length = term.text.length() - start;
|
||||
|
||||
output.writeVInt(start); // write shared prefix length
|
||||
output.writeVInt(length); // write delta length
|
||||
output.writeChars(term.text, start, length); // write delta chars
|
||||
|
||||
output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num
|
||||
|
||||
lastTerm = term;
|
||||
}
|
||||
|
||||
private static final int stringDifference(String s1, String s2) {
|
||||
int len1 = s1.length();
|
||||
int len2 = s2.length();
|
||||
int len = len1 < len2 ? len1 : len2;
|
||||
for (int i = 0; i < len; i++)
|
||||
if (s1.charAt(i) != s2.charAt(i))
|
||||
return i;
|
||||
return len;
|
||||
}
|
||||
|
||||
/** Called to complete TermInfos creation. */
|
||||
final void close() throws IOException, SecurityException {
|
||||
output.seek(0); // write size at start
|
||||
output.writeInt(size);
|
||||
output.close();
|
||||
|
||||
if (!isIndex)
|
||||
other.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
|
||||
/** TermPositions provides an interface for enumerating the <document,
|
||||
frequency, <position>* > tuples for a term. <p> The document and
|
||||
frequency are as for a TermDocs. The positions portion lists the ordinal
|
||||
positions of each occurence of a term in a document.
|
||||
@see IndexReader#termPositions
|
||||
*/
|
||||
|
||||
public interface TermPositions extends TermDocs {
|
||||
/** Returns next position in the current document. It is an error to call
|
||||
this more than {@link #freq()} times
|
||||
without calling {@link #next()}<p> This is
|
||||
invalid until {@link #next()} is called for
|
||||
the first time.*/
|
||||
public int nextPosition() throws IOException;
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="Author" content="Doug Cutting">
|
||||
</head>
|
||||
<body>
|
||||
Code to maintain and access indices.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,8 @@
|
|||
|
||||
Name: com/lucene
|
||||
Specification-Title: Lucene Search Engine
|
||||
Specification-Version: $Name$
|
||||
Specification-Vendor: Lucene
|
||||
Implementation-Title: com.lucene
|
||||
Implementation-Version: $Name$ $Date$
|
||||
Implementation-Vendor: Lucene
|
|
@ -0,0 +1,6 @@
|
|||
QueryParser.java
|
||||
TokenMgrError.java
|
||||
ParseException.java
|
||||
Token.java
|
||||
TokenManager.java
|
||||
QueryParserConstants.java
|
|
@ -0,0 +1,2 @@
|
|||
# sub-directory makefile for lucene
|
||||
include ../rules.mk
|
|
@ -0,0 +1,366 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
|
||||
options {
|
||||
STATIC= false;
|
||||
}
|
||||
|
||||
PARSER_BEGIN(QueryParser)
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
|
||||
import java.util.Vector;
|
||||
import java.io.*;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.search.*;
|
||||
|
||||
/**
|
||||
* This class is generated by JavaCC. The only method that clients should need
|
||||
* to call is <a href="#parse">parse()</a>.
|
||||
*
|
||||
* The syntax for query strings is as follows:
|
||||
* A Query is a series of clauses.
|
||||
* A clause may be prefixed by:
|
||||
* <ul>
|
||||
* <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating
|
||||
* that the clause is required or prohibited respectively; or
|
||||
* <li> a term followed by a colon, indicating the field to be searched.
|
||||
* This enables one to construct queries which search multiple fields.
|
||||
* </ul>
|
||||
*
|
||||
* A clause may be either:
|
||||
* <ul>
|
||||
* <li> a term, indicating all the documents that contain this term; or
|
||||
* <li> a nested query, enclosed in parentheses. Note that this may be used
|
||||
* with a <code>+</code>/<code>-</code> prefix to require any of a set of
|
||||
* terms.
|
||||
* </ul>
|
||||
*
|
||||
* Thus, in BNF, the query grammar is:
|
||||
* <pre>
|
||||
* Query ::= ( Clause )*
|
||||
* Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
public class QueryParser {
  /** Parses a query string, returning a
   * <a href="lucene.search.Query.html">Query</a>.
   * Convenience wrapper: builds a new parser for the given default field and
   * analyzer and delegates to {@link #parse(String)}.
   * @param query the query string to be parsed.
   * @param field the default field for query terms.
   * @param analyzer used to find terms in the query text.
   * @throws ParseException propagated from the generated parser.
   */
  static public Query parse(String query, String field, Analyzer analyzer)
      throws ParseException {
    QueryParser parser = new QueryParser(field, analyzer);
    return parser.parse(query);
  }

  /** Analyzer used to turn term and phrase text into index terms. */
  Analyzer analyzer;
  /** Default field for clauses without an explicit <code>field:</code> prefix. */
  String field;
  /** Slop set on every PhraseQuery built by this parser. */
  int phraseSlop = 0;

  /** Constructs a query parser.
   * @param f the default field for query terms.
   * @param a used to find terms in the query text.
   */
  public QueryParser(String f, Analyzer a) {
    // The generated constructor needs a Reader; parse() re-initializes the
    // parser with the real query text via ReInit().
    this(new StringReader(""));
    analyzer = a;
    field = f;
  }

  /** Parses a query string, returning a
   * <a href="lucene.search.Query.html">Query</a>.
   * @param query the query string to be parsed.
   * @throws ParseException propagated from the generated parser.
   */
  public Query parse(String query) throws ParseException {
    ReInit(new StringReader(query));
    return Query(field);
  }

  /** Sets the default slop for phrases.  If zero, then exact phrase matches
   * are required.  Zero by default.
   */
  public void setPhraseSlop(int s) { phraseSlop = s; }

  /** Gets the default slop for phrases. */
  public int getPhraseSlop() { return phraseSlop; }

  /** Appends a clause for <code>q</code> to <code>clauses</code>, deriving the
   * required/prohibited flags from the conjunction and modifier that
   * introduced it.
   * @param clauses vector of BooleanClause collected so far; modified in place.
   * @param conj one of CONJ_NONE, CONJ_AND, CONJ_OR.
   * @param mods one of MOD_NONE, MOD_NOT, MOD_REQ.
   * @param q the clause's query; may be null, in which case nothing is added.
   */
  private void addClause(Vector clauses, int conj, int mods,
                         Query q) {
    boolean required, prohibited;

    // If this term is introduced by AND, make the preceding term required,
    // unless it's already prohibited.  There may be no preceding clause at
    // all: clauses whose query was null are never added (see below), so the
    // vector can still be empty when an AND arrives.
    if (conj == CONJ_AND && !clauses.isEmpty()) {
      BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
      if (!c.prohibited)
        c.required = true;
    }

    // We might have been passed a null query; the term might have been
    // filtered away by the analyzer.
    if (q == null)
      return;

    // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
    // introduced by NOT or -; make sure not to set both.
    prohibited = (mods == MOD_NOT);
    required = (mods == MOD_REQ);
    if (conj == CONJ_AND && !prohibited)
      required = true;
    clauses.addElement(new BooleanClause(q, required, prohibited));
  }

  /** Runs <code>queryText</code> through the analyzer and builds a query from
   * the resulting tokens.
   * @param field field the resulting terms belong to.
   * @param analyzer analyzer used to tokenize the text.
   * @param queryText raw term or phrase text taken from the query string.
   * @return null when the analyzer yields no tokens, a TermQuery for exactly
   *  one token, otherwise a PhraseQuery over all tokens with the parser's
   *  phrase slop.
   */
  private Query getFieldQuery(String field, Analyzer analyzer, String queryText) {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count

    TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
    Vector v = new Vector();
    // Fully qualified: JavaCC generates its own Token class in this package.
    org.apache.lucene.analysis.Token t;

    while (true) {
      try {
        t = source.next();
      }
      catch (IOException e) {
        // Best effort: a read failure is treated like end of stream.
        t = null;
      }
      if (t == null)
        break;
      v.addElement(t.termText());
    }
    if (v.size() == 0)
      return null;
    else if (v.size() == 1)
      return new TermQuery(new Term(field, (String) v.elementAt(0)));
    else {
      PhraseQuery q = new PhraseQuery();
      q.setSlop(phraseSlop);
      for (int i=0; i<v.size(); i++) {
        q.add(new Term(field, (String) v.elementAt(i)));
      }
      return q;
    }
  }

  /** Command-line check: parses <code>args[0]</code> against the default field
   * "field" with a SimpleAnalyzer and prints the resulting query.
   */
  public static void main(String[] args) throws Exception {
    QueryParser qp = new QueryParser("field",
                                     new org.apache.lucene.analysis.SimpleAnalyzer());
    Query q = qp.parse(args[0]);
    System.out.println(q.toString("field"));
  }

  // Values returned by the Conjunction() production.
  private static final int CONJ_NONE = 0;
  private static final int CONJ_AND = 1;
  private static final int CONJ_OR = 2;

  // Values returned by the Modifiers() production.
  private static final int MOD_NONE = 0;
  private static final int MOD_NOT = 10;
  private static final int MOD_REQ = 11;
}
|
||||
|
||||
PARSER_END(QueryParser)
|
||||
|
||||
/* ***************** */
|
||||
/* Token Definitions */
|
||||
/* ***************** */
|
||||
|
||||
// Private (#-prefixed) regular expressions: building blocks that may be
// referenced from other token definitions but are never returned as tokens
// themselves.  <*> declares them for every lexical state.
// Among the token and skip definitions of this grammar, _NUM_CHAR,
// _IDENTIFIER_CHAR and _WHITESPACE are the ones referenced directly; the
// character classes are ASCII-only.
<*> TOKEN : {
|
||||
<#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
|
||||
| <#_NUM_CHAR: ["0"-"9"] >
|
||||
| <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
|
||||
| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
|
||||
| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
|
||||
| <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
|
||||
| <#_WHITESPACE: ( " " | "\t" ) >
|
||||
| <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
|
||||
| <#_RESTOFLINE: (~["\r", "\n"])* >
|
||||
}
|
||||
|
||||
// Tokens of the query language (DEFAULT lexical state).
//  - AND / OR / NOT each have a keyword and a symbolic spelling.
//  - QUOTED is a non-empty run of non-quote characters between double quotes;
//    there is no escape for an embedded quote.
//  - NUMBER requires digits on both sides of a decimal point.
//  - TERM starts with an identifier character and continues until one of the
//    listed delimiter characters; "*" and "?" are delimiters for TERM.
//  - WILDTERM is like TERM but allows "*" and "?" inside, and must both start
//    and end with an identifier character.
<DEFAULT> TOKEN : {
|
||||
<AND: ("AND" | "&&") >
|
||||
| <OR: ("OR" | "||") >
|
||||
| <NOT: ("NOT" | "!") >
|
||||
| <PLUS: "+" >
|
||||
| <MINUS: "-" >
|
||||
| <LPAREN: "(" >
|
||||
| <RPAREN: ")" >
|
||||
| <COLON: ":" >
|
||||
| <CARAT: "^" >
|
||||
| <STAR: "*" >
|
||||
| <QUOTED: "\"" (~["\""])+ "\"">
|
||||
| <NUMBER: (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
|
||||
| <TERM: <_IDENTIFIER_CHAR>
|
||||
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~" ] )* >
|
||||
| <FUZZY: "~" >
|
||||
| <WILDTERM: <_IDENTIFIER_CHAR>
|
||||
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~" ] )* <_IDENTIFIER_CHAR>>
|
||||
}
|
||||
|
||||
// Spaces and tabs (_WHITESPACE) between tokens are discarded by the lexer.
<DEFAULT> SKIP : {
|
||||
<<_WHITESPACE>>
|
||||
}
|
||||
|
||||
// * Query ::= ( Clause )*
|
||||
// * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
|
||||
|
||||
// Optional conjunction in front of a clause.
// Returns CONJ_AND for <AND>, CONJ_OR for <OR>, and CONJ_NONE when neither
// token is present.
int Conjunction() : {
  int conjunction = CONJ_NONE;
}
{
  [
      <AND> { conjunction = CONJ_AND; }
    | <OR>  { conjunction = CONJ_OR; }
  ]
  { return conjunction; }
}
|
||||
|
||||
// Optional modifier in front of a clause.
// Returns MOD_REQ for <PLUS>, MOD_NOT for <MINUS> or <NOT>, and MOD_NONE when
// no modifier token is present.
int Modifiers() : {
  int modifier = MOD_NONE;
}
{
  [
      <PLUS>  { modifier = MOD_REQ; }
    | <MINUS> { modifier = MOD_NOT; }
    | <NOT>   { modifier = MOD_NOT; }
  ]
  { return modifier; }
}
|
||||
|
||||
// Top-level production: one clause followed by any number of further clauses,
// each optionally introduced by a conjunction and a modifier.
// Every clause is handed to addClause(), which decides the required /
// prohibited flags; the collected clauses are returned as one BooleanQuery.
// "field" is the default field passed down to each clause.
Query Query(String field) :
{
  Vector collected = new Vector();
  Query clauseQuery;
  int conj, mods;
}
{
  mods=Modifiers() clauseQuery=Clause(field)
  { addClause(collected, CONJ_NONE, mods, clauseQuery); }
  (
    conj=Conjunction() mods=Modifiers() clauseQuery=Clause(field)
    { addClause(collected, conj, mods, clauseQuery); }
  )*
  {
    BooleanQuery result = new BooleanQuery();
    int count = 0;
    while (count < collected.size()) {
      result.add((BooleanClause) collected.elementAt(count));
      count++;
    }
    return result;
  }
}
|
||||
|
||||
// A single clause: an optional "name:" field prefix followed by either a term
// or a parenthesized sub-query.  When the prefix is present it replaces the
// default field for everything inside this clause.
// Two tokens of lookahead are needed to tell "name:" apart from a plain TERM.
Query Clause(String field) : {
  Query result;
  Token fieldName=null;
}
{
  [
    LOOKAHEAD(2)
    fieldName=<TERM> <COLON> { field = fieldName.image; }
  ]

  (
      result=Term(field)
    | <LPAREN> result=Query(field) <RPAREN>
  )
  { return result; }
}
|
||||
|
||||
|
||||
// A term of a clause, in one of two forms:
//  - TERM, WILDTERM or NUMBER, optionally followed by "*" (prefix) or "~"
//    (fuzzy) and by "^" NUMBER (boost).  WILDTERM yields a WildcardQuery,
//    a trailing "*" a PrefixQuery, a trailing "~" a FuzzyQuery (checked in
//    that order); anything else goes through getFieldQuery().
//  - QUOTED text, whose content without the surrounding quotes goes through
//    getFieldQuery().
// The result may be null when getFieldQuery() finds no tokens.  A boost is
// applied only to TermQuery, PhraseQuery and MultiTermQuery results.
Query Term(String field) : {
  Token termToken, boostToken=null;
  boolean isPrefix = false;
  boolean isWildcard = false;
  boolean isFuzzy = false;
  Query result;
}
{
  (
    (
        termToken=<TERM>
      | termToken=<WILDTERM> { isWildcard=true; }
      | termToken=<NUMBER>
    )
    [ <STAR> { isPrefix=true; } | <FUZZY> { isFuzzy=true; } ]
    [ <CARAT> boostToken=<NUMBER> ]
    {
      if (!isWildcard && !isPrefix && !isFuzzy) {
        // plain term: let the analyzer decide what it becomes
        result = getFieldQuery(field, analyzer, termToken.image);
      } else {
        // pattern queries use the raw token text, bypassing the analyzer
        Term rawTerm = new Term(field, termToken.image);
        if (isWildcard)
          result = new WildcardQuery(rawTerm);
        else if (isPrefix)
          result = new PrefixQuery(rawTerm);
        else
          result = new FuzzyQuery(rawTerm);
      }
    }
  | termToken=<QUOTED>
    {
      String quoted = termToken.image;
      // strip the surrounding double quotes
      result = getFieldQuery(field, analyzer,
                             quoted.substring(1, quoted.length()-1));
    }
  )
  {
    if (boostToken != null) {
      float boostValue = (float) 1.0;
      try {
        boostValue = Float.valueOf(boostToken.image).floatValue();
      }
      catch (Exception ignored) { }

      if (result instanceof TermQuery)
        ((TermQuery) result).setBoost(boostValue);
      else if (result instanceof PhraseQuery)
        ((PhraseQuery) result).setBoost(boostValue);
      else if (result instanceof MultiTermQuery)
        ((MultiTermQuery) result).setBoost(boostValue);
    }
    return result;
  }
}
|
||||
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="Author" content="Doug Cutting">
|
||||
</head>
|
||||
<body>
|
||||
A simple query parser implemented with JavaCC.
|
||||
<p>Note that JavaCC defines lots of public classes, methods and fields
|
||||
that do not need to be public. These clutter the documentation.
|
||||
Sorry.
|
||||
<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>org.apache.lucene.analysis.Token</tt>
|
||||
must always be fully qualified in source code in this package.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,58 @@
|
|||
# rules to enable the running of "make jar" and the like from any dir..
|
||||
|
||||
# directories containing java source code
|
||||
DIRS = store util document analysis analysis/standard index search queryParser
|
||||
PACKAGES = $(subst /,.,$(patsubst %,com.lucene.%,$(DIRS)))
|
||||
|
||||
ifeq ($(JAVALINK),)
|
||||
JAVALINK = http://java.sun.com/products/jdk/1.3/docs/api/
|
||||
endif
|
||||
|
||||
# OLDJAVA does not have a -link option
|
||||
ifeq ($(OLDJAVA),)
|
||||
JLINK_OPT = -link $(JAVALINK)
|
||||
JAR_CMD = $(JAR) -cvfm lucene.jar com/lucene/manifest
|
||||
else
|
||||
JAR_CMD = $(JAR) -cvf lucene.jar
|
||||
endif
|
||||
|
||||
.PHONY: jar doc demo release
|
||||
|
||||
jar: all_classes
|
||||
cd $(ROOT) && $(JAR_CMD) \
|
||||
`ls com/lucene/*/*.class` `ls com/lucene/*/*/*.class`
|
||||
|
||||
doc: all_classes
|
||||
if [ -d $(ROOT)/doc/api ]; then rm -rf $(ROOT)/doc/api ;fi
|
||||
mkdir $(ROOT)/doc/api
|
||||
$(JAVADOC) -classpath '$(CLASSPATH)' -author -version \
|
||||
-d $(ROOT)/doc/api $(JLINK_OPT) $(PACKAGES)
|
||||
|
||||
demo: all_classes
|
||||
$(MAKE) -C $(ROOT)/demo/HTMLParser -w
|
||||
$(MAKE) -C $(ROOT)/demo -w CLASSPATH=..
|
||||
|
||||
release: jar demo doc
|
||||
cd $(ROOT) && tar cvf lucene.tar lucene.jar doc/*.html doc/api \
|
||||
demo/*.java demo/*.class demo/*.html demo/*.jhtml \
|
||||
demo/HTMLParser/*.class demo/HTMLParser/*.jj \
|
||||
demo/HTMLParser/*.java
|
||||
|
||||
# make all the Lucene classes
|
||||
all_classes : TARGET = classes
|
||||
all_classes : $(DIRS)
|
||||
|
||||
.PHONY: $(DIRS)
|
||||
$(DIRS):
|
||||
$(MAKE) -C $(ROOT)/com/lucene/$@ -w $(TARGET)
|
||||
|
||||
# Removes all generated files from src directories.
|
||||
src_clean: TARGET = clean
|
||||
src_clean: $(DIRS) clean
|
||||
|
||||
# real_clean: runs the per-directory "clean" (TARGET = clean is picked up by
# the $(DIRS) recipe), then the local "clean", then deletes the jar, the tar
# and the generated API docs under $(ROOT).
# NOTE(review): the two target-specific "DIRS += ..." assignments only take
# effect inside recipes; GNU make expands the prerequisite list
# "$(DIRS) clean" when the rule is read, using the global DIRS.  So demo and
# demo/HTMLParser do not appear to become prerequisites here -- confirm
# whether the demo directories are actually cleaned.
# Removes all generated files.
|
||||
real_clean: DIRS += demo
|
||||
real_clean: DIRS += demo/HTMLParser
|
||||
real_clean: TARGET = clean
|
||||
real_clean: $(DIRS) clean
|
||||
cd $(ROOT) && rm -rf lucene.jar lucene.tar doc/api
|
|
@ -0,0 +1,128 @@
|
|||
# GNU make rules for lucene
|
||||
|
||||
# determine whether we're on Win32 or Unix
|
||||
ifeq ($(findstring CYGWIN,$(shell uname)),CYGWIN)
|
||||
OS = win32
|
||||
else
|
||||
OS = unix
|
||||
endif
|
||||
|
||||
# DOS compatibility:
|
||||
# These should be used in variables that end up in CLASSPATH.
|
||||
ifeq ($(OS),win32)
|
||||
SLASH=\\
|
||||
COLON=;
|
||||
else
|
||||
SLASH=/
|
||||
COLON=:
|
||||
endif
|
||||
|
||||
# ROOT should be set to the root directory of the Lucene package
|
||||
# hierarchy. This is typically ../../.., as most packages are of the
|
||||
# form com.lucene.<package>.
|
||||
ifeq ($(ROOT),)
|
||||
ROOT = ..$(SLASH)..$(SLASH)..
|
||||
else
|
||||
ROOT := $(subst /,$(SLASH),$(ROOT))
|
||||
endif
|
||||
|
||||
#include all the relevant variables
|
||||
include $(subst $(SLASH),/,$(ROOT))/com/lucene/variables.mk
|
||||
|
||||
# directories containing java source code
|
||||
DIRS = store util document analysis analysis/standard index search queryParser
|
||||
PACKAGES = $(subst /,.,$(patsubst %,com.lucene.%,$(DIRS)))
|
||||
|
||||
ifeq ($(JDK_HOME),)
|
||||
ifneq ($(JAVA_HOME),)
|
||||
JDK_HOME=$(JAVA_HOME)
|
||||
else
|
||||
ifeq ($(OS),win32)
|
||||
JDK_HOME = C:/jdk1.3.1
|
||||
else
|
||||
JDK_HOME = /usr/local/java/jdk1.3.1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Location of JavaCC
|
||||
ifeq ($(JAVACC),)
|
||||
ifeq ($(OS),win32)
|
||||
JAVACC = C:/javacc2_0/bin/lib/JavaCC.zip
|
||||
else
|
||||
JAVACC = /usr/local/java/javacc2_0/bin/lib/JavaCC.zip
|
||||
endif
|
||||
endif
|
||||
|
||||
JAVADIR = $(subst \,/,$(JDK_HOME))
|
||||
|
||||
# The compiler executable.
|
||||
ifeq ($(JAVAC),)
|
||||
JAVAC = $(JAVADIR)/bin/javac
|
||||
endif
|
||||
|
||||
# The java executable
|
||||
JAVA = $(JAVADIR)/bin/java
|
||||
|
||||
# The jar executable
|
||||
JAR = $(JAVADIR)/bin/jar
|
||||
|
||||
# javadoc
|
||||
JAVADOC = $(JAVADIR)/bin/javadoc
|
||||
|
||||
# Options to pass to Java compiler
|
||||
ifeq ($(JFLAGS),)
|
||||
JFLAGS = -O
|
||||
endif
|
||||
|
||||
|
||||
# CLASSPATH
|
||||
# By default include the Lucene root, and Java's builtin classes
|
||||
ifeq ($(OLDJAVA),)
|
||||
export CLASSPATH=$(PREPENDCLASSPATH)$(COLON)$(ROOT)$(COLON)$(JDK_HOME)$(SLASH)jre$(SLASH)lib$(SLASH)rt.jar
|
||||
else
|
||||
export CLASSPATH=$(PREPENDCLASSPATH)$(COLON)$(ROOT)$(COLON)$(JDK_HOME)$(SLASH)lib$(SLASH)classes.zip
|
||||
endif
|
||||
|
||||
# JIKESPATH overrides the classpath variable for jikes, so we need to set it
|
||||
# here to avoid problems with a jikes user
|
||||
export JIKESPATH=$(CLASSPATH)
|
||||
|
||||
## Rules
|
||||
|
||||
# Use JAVAC to compile .java files into .class files
|
||||
%.class : %.java
|
||||
$(JAVAC) $(JFLAGS) $<
|
||||
|
||||
# Compile .jj files to .java with JavaCC
|
||||
%.java : %.jj
|
||||
$(JAVA) -classpath '$(CLASSPATH)$(COLON)$(JAVACC)' COM.sun.labs.javacc.Main $<
|
||||
|
||||
# Add JavaCC generated files to 'classes' and 'clean' targets.
|
||||
JJFILES = $(wildcard *.jj)
|
||||
ifneq ($(JJFILES),)
|
||||
CLASSES += $(patsubst %.jj,%.class, $(JJFILES))
|
||||
DIRT += $(patsubst %.jj,%.java, $(JJFILES))
|
||||
DIRT += $(patsubst %.jj,%Constants.java, $(JJFILES))
|
||||
DIRT += $(patsubst %.jj,%TokenManager.java, $(JJFILES))
|
||||
DIRT += Token.java TokenMgrError.java TokenManager.java \
|
||||
CharStream.java ASCII_CharStream.java ParseException.java
|
||||
endif
|
||||
|
||||
|
||||
# Don't delete parser's .java file -- it's needed by javadoc.
|
||||
.PRECIOUS: $(patsubst %.jj,%.java, $(JJFILES))
|
||||
|
||||
|
||||
# Assume all .java files should have a .class file.
|
||||
CLASSES += $(patsubst %.java,%.class,$(wildcard *.java))
|
||||
|
||||
# default rule
|
||||
classes : $(CLASSES)
|
||||
|
||||
# Removes all generated files from the connected src directory.
|
||||
clean:
|
||||
rm -f *.class $(DIRT)
|
||||
|
||||
# include all the rules for the root directory..
|
||||
include $(subst $(SLASH),/,$(ROOT))/com/lucene/rootrules.mk
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** A clause in a BooleanQuery: a sub-query together with its
required/prohibited flags.  All three fields are public and mutable. */
|
||||
public final class BooleanClause {
|
||||
/** The query whose matching documents are combined by the boolean query. */
|
||||
public Query query;
|
||||
/** If true, documents which <i>do not</i>
|
||||
match this sub-query will <i>not</i> match the boolean query. */
|
||||
public boolean required = false;
|
||||
/** If true, documents which <i>do</i>
|
||||
match this sub-query will <i>not</i> match the boolean query. */
|
||||
public boolean prohibited = false;
|
||||
|
||||
/** Constructs a BooleanClause with query <code>q</code>, required
|
||||
<code>r</code> and prohibited <code>p</code>. */
|
||||
public BooleanClause(Query q, boolean r, boolean p) {
|
||||
query = q;
|
||||
required = r;
|
||||
prohibited = p;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** A Query that matches documents matching boolean combinations of other
|
||||
queries, typically {@link TermQuery}s or {@link PhraseQuery}s.
|
||||
*/
|
||||
final public class BooleanQuery extends Query {
|
||||
private Vector clauses = new Vector();
|
||||
|
||||
/** Constructs an empty boolean query. */
|
||||
public BooleanQuery() {}
|
||||
|
||||
/** Adds a clause to a boolean query. Clauses may be:
|
||||
<ul>
|
||||
<li><code>required</code> which means that documents which <i>do not</i>
|
||||
match this sub-query will <it>not</it> match the boolean query;
|
||||
<li><code>prohibited</code> which means that documents which <i>do</i>
|
||||
match this sub-query will <it>not</it> match the boolean query; or
|
||||
<li>neither, in which case matched documents are neither prohibited from
|
||||
nor required to match the sub-query.
|
||||
</ul>
|
||||
It is an error to specify a clause as both <code>required</code> and
|
||||
<code>prohibited</code>.
|
||||
*/
|
||||
public final void add(Query query, boolean required, boolean prohibited) {
|
||||
clauses.addElement(new BooleanClause(query, required, prohibited));
|
||||
}
|
||||
|
||||
/** Adds a clause to a boolean query. */
|
||||
public final void add(BooleanClause clause) {
|
||||
clauses.addElement(clause);
|
||||
}
|
||||
|
||||
void prepare(IndexReader reader) {
|
||||
for (int i = 0 ; i < clauses.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
c.query.prepare(reader);
|
||||
}
|
||||
}
|
||||
|
||||
final float sumOfSquaredWeights(Searcher searcher)
|
||||
throws IOException {
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int i = 0 ; i < clauses.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
if (!c.prohibited)
|
||||
sum += c.query.sumOfSquaredWeights(searcher); // sum sub-query weights
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
final void normalize(float norm) {
|
||||
for (int i = 0 ; i < clauses.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
if (!c.prohibited)
|
||||
c.query.normalize(norm);
|
||||
}
|
||||
}
|
||||
|
||||
final Scorer scorer(IndexReader reader)
|
||||
throws IOException {
|
||||
|
||||
if (clauses.size() == 1) { // optimize 1-term queries
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(0);
|
||||
if (!c.prohibited) // just return term scorer
|
||||
return c.query.scorer(reader);
|
||||
}
|
||||
|
||||
BooleanScorer result = new BooleanScorer();
|
||||
|
||||
int theMask = 1, thisMask;
|
||||
for (int i = 0 ; i < clauses.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
if (c.required || c.prohibited) {
|
||||
thisMask = theMask;
|
||||
theMask = theMask << 1;
|
||||
} else
|
||||
thisMask = 0;
|
||||
|
||||
Scorer subScorer = c.query.scorer(reader);
|
||||
if (subScorer != null)
|
||||
result.add(subScorer, c.required, c.prohibited);
|
||||
else if (c.required)
|
||||
return null;
|
||||
}
|
||||
if (theMask == 0)
|
||||
throw new IndexOutOfBoundsException
|
||||
("More than 32 required/prohibited clauses in query.");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
public String toString(String field) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
for (int i = 0 ; i < clauses.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
if (c.prohibited)
|
||||
buffer.append("-");
|
||||
else if (c.required)
|
||||
buffer.append("+");
|
||||
|
||||
Query subQuery = c.query;
|
||||
if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
|
||||
BooleanQuery bq = (BooleanQuery)subQuery;
|
||||
buffer.append("(");
|
||||
buffer.append(c.query.toString(field));
|
||||
buffer.append(")");
|
||||
} else
|
||||
buffer.append(c.query.toString(field));
|
||||
|
||||
if (i != clauses.size()-1)
|
||||
buffer.append(" ");
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,204 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
final class BooleanScorer extends Scorer {
|
||||
private int currentDoc;
|
||||
|
||||
private SubScorer scorers = null;
|
||||
private BucketTable bucketTable = new BucketTable(this);
|
||||
|
||||
private int maxCoord = 1;
|
||||
private float[] coordFactors = null;
|
||||
|
||||
private int requiredMask = 0;
|
||||
private int prohibitedMask = 0;
|
||||
private int nextMask = 1;
|
||||
|
||||
static final class SubScorer {
|
||||
public Scorer scorer;
|
||||
public boolean required = false;
|
||||
public boolean prohibited = false;
|
||||
public HitCollector collector;
|
||||
public SubScorer next;
|
||||
|
||||
public SubScorer(Scorer scorer, boolean required, boolean prohibited,
|
||||
HitCollector collector, SubScorer next) {
|
||||
this.scorer = scorer;
|
||||
this.required = required;
|
||||
this.prohibited = prohibited;
|
||||
this.collector = collector;
|
||||
this.next = next;
|
||||
}
|
||||
}
|
||||
|
||||
final void add(Scorer scorer, boolean required, boolean prohibited) {
|
||||
int mask = 0;
|
||||
if (required || prohibited) {
|
||||
if (nextMask == 0)
|
||||
throw new IndexOutOfBoundsException
|
||||
("More than 32 required/prohibited clauses in query.");
|
||||
mask = nextMask;
|
||||
nextMask = nextMask << 1;
|
||||
} else
|
||||
mask = 0;
|
||||
|
||||
if (!prohibited)
|
||||
maxCoord++;
|
||||
|
||||
if (prohibited)
|
||||
prohibitedMask |= mask; // update prohibited mask
|
||||
else if (required)
|
||||
requiredMask |= mask; // update required mask
|
||||
|
||||
scorers = new SubScorer(scorer, required, prohibited,
|
||||
bucketTable.newCollector(mask), scorers);
|
||||
}
|
||||
|
||||
private final void computeCoordFactors() throws IOException {
|
||||
coordFactors = new float[maxCoord];
|
||||
for (int i = 0; i < maxCoord; i++)
|
||||
coordFactors[i] = Similarity.coord(i, maxCoord);
|
||||
}
|
||||
|
||||
final void score(HitCollector results, int maxDoc) throws IOException {
|
||||
if (coordFactors == null)
|
||||
computeCoordFactors();
|
||||
|
||||
while (currentDoc < maxDoc) {
|
||||
currentDoc = Math.min(currentDoc+BucketTable.SIZE, maxDoc);
|
||||
for (SubScorer t = scorers; t != null; t = t.next)
|
||||
t.scorer.score(t.collector, currentDoc);
|
||||
bucketTable.collectHits(results);
|
||||
}
|
||||
}
|
||||
|
||||
static final class Bucket {
|
||||
int doc = -1; // tells if bucket is valid
|
||||
float score; // incremental score
|
||||
int bits; // used for bool constraints
|
||||
int coord; // count of terms in score
|
||||
Bucket next; // next valid bucket
|
||||
}
|
||||
|
||||
/** A simple hash table of document scores within a range. */
|
||||
static final class BucketTable {
|
||||
public static final int SIZE = 1 << 10;
|
||||
public static final int MASK = SIZE - 1;
|
||||
|
||||
final Bucket[] buckets = new Bucket[SIZE];
|
||||
Bucket first = null; // head of valid list
|
||||
|
||||
private BooleanScorer scorer;
|
||||
|
||||
public BucketTable(BooleanScorer scorer) {
|
||||
this.scorer = scorer;
|
||||
}
|
||||
|
||||
public final void collectHits(HitCollector results) {
|
||||
final int required = scorer.requiredMask;
|
||||
final int prohibited = scorer.prohibitedMask;
|
||||
final float[] coord = scorer.coordFactors;
|
||||
|
||||
for (Bucket bucket = first; bucket!=null; bucket = bucket.next) {
|
||||
if ((bucket.bits & prohibited) == 0 && // check prohibited
|
||||
(bucket.bits & required) == required){// check required
|
||||
results.collect(bucket.doc, // add to results
|
||||
bucket.score * coord[bucket.coord]);
|
||||
}
|
||||
}
|
||||
first = null; // reset for next round
|
||||
}
|
||||
|
||||
public final int size() { return SIZE; }
|
||||
|
||||
public HitCollector newCollector(int mask) {
|
||||
return new Collector(mask, this);
|
||||
}
|
||||
}
|
||||
|
||||
static final class Collector extends HitCollector {
|
||||
private BucketTable bucketTable;
|
||||
private int mask;
|
||||
public Collector(int mask, BucketTable bucketTable) {
|
||||
this.mask = mask;
|
||||
this.bucketTable = bucketTable;
|
||||
}
|
||||
public final void collect(final int doc, final float score) {
|
||||
final BucketTable table = bucketTable;
|
||||
final int i = doc & BucketTable.MASK;
|
||||
Bucket bucket = table.buckets[i];
|
||||
if (bucket == null)
|
||||
table.buckets[i] = bucket = new Bucket();
|
||||
|
||||
if (bucket.doc != doc) { // invalid bucket
|
||||
bucket.doc = doc; // set doc
|
||||
bucket.score = score; // initialize score
|
||||
bucket.bits = mask; // initialize mask
|
||||
bucket.coord = 1; // initialize coord
|
||||
|
||||
bucket.next = table.first; // push onto valid list
|
||||
table.first = bucket;
|
||||
} else { // valid bucket
|
||||
bucket.score += score; // increment score
|
||||
bucket.bits |= mask; // add bits in mask
|
||||
bucket.coord++; // increment coord
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Date;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** A Filter that restricts search results to a range of time.
|
||||
|
||||
<p>For this to work, documents must have been indexed with a {@link
|
||||
DateField}. */
|
||||
|
||||
public final class DateFilter extends Filter {
|
||||
// Name of the field whose terms bits() enumerates.
String field;

// Range bounds as DateField strings; initialised from DateField's
// MIN_DATE_STRING() / MAX_DATE_STRING() until a constructor or factory
// overrides them.
String start = DateField.MIN_DATE_STRING();
String end = DateField.MAX_DATE_STRING();

/** Creates a filter on field <code>f</code> that keeps the initial bounds. */
private DateFilter(String f) {
  this.field = f;
}
|
||||
|
||||
/**
 * Constructs a filter for field <code>f</code> matching dates between
 * <code>from</code> and <code>to</code>.
 */
public DateFilter(String f, Date from, Date to) {
  this.field = f;
  this.start = DateField.dateToString(from);
  this.end = DateField.dateToString(to);
}
|
||||
/**
 * Constructs a filter for field <code>f</code> matching times between
 * <code>from</code> and <code>to</code>.
 */
public DateFilter(String f, long from, long to) {
  this.field = f;
  this.start = DateField.timeToString(from);
  this.end = DateField.timeToString(to);
}
|
||||
|
||||
/** Constructs a filter for field <code>f</code> matching dates before
|
||||
<code>date</code>. */
|
||||
public static DateFilter Before(String field, Date date) {
|
||||
DateFilter result = new DateFilter(field);
|
||||
result.end = DateField.dateToString(date);
|
||||
return result;
|
||||
}
|
||||
/** Constructs a filter for field <code>f</code> matching times before
|
||||
<code>time</code>. */
|
||||
public static DateFilter Before(String field, long time) {
|
||||
DateFilter result = new DateFilter(field);
|
||||
result.end = DateField.timeToString(time);
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Constructs a filter for field <code>f</code> matching dates before
|
||||
<code>date</code>. */
|
||||
public static DateFilter After(String field, Date date) {
|
||||
DateFilter result = new DateFilter(field);
|
||||
result.start = DateField.dateToString(date);
|
||||
return result;
|
||||
}
|
||||
/** Constructs a filter for field <code>f</code> matching times before
|
||||
<code>time</code>. */
|
||||
public static DateFilter After(String field, long time) {
|
||||
DateFilter result = new DateFilter(field);
|
||||
result.start = DateField.timeToString(time);
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Returns a BitSet with true for documents which should be permitted in
|
||||
search results, and false for those that should not. */
|
||||
final public BitSet bits(IndexReader reader) throws IOException {
|
||||
BitSet bits = new BitSet(reader.maxDoc());
|
||||
TermEnum enum = reader.terms(new Term(field, start));
|
||||
try {
|
||||
Term stop = new Term(field, end);
|
||||
while (enum.term().compareTo(stop) <= 0) {
|
||||
TermDocs termDocs = reader.termDocs(enum.term());
|
||||
try {
|
||||
while (termDocs.next())
|
||||
bits.set(termDocs.doc());
|
||||
} finally {
|
||||
termDocs.close();
|
||||
}
|
||||
if (!enum.next()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
enum.close();
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
public final String toString() {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
buffer.append(field);
|
||||
buffer.append(":");
|
||||
buffer.append(DateField.stringToDate(start).toString());
|
||||
buffer.append("-");
|
||||
buffer.append(DateField.stringToDate(end).toString());
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
import org.apache.lucene.util.*;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
final class ExactPhraseScorer extends PhraseScorer {
|
||||
|
||||
ExactPhraseScorer(TermPositions[] tps, byte[] n, float w)
|
||||
throws IOException {
|
||||
super(tps, n, w);
|
||||
}
|
||||
|
||||
protected final float phraseFreq() throws IOException {
|
||||
// sort list with pq
|
||||
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
|
||||
pp.firstPosition();
|
||||
pq.put(pp); // build pq from list
|
||||
}
|
||||
pqToList(); // rebuild list from pq
|
||||
|
||||
int freq = 0;
|
||||
do { // find position w/ all terms
|
||||
while (first.position < last.position) { // scan forward in first
|
||||
do {
|
||||
if (!first.nextPosition())
|
||||
return (float)freq;
|
||||
} while (first.position < last.position);
|
||||
firstToLast();
|
||||
}
|
||||
freq++; // all equal: a match
|
||||
} while (last.nextPosition());
|
||||
|
||||
return (float)freq;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** Abstract base class providing a mechanism to restrict searches to a subset
|
||||
of an index. */
|
||||
abstract public class Filter {
|
||||
/** Returns a BitSet with true for documents which should be permitted in
|
||||
search results, and false for those that should not. */
|
||||
abstract public BitSet bits(IndexReader reader) throws IOException;
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
|
||||
/** Abstract class for enumerating a subset of all terms.
|
||||
|
||||
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
|
||||
the enumeration is greater than all that precede it. */
|
||||
public abstract class FilteredTermEnum extends TermEnum {
|
||||
// Term most recently accepted by termCompare(); null when there is none.
private Term currentTerm;
// Enumeration being filtered; null until setEnum() runs and again after close().
private TermEnum actualEnum;

/** Does nothing with its arguments; subclasses supply the enumeration through setEnum(). */
public FilteredTermEnum(IndexReader reader, Term term) throws IOException {
}
|
||||
|
||||
/** Equality compare on the term */
|
||||
protected abstract boolean termCompare(Term term);
|
||||
|
||||
/** Equality measure on the term */
|
||||
protected abstract float difference();
|
||||
|
||||
/** Indicates the end of the enumeration has been reached */
|
||||
protected abstract boolean endEnum();
|
||||
|
||||
protected void setEnum(TermEnum actualEnum) throws IOException {
|
||||
this.actualEnum = actualEnum;
|
||||
// Find the first term that matches
|
||||
Term term = actualEnum.term();
|
||||
if (termCompare(term))
|
||||
currentTerm = term;
|
||||
else next();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the docFreq of the current Term in the enumeration.
|
||||
* Initially invalid, valid after next() called for the first time.
|
||||
*/
|
||||
public int docFreq() {
|
||||
if (actualEnum == null) return -1;
|
||||
return actualEnum.docFreq();
|
||||
}
|
||||
|
||||
/** Increments the enumeration to the next element. True if one exists. */
|
||||
public boolean next() throws IOException {
|
||||
if (actualEnum == null) return false; // the actual enumerator is not initialized!
|
||||
currentTerm = null;
|
||||
while (currentTerm == null) {
|
||||
if (endEnum()) return false;
|
||||
if (actualEnum.next()) {
|
||||
Term term = actualEnum.term();
|
||||
if (termCompare(term)) {
|
||||
currentTerm = term;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else return false;
|
||||
}
|
||||
currentTerm = null;
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns the current Term in the enumeration.
|
||||
* Initially invalid, valid after next() called for the first time. */
|
||||
public Term term() {
|
||||
return currentTerm;
|
||||
}
|
||||
|
||||
/** Closes the enumeration to further activity, freeing resources. */
|
||||
public void close() throws IOException {
|
||||
actualEnum.close();
|
||||
currentTerm = null;
|
||||
actualEnum = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import java.io.IOException;
|
||||
|
||||
/** Implements the fuzzy search query */
|
||||
final public class FuzzyQuery extends MultiTermQuery {
|
||||
private Term fuzzyTerm;
|
||||
|
||||
public FuzzyQuery(Term term) {
|
||||
super(term);
|
||||
fuzzyTerm = term;
|
||||
}
|
||||
|
||||
final void prepare(IndexReader reader) {
|
||||
try {
|
||||
setEnum(new FuzzyTermEnum(reader, fuzzyTerm));
|
||||
} catch (IOException e) {}
|
||||
}
|
||||
|
||||
public String toString(String field) {
|
||||
return super.toString(field) + '~';
|
||||
}
|
||||
}
|
|
@ -0,0 +1,175 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
|
||||
/** Subclass of FilteredTermEnum for enumerating all terms that are similar to the specified filter term.
|
||||
|
||||
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
|
||||
the enumeration is greater than all that precede it. */
|
||||
final public class FuzzyTermEnum extends FilteredTermEnum {
|
||||
// Similarity computed by the most recent termCompare() call.
double distance;
// Not referenced anywhere in the code visible in this class.
boolean fieldMatch;
// Set by termCompare() once a term from another field is seen.
boolean endEnum;

Term searchTerm;
String field = "";
String text = "";
int textlen;

/**
 * Enumerates the terms of <code>term</code>'s field, starting from the empty
 * text, and keeps those that termCompare() accepts.
 */
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
  super(reader, term);
  searchTerm = term;
  field = term.field();
  text = term.text();
  textlen = text.length();
  // the fields above must be set first: setEnum() already calls termCompare()
  final Term firstOfField = new Term(term.field(), "");
  setEnum(reader.terms(firstOfField));
}
|
||||
|
||||
/**
|
||||
The termCompare method in FuzzyTermEnum uses Levenshtein distance to
|
||||
calculate the distance between the given term and the comparing term.
|
||||
*/
|
||||
final protected boolean termCompare(Term term) {
|
||||
if (field == term.field()) {
|
||||
String target = term.text();
|
||||
int targetlen = target.length();
|
||||
int dist = editDistance(text, target, textlen, targetlen);
|
||||
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
|
||||
return (distance > FUZZY_THRESHOLD);
|
||||
}
|
||||
endEnum = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
final protected float difference() {
|
||||
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
|
||||
}
|
||||
|
||||
final public boolean endEnum() {
|
||||
return endEnum;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* Compute Levenshtein distance
|
||||
******************************/
|
||||
|
||||
public static final double FUZZY_THRESHOLD = 0.5;
|
||||
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
|
||||
|
||||
/**
|
||||
Finds and returns the smallest of three integers
|
||||
*/
|
||||
private final static int min(int a, int b, int c) {
|
||||
int t = (a < b) ? a : b;
|
||||
return (t < c) ? t : c;
|
||||
}
|
||||
|
||||
/**
|
||||
* This static array saves us from the time required to create a new array
|
||||
* everytime editDistance is called.
|
||||
*/
|
||||
private int e[][] = new int[0][0];
|
||||
|
||||
/**
|
||||
Levenshtein distance also known as edit distance is a measure of similiarity
|
||||
between two strings where the distance is measured as the number of character
|
||||
deletions, insertions or substitutions required to transform one string to
|
||||
the other string.
|
||||
<p>This method takes in four parameters; two strings and their respective
|
||||
lengths to compute the Levenshtein distance between the two strings.
|
||||
The result is returned as an integer.
|
||||
*/
|
||||
private final int editDistance(String s, String t, int n, int m) {
|
||||
if (e.length <= n || e[0].length <= m) {
|
||||
e = new int[Math.max(e.length, n+1)][Math.max(e.length, m+1)];
|
||||
}
|
||||
int d[][] = e; // matrix
|
||||
int i; // iterates through s
|
||||
int j; // iterates through t
|
||||
char s_i; // ith character of s
|
||||
|
||||
if (n == 0) return m;
|
||||
if (m == 0) return n;
|
||||
|
||||
// init matrix d
|
||||
for (i = 0; i <= n; i++) d[i][0] = i;
|
||||
for (j = 0; j <= m; j++) d[0][j] = j;
|
||||
|
||||
// start computing edit distance
|
||||
for (i = 1; i <= n; i++) {
|
||||
s_i = s.charAt(i - 1);
|
||||
for (j = 1; j <= m; j++) {
|
||||
if (s_i != t.charAt(j-1))
|
||||
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
|
||||
else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
|
||||
}
|
||||
}
|
||||
|
||||
// we got the result!
|
||||
return d[n][m];
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
searchTerm = null;
|
||||
field = null;
|
||||
text = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** Lower-level search API.
|
||||
* @see IndexSearcher#search(Query,HitCollector)
|
||||
*/
|
||||
public abstract class HitCollector {
|
||||
/** Called once for every non-zero scoring document, with the document number
|
||||
* and its score.
|
||||
*
|
||||
* <P>If, for example, an application wished to collect all of the hits for a
|
||||
* query in a BitSet, then it might:<pre>
|
||||
* Searcher = new IndexSearcher(indexReader);
|
||||
* final BitSet bits = new BitSet(indexReader.maxDoc());
|
||||
* searcher.search(query, new HitCollector() {
|
||||
* public void collect(int doc, float score) {
|
||||
* bits.set(doc);
|
||||
* }
|
||||
* });
|
||||
* </pre>
|
||||
*/
|
||||
public abstract void collect(int doc, float score);
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
final class HitQueue extends PriorityQueue {
|
||||
HitQueue(int size) {
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
protected final boolean lessThan(Object a, Object b) {
|
||||
ScoreDoc hitA = (ScoreDoc)a;
|
||||
ScoreDoc hitB = (ScoreDoc)b;
|
||||
if (hitA.score == hitB.score)
|
||||
return hitA.doc > hitB.doc;
|
||||
else
|
||||
return hitA.score < hitB.score;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
import java.util.BitSet;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** A ranked list of documents, used to hold search results. */
|
||||
public final class Hits {
|
||||
private Query query;
|
||||
private Searcher searcher;
|
||||
private Filter filter = null;
|
||||
|
||||
private int length; // the total number of hits
|
||||
private Vector hitDocs = new Vector(); // cache of hits retrieved
|
||||
|
||||
private HitDoc first; // head of LRU cache
|
||||
private HitDoc last; // tail of LRU cache
|
||||
private int numDocs = 0; // number cached
|
||||
private int maxDocs = 200; // max to cache
|
||||
|
||||
Hits(Searcher s, Query q, Filter f) throws IOException {
|
||||
query = q;
|
||||
searcher = s;
|
||||
filter = f;
|
||||
getMoreDocs(50); // retrieve 100 initially
|
||||
}
|
||||
|
||||
// Tries to add new documents to hitDocs.
|
||||
// Ensures that the hit numbered <code>min</code> has been retrieved.
|
||||
private final void getMoreDocs(int min) throws IOException {
|
||||
if (hitDocs.size() > min)
|
||||
min = hitDocs.size();
|
||||
|
||||
int n = min * 2; // double # retrieved
|
||||
TopDocs topDocs = searcher.search(query, filter, n);
|
||||
length = topDocs.totalHits;
|
||||
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
|
||||
|
||||
float scoreNorm = 1.0f;
|
||||
if (length > 0 && scoreDocs[0].score > 1.0f)
|
||||
scoreNorm = 1.0f / scoreDocs[0].score;
|
||||
|
||||
int end = scoreDocs.length < length ? scoreDocs.length : length;
|
||||
for (int i = hitDocs.size(); i < end; i++)
|
||||
hitDocs.addElement(new HitDoc(scoreDocs[i].score*scoreNorm,
|
||||
scoreDocs[i].doc));
|
||||
}
|
||||
|
||||
/** Returns the total number of hits available in this set. */
|
||||
public final int length() {
|
||||
return length;
|
||||
}
|
||||
|
||||
/** Returns the nth document in this set.
|
||||
<p>Documents are cached, so that repeated requests for the same element may
|
||||
return the same Document object. */
|
||||
public final Document doc(int n) throws IOException {
|
||||
HitDoc hitDoc = hitDoc(n);
|
||||
|
||||
// Update LRU cache of documents
|
||||
remove(hitDoc); // remove from list, if there
|
||||
addToFront(hitDoc); // add to front of list
|
||||
if (numDocs > maxDocs) { // if cache is full
|
||||
HitDoc oldLast = last;
|
||||
remove(last); // flush last
|
||||
oldLast.doc = null; // let doc get gc'd
|
||||
}
|
||||
|
||||
if (hitDoc.doc == null)
|
||||
hitDoc.doc = searcher.doc(hitDoc.id); // cache miss: read document
|
||||
|
||||
return hitDoc.doc;
|
||||
}
|
||||
|
||||
/** Returns the score for the nth document in this set. */
|
||||
public final float score(int n) throws IOException {
|
||||
return hitDoc(n).score;
|
||||
}
|
||||
|
||||
private final HitDoc hitDoc(int n) throws IOException {
|
||||
if (n >= length)
|
||||
throw new IndexOutOfBoundsException("Not a valid hit number: " + n);
|
||||
if (n >= hitDocs.size())
|
||||
getMoreDocs(n);
|
||||
|
||||
return (HitDoc)hitDocs.elementAt(n);
|
||||
}
|
||||
|
||||
private final void addToFront(HitDoc hitDoc) { // insert at front of cache
|
||||
if (first == null)
|
||||
last = hitDoc;
|
||||
else
|
||||
first.prev = hitDoc;
|
||||
|
||||
hitDoc.next = first;
|
||||
first = hitDoc;
|
||||
hitDoc.prev = null;
|
||||
|
||||
numDocs++;
|
||||
}
|
||||
|
||||
private final void remove(HitDoc hitDoc) { // remove from cache
|
||||
if (hitDoc.doc == null) // it's not in the list
|
||||
return; // abort
|
||||
|
||||
if (hitDoc.next == null)
|
||||
last = hitDoc.prev;
|
||||
else
|
||||
hitDoc.next.prev = hitDoc.prev;
|
||||
|
||||
if (hitDoc.prev == null)
|
||||
first = hitDoc.next;
|
||||
else
|
||||
hitDoc.prev.next = hitDoc.next;
|
||||
|
||||
numDocs--;
|
||||
}
|
||||
}
|
||||
|
||||
final class HitDoc {
|
||||
float score;
|
||||
int id;
|
||||
Document doc = null;
|
||||
|
||||
HitDoc next; // in doubly-linked cache
|
||||
HitDoc prev; // in doubly-linked cache
|
||||
|
||||
HitDoc(float s, int i) {
|
||||
score = s;
|
||||
id = i;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,178 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/** Implements search over a single IndexReader. */
|
||||
public final class IndexSearcher extends Searcher {
|
||||
IndexReader reader;
|
||||
|
||||
/** Creates a searcher searching the index in the named directory. */
|
||||
public IndexSearcher(String path) throws IOException {
|
||||
this(IndexReader.open(path));
|
||||
}
|
||||
|
||||
/** Creates a searcher searching the index in the provided directory. */
|
||||
public IndexSearcher(Directory directory) throws IOException {
|
||||
this(IndexReader.open(directory));
|
||||
}
|
||||
|
||||
/** Creates a searcher searching the provided index. */
|
||||
public IndexSearcher(IndexReader r) {
|
||||
reader = r;
|
||||
}
|
||||
|
||||
/** Frees resources associated with this Searcher. */
|
||||
public final void close() throws IOException {
|
||||
reader.close();
|
||||
}
|
||||
|
||||
final int docFreq(Term term) throws IOException {
|
||||
return reader.docFreq(term);
|
||||
}
|
||||
|
||||
final Document doc(int i) throws IOException {
|
||||
return reader.document(i);
|
||||
}
|
||||
|
||||
final int maxDoc() throws IOException {
|
||||
return reader.maxDoc();
|
||||
}
|
||||
|
||||
final TopDocs search(Query query, Filter filter, final int nDocs)
|
||||
throws IOException {
|
||||
Scorer scorer = Query.scorer(query, this, reader);
|
||||
if (scorer == null)
|
||||
return new TopDocs(0, new ScoreDoc[0]);
|
||||
|
||||
final BitSet bits = filter != null ? filter.bits(reader) : null;
|
||||
final HitQueue hq = new HitQueue(nDocs);
|
||||
final int[] totalHits = new int[1];
|
||||
scorer.score(new HitCollector() {
|
||||
private float minScore = 0.0f;
|
||||
public final void collect(int doc, float score) {
|
||||
if (score > 0.0f && // ignore zeroed buckets
|
||||
(bits==null || bits.get(doc))) { // skip docs not in bits
|
||||
totalHits[0]++;
|
||||
if (score >= minScore) {
|
||||
hq.put(new ScoreDoc(doc, score)); // update hit queue
|
||||
if (hq.size() > nDocs) { // if hit queue overfull
|
||||
hq.pop(); // remove lowest in hit queue
|
||||
minScore = ((ScoreDoc)hq.top()).score; // reset minScore
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}, reader.maxDoc());
|
||||
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
|
||||
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
|
||||
scoreDocs[i] = (ScoreDoc)hq.pop();
|
||||
|
||||
return new TopDocs(totalHits[0], scoreDocs);
|
||||
}
|
||||
|
||||
/** Lower-level search API.
|
||||
*
|
||||
* <p>{@link HitCollector#collect(int,float)} is called for every non-zero
|
||||
* scoring document.
|
||||
*
|
||||
* <p>Applications should only use this if they need <it>all</it> of the
|
||||
* matching documents. The high-level search API ({@link
|
||||
* Searcher#search(Query)}) is usually more efficient, as it skips
|
||||
* non-high-scoring hits. */
|
||||
public final void search(Query query, HitCollector results)
|
||||
throws IOException {
|
||||
search(query, null, results);
|
||||
}
|
||||
|
||||
/** Lower-level search API.
|
||||
*
|
||||
* <p>{@link HitCollector#collect(int,float)} is called for every non-zero
|
||||
* scoring document.
|
||||
*
|
||||
* <p>Applications should only use this if they need <it>all</it> of the
|
||||
* matching documents. The high-level search API ({@link
|
||||
* Searcher#search(Query)}) is usually more efficient, as it skips
|
||||
* non-high-scoring hits. */
|
||||
public final void search(Query query, Filter filter,
|
||||
final HitCollector results) throws IOException {
|
||||
HitCollector collector = results;
|
||||
if (filter != null) {
|
||||
final BitSet bits = filter.bits(reader);
|
||||
collector = new HitCollector() {
|
||||
public final void collect(int doc, float score) {
|
||||
if (bits.get(doc)) { // skip docs not in bits
|
||||
results.collect(doc, score);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Scorer scorer = Query.scorer(query, this, reader);
|
||||
if (scorer == null)
|
||||
return;
|
||||
scorer.score(collector, reader.maxDoc());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
# sub-directory makefile for lucene
|
||||
include ../rules.mk
|
|
@ -0,0 +1,152 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/** Implements search over a set of Searcher's. */
|
||||
public final class MultiSearcher extends Searcher {
|
||||
private Searcher[] searchers;
|
||||
private int[] starts;
|
||||
private int maxDoc = 0;
|
||||
|
||||
/** Creates a searcher which searches <i>searchers</i>. */
|
||||
public MultiSearcher(Searcher[] searchers) throws IOException {
|
||||
this.searchers = searchers;
|
||||
|
||||
starts = new int[searchers.length + 1]; // build starts array
|
||||
for (int i = 0; i < searchers.length; i++) {
|
||||
starts[i] = maxDoc;
|
||||
maxDoc += searchers[i].maxDoc(); // compute maxDocs
|
||||
}
|
||||
starts[searchers.length] = maxDoc;
|
||||
}
|
||||
|
||||
/** Frees resources associated with this Searcher. */
|
||||
public final void close() throws IOException {
|
||||
for (int i = 0; i < searchers.length; i++)
|
||||
searchers[i].close();
|
||||
}
|
||||
|
||||
final int docFreq(Term term) throws IOException {
|
||||
int docFreq = 0;
|
||||
for (int i = 0; i < searchers.length; i++)
|
||||
docFreq += searchers[i].docFreq(term);
|
||||
return docFreq;
|
||||
}
|
||||
|
||||
final Document doc(int n) throws IOException {
|
||||
int i = searcherIndex(n); // find searcher index
|
||||
return searchers[i].doc(n - starts[i]); // dispatch to searcher
|
||||
}
|
||||
|
||||
// replace w/ call to Arrays.binarySearch in Java 1.2
|
||||
private final int searcherIndex(int n) { // find searcher for doc n:
|
||||
int lo = 0; // search starts array
|
||||
int hi = searchers.length - 1; // for first element less
|
||||
// than n, return its index
|
||||
while (hi >= lo) {
|
||||
int mid = (lo + hi) >> 1;
|
||||
int midValue = starts[mid];
|
||||
if (n < midValue)
|
||||
hi = mid - 1;
|
||||
else if (n > midValue)
|
||||
lo = mid + 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
|
||||
final int maxDoc() throws IOException {
|
||||
return maxDoc;
|
||||
}
|
||||
|
||||
final TopDocs search(Query query, Filter filter, int nDocs)
|
||||
throws IOException {
|
||||
HitQueue hq = new HitQueue(nDocs);
|
||||
float minScore = 0.0f;
|
||||
int totalHits = 0;
|
||||
|
||||
for (int i = 0; i < searchers.length; i++) { // search each searcher
|
||||
TopDocs docs = searchers[i].search(query, filter, nDocs);
|
||||
totalHits += docs.totalHits; // update totalHits
|
||||
ScoreDoc[] scoreDocs = docs.scoreDocs;
|
||||
for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq
|
||||
ScoreDoc scoreDoc = scoreDocs[j];
|
||||
if (scoreDoc.score >= minScore) {
|
||||
scoreDoc.doc += starts[i]; // convert doc
|
||||
hq.put(scoreDoc); // update hit queue
|
||||
if (hq.size() > nDocs) { // if hit queue overfull
|
||||
hq.pop(); // remove lowest in hit queue
|
||||
minScore = ((ScoreDoc)hq.top()).score; // reset minScore
|
||||
}
|
||||
} else
|
||||
break; // no more scores > minScore
|
||||
}
|
||||
}
|
||||
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
|
||||
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
|
||||
scoreDocs[i] = (ScoreDoc)hq.pop();
|
||||
|
||||
return new TopDocs(totalHits, scoreDocs);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
|
||||
/** A Query that matches documents containing a subset of terms provided by a
|
||||
FilteredTermEnum enumeration. MultiTermQuery is not designed to be used by
|
||||
itself. The reason being that it is not intialized with a FilteredTermEnum
|
||||
enumeration. A FilteredTermEnum enumeration needs to be provided. For example,
|
||||
WildcardQuery and FuzzyQuery extends MultiTermQuery to provide WildcardTermEnum
|
||||
and FuzzyTermEnum respectively. */
|
||||
public class MultiTermQuery extends Query {
|
||||
private Term term;
|
||||
private FilteredTermEnum enum;
|
||||
private IndexReader reader;
|
||||
private float boost = 1.0f;
|
||||
private BooleanQuery query;
|
||||
|
||||
/** Enable or disable lucene style toString(field) format */
|
||||
private static boolean LUCENE_STYLE_TOSTRING = false;
|
||||
|
||||
/** Constructs a query for terms matching <code>term</code>. */
|
||||
public MultiTermQuery(Term term) {
|
||||
this.term = term;
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
/** Set the TermEnum to be used */
|
||||
protected void setEnum(FilteredTermEnum enum) {
|
||||
this.enum = enum;
|
||||
}
|
||||
|
||||
/** Sets the boost for this term to <code>b</code>. Documents containing
|
||||
* this term will (in addition to the normal weightings) have their score
|
||||
* multiplied by <code>boost</code>. */
|
||||
final public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
/** Returns the boost for this term. */
|
||||
final public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
||||
return getQuery().sumOfSquaredWeights(searcher);
|
||||
}
|
||||
|
||||
final void normalize(float norm) {
|
||||
try {
|
||||
getQuery().normalize(norm);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
final Scorer scorer(IndexReader reader) throws IOException {
|
||||
return getQuery().scorer(reader);
|
||||
}
|
||||
|
||||
final private BooleanQuery getQuery() throws IOException {
|
||||
if (query == null) {
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
try {
|
||||
do {
|
||||
Term t = enum.term();
|
||||
if (t != null) {
|
||||
TermQuery tq = new TermQuery(t); // found a match
|
||||
tq.setBoost(boost * enum.difference()); // set the boost
|
||||
q.add(tq, false, false); // add to q
|
||||
}
|
||||
} while (enum.next());
|
||||
} finally {
|
||||
enum.close();
|
||||
}
|
||||
query = q;
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
public String toString(String field) {
|
||||
if (!LUCENE_STYLE_TOSTRING) {
|
||||
Query q = null;
|
||||
try {
|
||||
q = getQuery();
|
||||
} catch (Exception e) {}
|
||||
if (q != null) {
|
||||
return "(" + q.toString(field) + ")";
|
||||
}
|
||||
}
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
if (!term.field().equals(field)) {
|
||||
buffer.append(term.field());
|
||||
buffer.append(":");
|
||||
}
|
||||
buffer.append(term.text());
|
||||
if (boost != 1.0f) {
|
||||
buffer.append("^");
|
||||
buffer.append(Float.toString(boost));
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,96 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
final class PhrasePositions {
|
||||
int doc; // current doc
|
||||
int position; // position in doc
|
||||
int count; // remaining pos in this doc
|
||||
int offset; // position in phrase
|
||||
TermPositions tp; // stream of positions
|
||||
PhrasePositions next; // used to make lists
|
||||
|
||||
PhrasePositions(TermPositions t, int o) throws IOException {
|
||||
tp = t;
|
||||
offset = o;
|
||||
next();
|
||||
}
|
||||
|
||||
final void next() throws IOException { // increments to next doc
|
||||
if (!tp.next()) {
|
||||
tp.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // sentinel value
|
||||
return;
|
||||
}
|
||||
doc = tp.doc();
|
||||
position = 0;
|
||||
}
|
||||
|
||||
final void firstPosition() throws IOException {
|
||||
count = tp.freq(); // read first pos
|
||||
nextPosition();
|
||||
}
|
||||
|
||||
final boolean nextPosition() throws IOException {
|
||||
if (count-- > 0) { // read subsequent pos's
|
||||
position = tp.nextPosition() - offset;
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,183 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** A Query that matches documents containing a particular sequence of terms.
|
||||
This may be combined with other terms with a {@link BooleanQuery}.
|
||||
*/
|
||||
final public class PhraseQuery extends Query {
|
||||
private String field;
|
||||
private Vector terms = new Vector();
|
||||
private float idf = 0.0f;
|
||||
private float weight = 0.0f;
|
||||
|
||||
private float boost = 1.0f;
|
||||
private int slop = 0;
|
||||
|
||||
|
||||
/** Constructs an empty phrase query. */
|
||||
public PhraseQuery() {
|
||||
}
|
||||
|
||||
/** Sets the boost for this term to <code>b</code>. Documents containing
|
||||
this term will (in addition to the normal weightings) have their score
|
||||
multiplied by <code>b</code>. */
|
||||
public final void setBoost(float b) { boost = b; }
|
||||
/** Gets the boost for this term. Documents containing
|
||||
this term will (in addition to the normal weightings) have their score
|
||||
multiplied by <code>b</code>. The boost is 1.0 by default. */
|
||||
public final float getBoost() { return boost; }
|
||||
|
||||
/** Sets the number of other words permitted between words in query phrase.
|
||||
If zero, then this is an exact phrase search. For larger values this works
|
||||
like a <code>WITHIN</code> or <code>NEAR</code> operator.
|
||||
|
||||
<p>The slop is in fact an edit-distance, where the units correspond to
|
||||
moves of terms in the query phrase out of position. For example, to switch
|
||||
the order of two words requires two moves (the first move places the words
|
||||
atop one another), so to permit re-orderings of phrases, the slop must be
|
||||
at least two.
|
||||
|
||||
<p>More exact matches are scored higher than sloppier matches, thus search
|
||||
results are sorted by exactness.
|
||||
|
||||
<p>The slop is zero by default, requiring exact matches.*/
|
||||
public final void setSlop(int s) { slop = s; }
|
||||
/** Returns the slop. See setSlop(). */
|
||||
public final int getSlop() { return slop; }
|
||||
|
||||
/** Adds a term to the end of the query phrase. */
|
||||
public final void add(Term term) {
|
||||
if (terms.size() == 0)
|
||||
field = term.field();
|
||||
else if (term.field() != field)
|
||||
throw new IllegalArgumentException
|
||||
("All phrase terms must be in the same field: " + term);
|
||||
|
||||
terms.addElement(term);
|
||||
}
|
||||
|
||||
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
||||
for (int i = 0; i < terms.size(); i++) // sum term IDFs
|
||||
idf += Similarity.idf((Term)terms.elementAt(i), searcher);
|
||||
|
||||
weight = idf * boost;
|
||||
return weight * weight; // square term weights
|
||||
}
|
||||
|
||||
final void normalize(float norm) {
|
||||
weight *= norm; // normalize for query
|
||||
weight *= idf; // factor from document
|
||||
}
|
||||
|
||||
final Scorer scorer(IndexReader reader) throws IOException {
|
||||
if (terms.size() == 0) // optimize zero-term case
|
||||
return null;
|
||||
if (terms.size() == 1) { // optimize one-term case
|
||||
Term term = (Term)terms.elementAt(0);
|
||||
TermDocs docs = reader.termDocs(term);
|
||||
if (docs == null)
|
||||
return null;
|
||||
return new TermScorer(docs, reader.norms(term.field()), weight);
|
||||
}
|
||||
|
||||
TermPositions[] tps = new TermPositions[terms.size()];
|
||||
for (int i = 0; i < terms.size(); i++) {
|
||||
TermPositions p = reader.termPositions((Term)terms.elementAt(i));
|
||||
if (p == null)
|
||||
return null;
|
||||
tps[i] = p;
|
||||
}
|
||||
|
||||
if (slop == 0) // optimize exact case
|
||||
return new ExactPhraseScorer(tps, reader.norms(field), weight);
|
||||
else
|
||||
return
|
||||
new SloppyPhraseScorer(tps, slop, reader.norms(field), weight);
|
||||
|
||||
}
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
public final String toString(String f) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
if (!field.equals(f)) {
|
||||
buffer.append(field);
|
||||
buffer.append(":");
|
||||
}
|
||||
|
||||
buffer.append("\"");
|
||||
for (int i = 0; i < terms.size(); i++) {
|
||||
buffer.append(((Term)terms.elementAt(i)).text());
|
||||
if (i != terms.size()-1)
|
||||
buffer.append(" ");
|
||||
}
|
||||
buffer.append("\"");
|
||||
|
||||
if (boost != 1.0f) {
|
||||
buffer.append("^");
|
||||
buffer.append(Float.toString(boost));
|
||||
}
|
||||
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
final class PhraseQueue extends PriorityQueue {
|
||||
PhraseQueue(int size) {
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
protected final boolean lessThan(Object o1, Object o2) {
|
||||
PhrasePositions pp1 = (PhrasePositions)o1;
|
||||
PhrasePositions pp2 = (PhrasePositions)o2;
|
||||
if (pp1.doc == pp2.doc)
|
||||
return pp1.position < pp2.position;
|
||||
else
|
||||
return pp1.doc < pp2.doc;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
import org.apache.lucene.util.*;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
abstract class PhraseScorer extends Scorer {
|
||||
protected byte[] norms;
|
||||
protected float weight;
|
||||
|
||||
protected PhraseQueue pq;
|
||||
protected PhrasePositions first, last;
|
||||
|
||||
PhraseScorer(TermPositions[] tps, byte[] n, float w) throws IOException {
|
||||
norms = n;
|
||||
weight = w;
|
||||
|
||||
// use PQ to build a sorted list of PhrasePositions
|
||||
pq = new PhraseQueue(tps.length);
|
||||
for (int i = 0; i < tps.length; i++)
|
||||
pq.put(new PhrasePositions(tps[i], i));
|
||||
pqToList();
|
||||
}
|
||||
|
||||
final void score(HitCollector results, int end) throws IOException {
|
||||
while (last.doc < end) { // find doc w/ all the terms
|
||||
while (first.doc < last.doc) { // scan forward in first
|
||||
do {
|
||||
first.next();
|
||||
} while (first.doc < last.doc);
|
||||
firstToLast();
|
||||
if (last.doc >= end)
|
||||
return;
|
||||
}
|
||||
|
||||
// found doc with all terms
|
||||
float freq = phraseFreq(); // check for phrase
|
||||
|
||||
if (freq > 0.0) {
|
||||
float score = Similarity.tf(freq)*weight; // compute score
|
||||
score *= Similarity.norm(norms[first.doc]); // normalize
|
||||
results.collect(first.doc, score); // add to results
|
||||
}
|
||||
last.next(); // resume scanning
|
||||
}
|
||||
}
|
||||
|
||||
abstract protected float phraseFreq() throws IOException;
|
||||
|
||||
protected final void pqToList() {
|
||||
last = first = null;
|
||||
while (pq.top() != null) {
|
||||
PhrasePositions pp = (PhrasePositions)pq.pop();
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
first = pp;
|
||||
last = pp;
|
||||
pp.next = null;
|
||||
}
|
||||
}
|
||||
|
||||
protected final void firstToLast() {
|
||||
last.next = first; // move first to end of list
|
||||
last = first;
|
||||
first = first.next;
|
||||
last.next = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,153 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** A Query that matches documents containing terms with a specified prefix. */
|
||||
final public class PrefixQuery extends Query {
|
||||
private Term prefix;
|
||||
private IndexReader reader;
|
||||
private float boost = 1.0f;
|
||||
private BooleanQuery query;
|
||||
|
||||
/** Constructs a query for terms starting with <code>prefix</code>. */
|
||||
public PrefixQuery(Term prefix) {
|
||||
this.prefix = prefix;
|
||||
this.reader = reader;
|
||||
}
|
||||
|
||||
/** Sets the boost for this term to <code>b</code>. Documents containing
|
||||
this term will (in addition to the normal weightings) have their score
|
||||
multiplied by <code>boost</code>. */
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
/** Returns the boost for this term. */
|
||||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
final void prepare(IndexReader reader) {
|
||||
this.query = null;
|
||||
this.reader = reader;
|
||||
}
|
||||
|
||||
final float sumOfSquaredWeights(Searcher searcher)
|
||||
throws IOException {
|
||||
return getQuery().sumOfSquaredWeights(searcher);
|
||||
}
|
||||
|
||||
void normalize(float norm) {
|
||||
try {
|
||||
getQuery().normalize(norm);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
Scorer scorer(IndexReader reader) throws IOException {
|
||||
return getQuery().scorer(reader);
|
||||
}
|
||||
|
||||
private BooleanQuery getQuery() throws IOException {
|
||||
if (query == null) {
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
TermEnum enum = reader.terms(prefix);
|
||||
try {
|
||||
String prefixText = prefix.text();
|
||||
String prefixField = prefix.field();
|
||||
do {
|
||||
Term term = enum.term();
|
||||
if (term != null &&
|
||||
term.text().startsWith(prefixText) &&
|
||||
term.field() == prefixField) {
|
||||
TermQuery tq = new TermQuery(term); // found a match
|
||||
tq.setBoost(boost); // set the boost
|
||||
q.add(tq, false, false); // add to q
|
||||
//System.out.println("added " + term);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (enum.next());
|
||||
} finally {
|
||||
enum.close();
|
||||
}
|
||||
query = q;
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
public String toString(String field) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
if (!prefix.field().equals(field)) {
|
||||
buffer.append(prefix.field());
|
||||
buffer.append(":");
|
||||
}
|
||||
buffer.append(prefix.text());
|
||||
buffer.append('*');
|
||||
if (boost != 1.0f) {
|
||||
buffer.append("^");
|
||||
buffer.append(Float.toString(boost));
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/** The abstract base class for queries.
|
||||
<p>Instantiable subclasses are:
|
||||
<ul>
|
||||
<li> {@link TermQuery}
|
||||
<li> {@link PhraseQuery}
|
||||
<li> {@link BooleanQuery}
|
||||
</ul>
|
||||
<p>A parser for queries is contained in:
|
||||
<ul>
|
||||
<li><a href="doc/lucene.queryParser.QueryParser.html">QueryParser</a>
|
||||
</ul>
|
||||
*/
|
||||
abstract public class Query {
|
||||
|
||||
// query weighting
|
||||
abstract float sumOfSquaredWeights(Searcher searcher) throws IOException;
|
||||
abstract void normalize(float norm);
|
||||
|
||||
// query evaluation
|
||||
abstract Scorer scorer(IndexReader reader) throws IOException;
|
||||
|
||||
void prepare(IndexReader reader) {}
|
||||
|
||||
static Scorer scorer(Query query, Searcher searcher, IndexReader reader)
|
||||
throws IOException {
|
||||
query.prepare(reader);
|
||||
float sum = query.sumOfSquaredWeights(searcher);
|
||||
float norm = 1.0f / (float)Math.sqrt(sum);
|
||||
query.normalize(norm);
|
||||
return query.scorer(reader);
|
||||
}
|
||||
|
||||
/** Prints a query to a string, with <code>field</code> as the default field
|
||||
for terms.
|
||||
<p>The representation used is one that is readable by
|
||||
<a href="doc/lucene.queryParser.QueryParser.html">QueryParser</a>
|
||||
(although, if the query was created by the parser, the printed
|
||||
representation may not be exactly what was parsed). */
|
||||
abstract public String toString(String field);
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/** A simple pairing of a document number with the score assigned to it. */
final class ScoreDoc {
  /** The document number supplied at construction. */
  int doc;

  /** The score supplied at construction. */
  float score;

  /**
   * Creates a pair holding the given document number and score.
   *
   * @param d the document number to store in {@link #doc}
   * @param s the score to store in {@link #score}
   */
  ScoreDoc(int d, float s) {
    this.score = s;
    this.doc = d;
  }
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue