Initial revision

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149570 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jason van Zyl 2001-09-18 16:29:48 +00:00
parent 749b4aaf7e
commit bd3948c539
134 changed files with 16175 additions and 0 deletions

View File

@ -0,0 +1,87 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import com.lucene.store.Directory;
import com.lucene.store.FSDirectory;
import com.lucene.index.IndexReader;
import com.lucene.index.Term;
class DeleteFiles {
public static void main(String[] args) {
try {
Directory directory = FSDirectory.getDirectory("demo index", false);
IndexReader reader = IndexReader.open(directory);
// Term term = new Term("path", "pizza");
// int deleted = reader.delete(term);
// System.out.println("deleted " + deleted +
// " documents containing " + term);
for (int i = 0; i < reader.maxDoc(); i++)
reader.delete(i);
reader.close();
directory.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.File;
import java.io.Reader;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import com.lucene.document.Document;
import com.lucene.document.Field;
import com.lucene.document.DateField;
/** A utility for making Lucene Documents from a File. */
public class FileDocument {
/** Makes a document for a File.
<p>
The document has three fields:
<ul>
<li><code>path</code>--containing the pathname of the file, as a stored,
tokenized field;
<li><code>modified</code>--containing the last modified date of the file as
a keyword field as encoded by <a
href="lucene.document.DateField.html">DateField</a>; and
<li><code>contents</code>--containing the full contents of the file, as a
Reader field;
*/
public static Document Document(File f)
throws java.io.FileNotFoundException {
// make a new, empty document
Document doc = new Document();
// Add the path of the file as a field named "path". Use a Text field, so
// that the index stores the path, and so that the path is searchable
doc.add(Field.Text("path", f.getPath()));
// Add the last modified date of the file a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
doc.add(Field.Keyword("modified",
DateField.timeToString(f.lastModified())));
// Add the contents of the file a field named "contents". Use a Text
// field, specifying a Reader, so that the text of the file is tokenized.
// ?? why doesn't FileReader work here ??
FileInputStream is = new FileInputStream(f);
Reader reader = new BufferedReader(new InputStreamReader(is));
doc.add(Field.Text("contents", reader));
// return the document
return doc;
}
private FileDocument() {}
}

View File

@ -0,0 +1,121 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
import com.lucene.document.*;
import demo.HTMLParser.HTMLParser;
/** A utility for making Lucene Documents for HTML documents. */
public class HTMLDocument {
static char dirSep = System.getProperty("file.separator").charAt(0);
public static String uid(File f) {
// Append path and date into a string in such a way that lexicographic
// sorting gives the same results as a walk of the file hierarchy. Thus
// null (\u0000) is used both to separate directory components and to
// separate the path from the date.
return f.getPath().replace(dirSep, '\u0000') +
"\u0000" +
DateField.timeToString(f.lastModified());
}
public static String uid2url(String uid) {
String url = uid.replace('\u0000', '/'); // replace nulls with slashes
return url.substring(0, url.lastIndexOf('/')); // remove date from end
}
public static Document Document(File f)
throws IOException, InterruptedException {
// make a new, empty document
Document doc = new Document();
// Add the url as a field named "url". Use an UnIndexed field, so
// that the url is just stored with the document, but is not searchable.
doc.add(Field.UnIndexed("url", f.getPath().replace(dirSep, '/')));
// Add the last modified date of the file a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
doc.add(Field.Keyword("modified",
DateField.timeToString(f.lastModified())));
// Add the uid as a field, so that index can be incrementally maintained.
// This field is not stored with document, it is indexed, but it is not
// tokenized prior to indexing.
doc.add(new Field("uid", uid(f), false, true, false));
HTMLParser parser = new HTMLParser(f);
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.
doc.add(Field.Text("contents", parser.getReader()));
// Add the summary as an UnIndexed field, so that it is stored and returned
// with hit documents for display.
doc.add(Field.UnIndexed("summary", parser.getSummary()));
// Add the title as a separate Text field, so that it can be searched
// separately.
doc.add(Field.Text("title", parser.getTitle()));
// return the document
return doc;
}
private HTMLDocument() {}
}

View File

@ -0,0 +1,7 @@
HTMLParser.java
HTMLParserTokenManager.java
TokenMgrError.java
ParseException.java
Token.java
ASCII_CharStream.java
HTMLParserConstants.java

View File

@ -0,0 +1,365 @@
package demo.HTMLParser;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.*;
public class Entities {
static final Hashtable decoder = new Hashtable(300);
static final String[] encoder = new String[0x100];
static final String decode(String entity) {
if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon
entity = entity.substring(0, entity.length()-1);
if (entity.charAt(1) == '#') {
int start = 2;
int radix = 10;
if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
start++;
radix = 16;
}
Character c =
new Character((char)Integer.parseInt(entity.substring(start), radix));
return c.toString();
} else {
String s = (String)decoder.get(entity);
if (s != null)
return s;
else return "";
}
}
static final public String encode(String s) {
int length = s.length();
StringBuffer buffer = new StringBuffer(length * 2);
for (int i = 0; i < length; i++) {
char c = s.charAt(i);
int j = (int)c;
if (j < 0x100 && encoder[j] != null) {
buffer.append(encoder[j]); // have a named encoding
buffer.append(';');
} else if (j < 0x80) {
buffer.append(c); // use ASCII value
} else {
buffer.append("&#"); // use numeric encoding
buffer.append((int)c);
buffer.append(';');
}
}
return buffer.toString();
}
static final void add(String entity, int value) {
decoder.put(entity, (new Character((char)value)).toString());
if (value < 0x100)
encoder[value] = entity;
}
static {
add("&nbsp", 160);
add("&iexcl", 161);
add("&cent", 162);
add("&pound", 163);
add("&curren", 164);
add("&yen", 165);
add("&brvbar", 166);
add("&sect", 167);
add("&uml", 168);
add("&copy", 169);
add("&ordf", 170);
add("&laquo", 171);
add("&not", 172);
add("&shy", 173);
add("&reg", 174);
add("&macr", 175);
add("&deg", 176);
add("&plusmn", 177);
add("&sup2", 178);
add("&sup3", 179);
add("&acute", 180);
add("&micro", 181);
add("&para", 182);
add("&middot", 183);
add("&cedil", 184);
add("&sup1", 185);
add("&ordm", 186);
add("&raquo", 187);
add("&frac14", 188);
add("&frac12", 189);
add("&frac34", 190);
add("&iquest", 191);
add("&Agrave", 192);
add("&Aacute", 193);
add("&Acirc", 194);
add("&Atilde", 195);
add("&Auml", 196);
add("&Aring", 197);
add("&AElig", 198);
add("&Ccedil", 199);
add("&Egrave", 200);
add("&Eacute", 201);
add("&Ecirc", 202);
add("&Euml", 203);
add("&Igrave", 204);
add("&Iacute", 205);
add("&Icirc", 206);
add("&Iuml", 207);
add("&ETH", 208);
add("&Ntilde", 209);
add("&Ograve", 210);
add("&Oacute", 211);
add("&Ocirc", 212);
add("&Otilde", 213);
add("&Ouml", 214);
add("&times", 215);
add("&Oslash", 216);
add("&Ugrave", 217);
add("&Uacute", 218);
add("&Ucirc", 219);
add("&Uuml", 220);
add("&Yacute", 221);
add("&THORN", 222);
add("&szlig", 223);
add("&agrave", 224);
add("&aacute", 225);
add("&acirc", 226);
add("&atilde", 227);
add("&auml", 228);
add("&aring", 229);
add("&aelig", 230);
add("&ccedil", 231);
add("&egrave", 232);
add("&eacute", 233);
add("&ecirc", 234);
add("&euml", 235);
add("&igrave", 236);
add("&iacute", 237);
add("&icirc", 238);
add("&iuml", 239);
add("&eth", 240);
add("&ntilde", 241);
add("&ograve", 242);
add("&oacute", 243);
add("&ocirc", 244);
add("&otilde", 245);
add("&ouml", 246);
add("&divide", 247);
add("&oslash", 248);
add("&ugrave", 249);
add("&uacute", 250);
add("&ucirc", 251);
add("&uuml", 252);
add("&yacute", 253);
add("&thorn", 254);
add("&yuml", 255);
add("&fnof", 402);
add("&Alpha", 913);
add("&Beta", 914);
add("&Gamma", 915);
add("&Delta", 916);
add("&Epsilon",917);
add("&Zeta", 918);
add("&Eta", 919);
add("&Theta", 920);
add("&Iota", 921);
add("&Kappa", 922);
add("&Lambda", 923);
add("&Mu", 924);
add("&Nu", 925);
add("&Xi", 926);
add("&Omicron",927);
add("&Pi", 928);
add("&Rho", 929);
add("&Sigma", 931);
add("&Tau", 932);
add("&Upsilon",933);
add("&Phi", 934);
add("&Chi", 935);
add("&Psi", 936);
add("&Omega", 937);
add("&alpha", 945);
add("&beta", 946);
add("&gamma", 947);
add("&delta", 948);
add("&epsilon",949);
add("&zeta", 950);
add("&eta", 951);
add("&theta", 952);
add("&iota", 953);
add("&kappa", 954);
add("&lambda", 955);
add("&mu", 956);
add("&nu", 957);
add("&xi", 958);
add("&omicron",959);
add("&pi", 960);
add("&rho", 961);
add("&sigmaf", 962);
add("&sigma", 963);
add("&tau", 964);
add("&upsilon",965);
add("&phi", 966);
add("&chi", 967);
add("&psi", 968);
add("&omega", 969);
add("&thetasym",977);
add("&upsih", 978);
add("&piv", 982);
add("&bull", 8226);
add("&hellip", 8230);
add("&prime", 8242);
add("&Prime", 8243);
add("&oline", 8254);
add("&frasl", 8260);
add("&weierp", 8472);
add("&image", 8465);
add("&real", 8476);
add("&trade", 8482);
add("&alefsym",8501);
add("&larr", 8592);
add("&uarr", 8593);
add("&rarr", 8594);
add("&darr", 8595);
add("&harr", 8596);
add("&crarr", 8629);
add("&lArr", 8656);
add("&uArr", 8657);
add("&rArr", 8658);
add("&dArr", 8659);
add("&hArr", 8660);
add("&forall", 8704);
add("&part", 8706);
add("&exist", 8707);
add("&empty", 8709);
add("&nabla", 8711);
add("&isin", 8712);
add("&notin", 8713);
add("&ni", 8715);
add("&prod", 8719);
add("&sum", 8721);
add("&minus", 8722);
add("&lowast", 8727);
add("&radic", 8730);
add("&prop", 8733);
add("&infin", 8734);
add("&ang", 8736);
add("&and", 8743);
add("&or", 8744);
add("&cap", 8745);
add("&cup", 8746);
add("&int", 8747);
add("&there4", 8756);
add("&sim", 8764);
add("&cong", 8773);
add("&asymp", 8776);
add("&ne", 8800);
add("&equiv", 8801);
add("&le", 8804);
add("&ge", 8805);
add("&sub", 8834);
add("&sup", 8835);
add("&nsub", 8836);
add("&sube", 8838);
add("&supe", 8839);
add("&oplus", 8853);
add("&otimes", 8855);
add("&perp", 8869);
add("&sdot", 8901);
add("&lceil", 8968);
add("&rceil", 8969);
add("&lfloor", 8970);
add("&rfloor", 8971);
add("&lang", 9001);
add("&rang", 9002);
add("&loz", 9674);
add("&spades", 9824);
add("&clubs", 9827);
add("&hearts", 9829);
add("&diams", 9830);
add("&quot", 34);
add("&amp", 38);
add("&lt", 60);
add("&gt", 62);
add("&OElig", 338);
add("&oelig", 339);
add("&Scaron", 352);
add("&scaron", 353);
add("&Yuml", 376);
add("&circ", 710);
add("&tilde", 732);
add("&ensp", 8194);
add("&emsp", 8195);
add("&thinsp", 8201);
add("&zwnj", 8204);
add("&zwj", 8205);
add("&lrm", 8206);
add("&rlm", 8207);
add("&ndash", 8211);
add("&mdash", 8212);
add("&lsquo", 8216);
add("&rsquo", 8217);
add("&sbquo", 8218);
add("&ldquo", 8220);
add("&rdquo", 8221);
add("&bdquo", 8222);
add("&dagger", 8224);
add("&Dagger", 8225);
add("&permil", 8240);
add("&lsaquo", 8249);
add("&rsaquo", 8250);
add("&euro", 8364);
}
}

View File

@ -0,0 +1,347 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
// HTMLParser.jj
options {
STATIC = false;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_LOOKAHEAD = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(HTMLParser)
package org.apache.lucene.HTMLParser;
import java.io.*;
public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inScript = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || (length > SUMMARY_LENGTH))
break;
wait(10);
}
}
return title.toString().trim();
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH)
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit))
return sum.substring(tit.length());
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeIn = new PipedReader();
pipeOut = new PipedWriter(pipeIn);
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inScript)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !title.equals("")) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addSpace() throws IOException {
if (inScript)
return;
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
// Token t;
// do {
// t = getNextToken();
// } while (t.kind != TagEnd);
// }
}
PARSER_END(HTMLParser)
void HTMLDocument() throws IOException :
{
Token t;
}
{
// try {
( Tag() { afterTag = true; }
| t=Decl() { afterTag = true; }
| CommentTag() { afterTag = true; }
| t=<Word> { addText(t.image); afterTag = false; }
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
| <Space> { addSpace(); afterTag = false; }
)* <EOF>
// } catch (ParseException e) {
// handleException(e);
// }
}
void Tag() throws IOException :
{
Token t1, t2;
boolean inImg = false;
}
{
t1=<TagName> {
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
}
(t1=<ArgName>
(<ArgEquals>
(t2=ArgValue() // save ALT text in IMG tag
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
}
)?
)?
)*
<TagEnd>
}
Token ArgValue() :
{
Token t = null;
}
{
t=<ArgValue> { return t; }
| LOOKAHEAD(2)
<ArgQuote1> <CloseQuote1> { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
| LOOKAHEAD(2)
<ArgQuote2> <CloseQuote2> { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
}
Token Decl() :
{
Token t;
}
{
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
{ return t; }
}
void CommentTag() :
{}
{
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
TOKEN :
{
< TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < Comment1: "<!--" > : WithinComment1
| < Comment2: "<!" > : WithinComment2
| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM: ["0"-"9"] >
| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? ) >
| < Space: (<SP>)+ >
| < #SP: [" ","\t","\r","\n"] >
| < Punct: ~[] > // Keep this last. It is a catch-all.
}
<WithinTag> TOKEN:
{
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" > : AfterEquals
| < TagEnd: ">" | "=>" > : DEFAULT
}
<AfterEquals> TOKEN:
{
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n",">"])* > : WithinTag
}
<WithinTag, AfterEquals> TOKEN:
{
< ArgQuote1: "'" > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}
<WithinTag, AfterEquals> SKIP:
{
< <Space> >
}
<WithinQuote1> TOKEN:
{
< Quote1Text: (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}
<WithinQuote2> TOKEN:
{
< Quote2Text: (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}
<WithinComment1> TOKEN :
{
< CommentText1: (~["-"])+ | "-" >
| < CommentEnd1: "-->" > : DEFAULT
}
<WithinComment2> TOKEN :
{
< CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}

View File

@ -0,0 +1,3 @@
# sub-directory makefile for lucene
ROOT = ../..
include ../../com/lucene/rules.mk

View File

@ -0,0 +1,86 @@
package org.apache.lucene.HTMLParser;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
class ParserThread extends Thread {
HTMLParser parser;
ParserThread(HTMLParser p) {
parser = p;
}
public void run() { // convert pipeOut to pipeIn
try {
try { // parse document to pipeOut
parser.HTMLDocument();
} catch (ParseException e) {
System.out.println("Parse Aborted: " + e.getMessage());
} catch (TokenMgrError e) {
System.out.println("Parse Aborted: " + e.getMessage());
} finally {
parser.pipeOut.close();
synchronized (parser) {
parser.summary.setLength(parser.SUMMARY_LENGTH);
parser.titleComplete = true;
parser.notifyAll();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.HTMLParser;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
class Test {
public static void main(String[] argv) throws Exception {
if ("-dir".equals(argv[0])) {
String[] files = new File(argv[1]).list();
java.util.Arrays.sort(files);
for (int i = 0; i < files.length; i++) {
System.err.println(files[i]);
File file = new File(argv[1], files[i]);
parse(file);
}
} else
parse(new File(argv[0]));
}
public static void parse(File file) throws Exception {
HTMLParser parser = new HTMLParser(file);
System.out.println("Title: " + Entities.encode(parser.getTitle()));
System.out.println("Summary: " + Entities.encode(parser.getSummary()));
LineNumberReader reader = new LineNumberReader(parser.getReader());
for (String l = reader.readLine(); l != null; l = reader.readLine())
System.out.println(l);
}
}

View File

@ -0,0 +1,98 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.lucene.analysis.StopAnalyzer;
import com.lucene.index.IndexWriter;
import java.io.File;
import java.util.Date;
class IndexFiles {
public static void main(String[] args) {
try {
Date start = new Date();
IndexWriter writer = new IndexWriter("index", new StopAnalyzer(), true);
writer.mergeFactor = 20;
indexDocs(writer, new File(args[0]));
writer.optimize();
writer.close();
Date end = new Date();
System.out.print(end.getTime() - start.getTime());
System.out.println(" total milliseconds");
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
public static void indexDocs(IndexWriter writer, File file)
throws Exception {
if (file.isDirectory()) {
String[] files = file.list();
for (int i = 0; i < files.length; i++)
indexDocs(writer, new File(file, files[i]));
} else {
System.out.println("adding " + file);
writer.addDocument(FileDocument.Document(file));
}
}
}

View File

@ -0,0 +1,195 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.lucene.analysis.StopAnalyzer;
import com.lucene.index.*;
import com.lucene.document.Document;
import com.lucene.util.Arrays;
import demo.HTMLParser.HTMLParser;
import java.io.File;
import java.util.Date;
class IndexHTML {
private static boolean deleting = false; // true during deletion pass
private static IndexReader reader; // existing index
private static IndexWriter writer; // new index being built
private static TermEnum uidIter; // document id iterator
public static void main(String[] argv) {
try {
String index = "index";
boolean create = false;
File root = null;
String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
if (argv.length == 0) {
System.err.println("Usage: " + usage);
return;
}
for (int i = 0; i < argv.length; i++) {
if (argv[i].equals("-index")) { // parse -index option
index = argv[++i];
} else if (argv[i].equals("-create")) { // parse -create option
create = true;
} else if (i != argv.length-1) {
System.err.println("Usage: " + usage);
return;
} else
root = new File(argv[i]);
}
Date start = new Date();
if (!create) { // delete stale docs
deleting = true;
indexDocs(root, index, create);
}
writer = new IndexWriter(index, new StopAnalyzer(), create);
writer.mergeFactor = 20;
writer.maxFieldLength = 1000000;
indexDocs(root, index, create); // add new docs
System.out.println("Optimizing index...");
writer.optimize();
writer.close();
Date end = new Date();
System.out.print(end.getTime() - start.getTime());
System.out.println(" total milliseconds");
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
/* Walk directory hierarchy in uid order, while keeping uid iterator from
/* existing index in sync. Mismatches indicate one of: (a) old documents to
/* be deleted; (b) unchanged documents, to be left alone; or (c) new
/* documents, to be indexed.
*/
private static void indexDocs(File file, String index, boolean create)
throws Exception {
if (!create) { // incrementally update
reader = IndexReader.open(index); // open existing index
uidIter = reader.terms(new Term("uid", "")); // init uid iterator
indexDocs(file);
if (deleting) { // delete rest of stale docs
while (uidIter.term() != null && uidIter.term().field() == "uid") {
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.delete(uidIter.term());
uidIter.next();
}
deleting = false;
}
uidIter.close(); // close uid iterator
reader.close(); // close existing index
} else // don't have exisiting
indexDocs(file);
}
private static void indexDocs(File file) throws Exception {
if (file.isDirectory()) { // if a directory
String[] files = file.list(); // list its files
Arrays.sort(files); // sort the files
for (int i = 0; i < files.length; i++) // recursively index them
indexDocs(new File(file, files[i]));
} else if (file.getPath().endsWith(".html") || // index .html files
file.getPath().endsWith(".htm") || // index .htm files
file.getPath().endsWith(".txt")) { // index .txt files
if (uidIter != null) {
String uid = HTMLDocument.uid(file); // construct uid for doc
while (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) < 0) {
if (deleting) { // delete stale docs
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.delete(uidIter.term());
}
uidIter.next();
}
if (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) == 0) {
uidIter.next(); // keep matching docs
} else if (!deleting) { // add new docs
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("url"));
writer.addDocument(doc);
}
} else { // creating a new index
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("url"));
writer.addDocument(doc); // add docs unconditionally
}
}
}
}

View File

@ -0,0 +1,3 @@
# sub-directory makefile for lucene
ROOT = ..
include ../com/lucene/rules.mk

View File

@ -0,0 +1,17 @@
<HTML>
<HEAD>
<TITLE>Lucene Search Demo</TITLE>
</HEAD>
<BODY>
<CENTER>
<H1>
Lucene Search Demo</H1>
<form name=search action=http://localhost:8080/Search.jhtml method=get>
<input name=query size=44>&nbsp;<input type=submit value=Search></form>
</CENTER>
</BODY>
</HTML>

View File

@ -0,0 +1,166 @@
<HTML><!-- -*-java-*- -->
<!-- Lucene Search Demo via CompiledPageServlet -->
<!-- Copyright (c) 1998,2000 Douglass R. Cutting. -->
<java type=import>
javax.servlet.*
javax.servlet.http.*
java.io.*
com.lucene.analysis.*
com.lucene.document.*
com.lucene.index.*
com.lucene.search.*
com.lucene.queryParser.*
demo.HTMLParser.Entities
</java>
<java>
// get index from request
String indexName = request.getParameter("index");
if (indexName == null) // default to "index"
indexName = "index";
Searcher searcher = // make searcher
new IndexSearcher(getReader(indexName));
// get query from request
String queryString = request.getParameter("query");
if (queryString == null)
throw new ServletException("no query specified");
int start = 0; // first hit to display
String startString = request.getParameter("start");
if (startString != null)
start = Integer.parseInt(startString);
int hitsPerPage = 10; // number of hits to display
String hitsString = request.getParameter("hitsPerPage");
if (hitsString != null)
hitsPerPage = Integer.parseInt(hitsString);
boolean showSummaries = true; // show summaries?
if ("false".equals(request.getParameter("showSummaries")))
showSummaries = false;
Query query = null;
try { // parse query
query = QueryParser.parse(queryString, "contents", analyzer);
} catch (ParseException e) { // error parsing query
</java>
<HEAD><TITLE>Error Parsing Query</TITLE></HEAD><BODY>
<p>While parsing `queryString`: `e.getMessage()`
<java>
return;
}
String servletPath = request.getRequestURI(); // getServletPath should work
int j = servletPath.indexOf('?'); // here but doesn't, so we
if (j != -1) // remove query by hand...
servletPath = servletPath.substring(0, j);
</java>
<head><title>Lucene Search Results</title></head><body>
<center>
<form name=search action=`servletPath` method=get>
<input name=query size=44 value='`queryString`'>
<input type=hidden name=index value="`indexName`">
<input type=hidden name=hitsPerPage value=`hitsPerPage`>
<input type=hidden name=showSummaries value=`showSummaries`>
<input type=submit value=Search>
</form>
</center>
<java>
Hits hits = searcher.search(query); // perform query
int end = Math.min(hits.length(), start + hitsPerPage);
</java>
<p>Hits <b><java type=print>start+1</java>-<java type=print>end</java></b>
(out of <java type=print>hits.length()</java> total matching documents):
<ul>
<java>
for (int i = start; i < end; i++) { // display the hits
Document doc = hits.doc(i);
String title = doc.get("title");
if (title.equals("")) // use url for docs w/o title
title = doc.get("url");
</java>
<p><b><java type=print>(int)(hits.score(i) * 100.0f)</java>%
<a href="`doc.get("url")`">
<java type=print>Entities.encode(title)</java>
</b></a>
<java>
if (showSummaries) { // maybe show summary
</java>
<ul><i>Summary</i>:
<java type=print>Entities.encode(doc.get("summary"))</java>
</ul>
<java>
}
}
</java>
</ul>
<java>
if (end < hits.length()) { // insert next page button
</java>
<center>
<form name=search action=`servletPath` method=get>
<input type=hidden name=query value='`queryString`'>
<input type=hidden name=start value=`end`>
<input type=hidden name=index value="`indexName`">
<input type=hidden name=hitsPerPage value=`hitsPerPage`>
<input type=hidden name=showSummaries value=`showSummaries`>
<input type=submit value=Next>
</form>
</center>
<java>
}
</java>
</body>
<java type=class>
Analyzer analyzer = new StopAnalyzer(); // used to tokenize queries
/** Keep a cache of open IndexReader's, so that an index does not have to
opened for each query. The cache re-opens an index when it has changed
so that additions and deletions are visible ASAP. */
static Hashtable indexCache = new Hashtable(); // name->CachedIndex
class CachedIndex { // an entry in the cache
IndexReader reader; // an open reader
long modified; // reader's modified date
CachedIndex(String name) throws IOException {
modified = IndexReader.lastModified(name); // get modified date
reader = IndexReader.open(name); // open reader
}
}
IndexReader getReader(String name) throws ServletException {
CachedIndex index = // look in cache
(CachedIndex)indexCache.get(name);
try {
if (index != null && // check up-to-date
(index.modified == IndexReader.lastModified(name)))
return index.reader; // cache hit
else {
index = new CachedIndex(name); // cache miss
}
} catch (IOException e) {
StringWriter writer = new StringWriter();
PrintWriter pw = new PrintWriter(writer);
throw new ServletException("Could not open index " + name + ": " +
e.getClass().getName() + "--" +
e.getMessage());
}
indexCache.put(name, index); // add to cache
return index.reader;
}
</java>

View File

@ -0,0 +1,110 @@
package org.apache.lucene;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import com.lucene.analysis.Analyzer;
import com.lucene.analysis.StopAnalyzer;
import com.lucene.document.Document;
import com.lucene.search.Searcher;
import com.lucene.search.IndexSearcher;
import com.lucene.search.Query;
import com.lucene.search.Hits;
import com.lucene.queryParser.QueryParser;
class SearchFiles {
public static void main(String[] args) {
try {
Searcher searcher = new IndexSearcher("index");
Analyzer analyzer = new StopAnalyzer();
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
while (true) {
System.out.print("Query: ");
String line = in.readLine();
if (line.length() == -1)
break;
Query query = QueryParser.parse(line, "contents", analyzer);
System.out.println("Searching for: " + query.toString("contents"));
Hits hits = searcher.search(query);
System.out.println(hits.length() + " total matching documents");
final int HITS_PER_PAGE = 10;
for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
int end = Math.min(hits.length(), start + HITS_PER_PAGE);
for (int i = start; i < end; i++)
System.out.println(i + ". " + hits.doc(i).get("path"));
if (hits.length() > end) {
System.out.print("more (y/n) ? ");
line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n')
break;
}
}
}
searcher.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
}

View File

@ -0,0 +1,9 @@
# top-level makefile for lucene
all: jar doc
# root is two levels up
ROOT = ../..
include rules.mk

View File

@ -0,0 +1,91 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* Typical implementations first build a Tokenizer, which breaks the stream of
* characters from the Reader into raw Tokens. One or more TokenFilters may
* then be applied to the output of the Tokenizer.
* <p>
* WARNING: You must override one of the methods defined by this class in your
* subclass or the Analyzer will enter an infinite loop.
*/
abstract public class Analyzer {
/** Creates a TokenStream which tokenizes all the text in the provided
Reader. Default implementation forwards to tokenStream(Reader) for
compatibility with older version. Override to allow Analyzer to choose
strategy based on document and/or field. Must be able to handle null
field name for backward compatibility. */
public TokenStream tokenStream(String fieldName, Reader reader)
{
// implemented for backward compatibility
return tokenStream(reader);
}
/** Creates a TokenStream which tokenizes all the text in the provided
* Reader. Provided for backward compatibility only.
* @deprecated use tokenStream(String, Reader) instead.
* @see tokenStream(String, Reader)
*/
public TokenStream tokenStream(Reader reader)
{
return tokenStream(null, reader);
}
}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
to say, it defines tokens as maximal strings of adjacent letters, as defined
by java.lang.Character.isLetter() predicate.
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class LetterTokenizer extends Tokenizer {
public LetterTokenizer(Reader in) {
input = in;
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
};
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isLetter(c)) { // if it's a letter
if (length == 0) // start of token
start = offset-1;
buffer[length++] = c; // buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
return new Token(new String(buffer, 0, length), start, start+length);
}
}

View File

@ -0,0 +1,74 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/** Normalizes token text to lower case. */
public final class LowerCaseFilter extends TokenFilter {
public LowerCaseFilter(TokenStream in) {
input = in;
}
public final Token next() throws java.io.IOException {
Token t = input.next();
if (t == null)
return null;
t.termText = t.termText.toLowerCase();
return t;
}
}

View File

@ -0,0 +1,116 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** LowerCaseTokenizer performs the function of LetterTokenizer
and LowerCaseFilter together. It divides text at non-letters and converts
them to lower case. While it is functionally equivalent to the combination
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
to doing the two tasks at once, hence this (redundent) implementation.
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class LowerCaseTokenizer extends Tokenizer {
public LowerCaseTokenizer(Reader in) {
input = in;
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
};
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isLetter(c)) { // if it's a letter
if (length == 0) // start of token
start = offset-1;
buffer[length++] = Character.toLowerCase(c);
// buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
return new Token(new String(buffer, 0, length), start, start+length);
}
}

View File

@ -0,0 +1,2 @@
# sub-directory makefile for lucene
include ../rules.mk

View File

@ -0,0 +1,98 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Hashtable;
/** Transforms the token stream as per the Porter stemming algorithm.
Note: the input to the stemming filter must already be in lower case,
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
down the Tokenizer chain in order for this to work properly!
To use this filter with other analyzers, you'll want to write an
Analyzer class that sets up the TokenStream chain as you want it.
To use this with LowerCaseTokenizer, for example, you'd write an
analyzer like this:
class MyAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new PorterStemFilter(new LowerCaseTokenizer(reader));
}
}
*/
public final class PorterStemFilter extends TokenFilter {
private PorterStemmer stemmer;
public PorterStemFilter(TokenStream in) {
stemmer = new PorterStemmer();
input = in;
}
/** Returns the next input Token, after being stemmed */
public final Token next() throws IOException {
Token token = input.next();
if (token == null)
return null;
else {
String s = stemmer.stem(token.termText);
if (s != token.termText) // Yes, I mean object reference comparison here
token.termText = s;
return token;
}
}
}

View File

@ -0,0 +1,584 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
Porter stemmer in Java. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
See also http://www.muscat.com/~martin/stem.html
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then out outside the bounds of b.
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.
Release 3.
[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations. ]
*/
import java.io.*;
/**
*
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
* word can be provided a character at time (by calling add()), or at once
* by calling one of the various stem(something) methods.
*/
class PorterStemmer
{
private char[] b;
private int i, /* offset into b */
j, k, k0;
private boolean dirty = false;
private static final int INC = 50; /* unit of size whereby b is increased */
private static final int EXTRA = 1;
public PorterStemmer() {
b = new char[INC];
i = 0;
}
/**
* reset() resets the stemmer so it can stem another word. If you invoke
* the stemmer by calling add(char) and then stem(), you must call reset()
* before starting another word.
*/
public void reset() { i = 0; dirty = false; }
/**
* Add a character to the word being stemmed. When you are finished
* adding characters, you can call stem(void) to process the word.
*/
public void add(char ch) {
if (b.length <= i + EXTRA) {
char[] new_b = new char[b.length+INC];
for (int c = 0; c < b.length; c++)
new_b[c] = b[c];
b = new_b;
}
b[i++] = ch;
}
/**
* After a word has been stemmed, it can be retrieved by toString(),
* or a reference to the internal buffer can be retrieved by getResultBuffer
* and getResultLength (which is generally more efficient.)
*/
public String toString() { return new String(b,0,i); }
/**
* Returns the length of the word resulting from the stemming process.
*/
public int getResultLength() { return i; }
/**
* Returns a reference to a character buffer containing the results of
* the stemming process. You also need to consult getResultLength()
* to determine the length of the result.
*/
public char[] getResultBuffer() { return b; }
/* cons(i) is true <=> b[i] is a consonant. */
private final boolean cons(int i) {
switch (b[i]) {
case 'a': case 'e': case 'i': case 'o': case 'u':
return false;
case 'y':
return (i==k0) ? true : !cons(i-1);
default:
return true;
}
}
/* m() measures the number of consonant sequences between k0 and j. if c is
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
*/
private final int m() {
int n = 0;
int i = k0;
while(true) {
if (i > j)
return n;
if (! cons(i))
break;
i++;
}
i++;
while(true) {
while(true) {
if (i > j)
return n;
if (cons(i))
break;
i++;
}
i++;
n++;
while(true) {
if (i > j)
return n;
if (! cons(i))
break;
i++;
}
i++;
}
}
/* vowelinstem() is true <=> k0,...j contains a vowel */
private final boolean vowelinstem() {
int i;
for (i = k0; i <= j; i++)
if (! cons(i))
return true;
return false;
}
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
private final boolean doublec(int j) {
if (j < k0+1)
return false;
if (b[j] != b[j-1])
return false;
return cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
and also if the second c is not w,x or y. this is used when trying to
restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
*/
private final boolean cvc(int i) {
if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
return false;
else {
int ch = b[i];
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
}
return true;
}
private final boolean ends(String s) {
int l = s.length();
int o = k-l+1;
if (o < k0)
return false;
for (int i = 0; i < l; i++)
if (b[o+i] != s.charAt(i))
return false;
j = k-l;
return true;
}
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
k. */
void setto(String s) {
int l = s.length();
int o = j+1;
for (int i = 0; i < l; i++)
b[o+i] = s.charAt(i);
k = j+l;
dirty = true;
}
/* r(s) is used further down. */
void r(String s) { if (m() > 0) setto(s); }
/* step1() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
ties -> ti
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
*/
private final void step1() {
if (b[k] == 's') {
if (ends("sses")) k -= 2;
else if (ends("ies")) setto("i");
else if (b[k-1] != 's') k--;
}
if (ends("eed")) {
if (m() > 0)
k--;
}
else if ((ends("ed") || ends("ing")) && vowelinstem()) {
k = j;
if (ends("at")) setto("ate");
else if (ends("bl")) setto("ble");
else if (ends("iz")) setto("ize");
else if (doublec(k)) {
int ch = b[k--];
if (ch == 'l' || ch == 's' || ch == 'z')
k++;
}
else if (m() == 1 && cvc(k))
setto("e");
}
}
/* step2() turns terminal y to i when there is another vowel in the stem. */
private final void step2() {
if (ends("y") && vowelinstem()) {
b[k] = 'i';
dirty = true;
}
}
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
-ation) maps to -ize etc. note that the string before the suffix must give
m() > 0. */
private final void step3() {
if (k == k0) return; /* For Bug 1 */
switch (b[k-1]) {
case 'a':
if (ends("ational")) { r("ate"); break; }
if (ends("tional")) { r("tion"); break; }
break;
case 'c':
if (ends("enci")) { r("ence"); break; }
if (ends("anci")) { r("ance"); break; }
break;
case 'e':
if (ends("izer")) { r("ize"); break; }
break;
case 'l':
if (ends("bli")) { r("ble"); break; }
if (ends("alli")) { r("al"); break; }
if (ends("entli")) { r("ent"); break; }
if (ends("eli")) { r("e"); break; }
if (ends("ousli")) { r("ous"); break; }
break;
case 'o':
if (ends("ization")) { r("ize"); break; }
if (ends("ation")) { r("ate"); break; }
if (ends("ator")) { r("ate"); break; }
break;
case 's':
if (ends("alism")) { r("al"); break; }
if (ends("iveness")) { r("ive"); break; }
if (ends("fulness")) { r("ful"); break; }
if (ends("ousness")) { r("ous"); break; }
break;
case 't':
if (ends("aliti")) { r("al"); break; }
if (ends("iviti")) { r("ive"); break; }
if (ends("biliti")) { r("ble"); break; }
break;
case 'g':
if (ends("logi")) { r("log"); break; }
}
}
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
private final void step4() {
switch (b[k]) {
case 'e':
if (ends("icate")) { r("ic"); break; }
if (ends("ative")) { r(""); break; }
if (ends("alize")) { r("al"); break; }
break;
case 'i':
if (ends("iciti")) { r("ic"); break; }
break;
case 'l':
if (ends("ical")) { r("ic"); break; }
if (ends("ful")) { r(""); break; }
break;
case 's':
if (ends("ness")) { r(""); break; }
break;
}
}
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
private final void step5() {
if (k == k0) return; /* for Bug 1 */
switch (b[k-1]) {
case 'a':
if (ends("al")) break;
return;
case 'c':
if (ends("ance")) break;
if (ends("ence")) break;
return;
case 'e':
if (ends("er")) break; return;
case 'i':
if (ends("ic")) break; return;
case 'l':
if (ends("able")) break;
if (ends("ible")) break; return;
case 'n':
if (ends("ant")) break;
if (ends("ement")) break;
if (ends("ment")) break;
/* element etc. not stripped before the m */
if (ends("ent")) break;
return;
case 'o':
if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break;
return;
/* takes care of -ous */
case 's':
if (ends("ism")) break;
return;
case 't':
if (ends("ate")) break;
if (ends("iti")) break;
return;
case 'u':
if (ends("ous")) break;
return;
case 'v':
if (ends("ive")) break;
return;
case 'z':
if (ends("ize")) break;
return;
default:
return;
}
if (m() > 1)
k = j;
}
/* step6() removes a final -e if m() > 1. */
private final void step6() {
j = k;
if (b[k] == 'e') {
int a = m();
if (a > 1 || a == 1 && !cvc(k-1))
k--;
}
if (b[k] == 'l' && doublec(k) && m() > 1)
k--;
}
/**
* Stem a word provided as a String. Returns the result as a String.
*/
public String stem(String s) {
if (stem(s.toCharArray(), s.length()))
return toString();
else
return s;
}
/** Stem a word contained in a char[]. Returns true if the stemming process
* resulted in a word different from the input. You can retrieve the
* result with getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] word) {
return stem(word, word.length);
}
/** Stem a word contained in a portion of a char[] array. Returns
* true if the stemming process resulted in a word different from
* the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] wordBuffer, int offset, int wordLen) {
reset();
if (b.length < wordLen) {
char[] new_b = new char[wordLen + EXTRA];
b = new_b;
}
for (int j=0; j<wordLen; j++)
b[j] = wordBuffer[offset+j];
i = wordLen;
return stem(0);
}
/** Stem a word contained in a leading portion of a char[] array.
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem(char[] word, int wordLen) {
return stem(word, 0, wordLen);
}
/** Stem the word placed into the Stemmer buffer through calls to add().
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public boolean stem() {
return stem(0);
}
public boolean stem(int i0) {
k = i - 1;
k0 = i0;
if (k > k0+1) {
step1(); step2(); step3(); step4(); step5(); step6();
}
// Also, a word is considered dirty if we lopped off letters
// Thanks to Ifigenia Vairelles for pointing this out.
if (i != k+1)
dirty = true;
i = k+1;
return dirty;
}
/** Test program for demonstrating the Stemmer. It reads a file and
* stems each word, writing the result to standard out.
* Usage: Stemmer file-name
*/
public static void main(String[] args) {
PorterStemmer s = new PorterStemmer();
for (int i = 0; i < args.length; i++) {
try {
InputStream in = new FileInputStream(args[i]);
byte[] buffer = new byte[1024];
int bufferLen, offset, ch;
bufferLen = in.read(buffer);
offset = 0;
s.reset();
while(true) {
if (offset < bufferLen)
ch = buffer[offset++];
else {
bufferLen = in.read(buffer);
offset = 0;
if (bufferLen < 0)
ch = -1;
else
ch = buffer[offset++];
}
if (Character.isLetter((char) ch)) {
s.add(Character.toLowerCase((char) ch));
}
else {
s.stem();
System.out.print(s.toString());
s.reset();
if (ch < 0)
break;
else {
System.out.print((char) ch);
}
}
}
in.close();
}
catch (IOException e) {
System.out.println("error reading " + args[i]);
}
}
}
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
public final class SimpleAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
}
}

View File

@ -0,0 +1,90 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
import java.util.Hashtable;
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
public final class StopAnalyzer extends Analyzer {
private Hashtable stopTable;
/** An array containing some common English words that are not usually useful
for searching. */
public static final String[] ENGLISH_STOP_WORDS = {
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
public StopAnalyzer() {
stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
}
/** Builds an analyzer which removes words in the provided array. */
public StopAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords);
}
/** Filters LowerCaseTokenizer with StopFilter. */
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
}
}

View File

@ -0,0 +1,99 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Hashtable;
/** Removes stop words from a token stream. */
public final class StopFilter extends TokenFilter {
private Hashtable table;
/** Constructs a filter which removes words from the input
TokenStream that are named in the array of words. */
public StopFilter(TokenStream in, String[] stopWords) {
input = in;
table = makeStopTable(stopWords);
}
/** Constructs a filter which removes words from the input
TokenStream that are named in the Hashtable. */
public StopFilter(TokenStream in, Hashtable stopTable) {
input = in;
table = stopTable;
}
/** Builds a Hashtable from an array of stop words, appropriate for passing
into the StopFilter constructor. This permits this table construction to
be cached once when an Analyzer is constructed. */
public final static Hashtable makeStopTable(String[] stopWords) {
Hashtable stopTable = new Hashtable(stopWords.length);
for (int i = 0; i < stopWords.length; i++)
stopTable.put(stopWords[i], stopWords[i]);
return stopTable;
}
/** Returns the next input Token whose termText() is not a stop word. */
public final Token next() throws IOException {
// return the first non-stop word found
for (Token token = input.next(); token != null; token = input.next())
if (table.get(token.termText) == null)
return token;
// reached EOS -- return null
return null;
}
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/** A Token is an occurence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
The type is an interned string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word". */
public final class Token {
String termText; // the text of the term
int startOffset; // start in source text
int endOffset; // end in source text
String type = "word"; // lexical type
/** Constructs a Token with the given term text, and start & end offsets.
The type defaults to "word." */
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
endOffset = end;
}
/** Constructs a Token with the given text, start and end offsets, & type. */
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
endOffset = end;
type = typ;
}
/** Returns the Token's term text. */
public final String termText() { return termText; }
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Note that the difference between endOffset() and startOffset() may not be
equal to termText.length(), as the term text may have been altered by a
stemmer or some other filter. */
public final int startOffset() { return startOffset; }
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. */
public final int endOffset() { return endOffset; }
/** Returns this Token's lexical type. Defaults to "word". */
public final String type() { return type; }
}

View File

@ -0,0 +1,74 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
*/
abstract public class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
protected TokenStream input;
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
}
}

View File

@ -0,0 +1,77 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
<p>
This is an abstract class. Concrete subclasses are:
<ul>
<li>{@link Tokenizer}, a TokenStream
whose input is a Reader; and
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
*/
abstract public class TokenStream {
/** Returns the next token in the stream, or null at EOS. */
abstract public Token next() throws IOException;
/** Releases resources associated with this stream. */
public void close() throws IOException {}
}

View File

@ -0,0 +1,74 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
import java.io.IOException;
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
*/
abstract public class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
protected Reader input;
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();
}
}

View File

@ -0,0 +1,10 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doug Cutting">
</head>
<body>
API and code to convert text into indexable tokens.
</body>
</html>

View File

@ -0,0 +1,6 @@
Token.java
StandardTokenizer.java
StandardTokenizerTokenManager.java
TokenMgrError.java
CharStream.java
StandardTokenizerConstants.java

View File

@ -0,0 +1,159 @@
// FastCharStream.java
package org.apache.lucene.analysis.standard;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
* this does not do line-number counting, but instead keeps track of the
* character position of the token in the input, as required by Lucene's {@link
* org.apache.lucene.analysis.Token} API. */
public final class FastCharStream implements CharStream {
char[] buffer = null;
int bufferLength = 0; // end of valid chars
int bufferPosition = 0; // next char to read
int tokenStart = 0; // offset in buffer
int bufferStart = 0; // position in file of buffer
Reader input; // source of chars
/** Constructs from a Reader. */
public FastCharStream(Reader r) {
input = r;
}
public final char readChar() throws IOException {
if (bufferPosition >= bufferLength)
refill();
return buffer[bufferPosition++];
}
private final void refill() throws IOException {
int newPosition = bufferLength - tokenStart;
if (tokenStart == 0) { // token won't fit in buffer
if (buffer == null) { // first time: alloc buffer
buffer = new char[2048];
} else if (bufferLength == buffer.length) { // grow buffer
char[] newBuffer = new char[buffer.length*2];
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
buffer = newBuffer;
}
} else { // shift token to front
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
}
bufferLength = newPosition; // update state
bufferPosition = newPosition;
bufferStart += tokenStart;
tokenStart = 0;
int charsRead = // fill space in buffer
input.read(buffer, newPosition, buffer.length-newPosition);
if (charsRead == -1)
throw new IOException("read past eof");
else
bufferLength += charsRead;
}
public final char BeginToken() throws IOException {
tokenStart = bufferPosition;
return readChar();
}
public final void backup(int amount) {
bufferPosition -= amount;
}
public final String GetImage() {
return new String(buffer, tokenStart, bufferPosition - tokenStart);
}
public final char[] GetSuffix(int len) {
char[] value = new char[len];
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
return value;
}
public final void Done() {
try {
input.close();
} catch (IOException e) {
System.err.println("Caught: " + e + "; ignoring.");
}
}
public final int getColumn() {
return bufferStart + bufferPosition;
}
public final int getLine() {
return 1;
}
public final int getEndColumn() {
return bufferStart + bufferPosition;
}
public final int getEndLine() {
return 1;
}
public final int getBeginColumn() {
return bufferStart + tokenStart;
}
public final int getBeginLine() {
return 1;
}
}

View File

@ -0,0 +1,7 @@
ROOT = ../../../..
include ../../rules.mk
# Don't delete ParseException.java -- we've changed it by hand.
DIRT := $(patsubst ParseException.java,,${DIRT})

View File

@ -0,0 +1,191 @@
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 0.7pre6 */
package org.apache.lucene.analysis.standard;
/**
* This exception is thrown when parse errors are encountered.
* You can explicitly create objects of this exception type by
* calling the method generateParseException in the generated
* parser.
*
* You can modify this class to customize your error reporting
* mechanisms so long as you retain the public fields.
*/
public class ParseException extends java.io.IOException {
/**
* This constructor is used by the method "generateParseException"
* in the generated parser. Calling this constructor generates
* a new object of this type with the fields "currentToken",
* "expectedTokenSequences", and "tokenImage" set. The boolean
* flag "specialConstructor" is also set to true to indicate that
* this constructor was used to create this object.
* This constructor calls its super class with the empty string
* to force the "toString" method of parent class "Throwable" to
* print the error message in the form:
* ParseException: <result of getMessage>
*/
public ParseException(Token currentTokenVal,
int[][] expectedTokenSequencesVal,
String[] tokenImageVal
)
{
super("");
specialConstructor = true;
currentToken = currentTokenVal;
expectedTokenSequences = expectedTokenSequencesVal;
tokenImage = tokenImageVal;
}
/**
* The following constructors are for use by you for whatever
* purpose you can think of. Constructing the exception in this
* manner makes the exception behave in the normal way - i.e., as
* documented in the class "Throwable". The fields "errorToken",
* "expectedTokenSequences", and "tokenImage" do not contain
* relevant information. The JavaCC generated code does not use
* these constructors.
*/
public ParseException() {
super();
specialConstructor = false;
}
public ParseException(String message) {
super(message);
specialConstructor = false;
}
/**
* This variable determines which constructor was used to create
* this object and thereby affects the semantics of the
* "getMessage" method (see below).
*/
protected boolean specialConstructor;
/**
* This is the last token that has been consumed successfully. If
* this object has been created due to a parse error, the token
* followng this token will (therefore) be the first error token.
*/
public Token currentToken;
/**
* Each entry in this array is an array of integers. Each array
* of integers represents a sequence of tokens (by their ordinal
* values) that is expected at this point of the parse.
*/
public int[][] expectedTokenSequences;
/**
* This is a reference to the "tokenImage" array of the generated
* parser within which the parse error occurred. This array is
* defined in the generated ...Constants interface.
*/
public String[] tokenImage;
/**
* This method has the standard behavior when this object has been
* created using the standard constructors. Otherwise, it uses
* "currentToken" and "expectedTokenSequences" to generate a parse
* error message and returns it. If this object has been created
* due to a parse error, and you do not catch it (it gets thrown
* from the parser), then this method is called during the printing
* of the final stack trace, and hence the correct error message
* gets displayed.
*/
public String getMessage() {
if (!specialConstructor) {
return super.getMessage();
}
String expected = "";
int maxSize = 0;
for (int i = 0; i < expectedTokenSequences.length; i++) {
if (maxSize < expectedTokenSequences[i].length) {
maxSize = expectedTokenSequences[i].length;
}
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
expected += tokenImage[expectedTokenSequences[i][j]] + " ";
}
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
expected += "...";
}
expected += eol + " ";
}
String retval = "Encountered \"";
Token tok = currentToken.next;
for (int i = 0; i < maxSize; i++) {
if (i != 0) retval += " ";
if (tok.kind == 0) {
retval += tokenImage[0];
break;
}
retval += add_escapes(tok.image);
tok = tok.next;
}
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn + "." + eol;
if (expectedTokenSequences.length == 1) {
retval += "Was expecting:" + eol + " ";
} else {
retval += "Was expecting one of:" + eol + " ";
}
retval += expected;
return retval;
}
/**
* The end of line string for this machine.
*/
protected String eol = System.getProperty("line.separator", "\n");
/**
* Used to convert raw characters to their escaped version
* when these raw version cannot be used as part of an ASCII
* string literal.
*/
protected String add_escapes(String str) {
StringBuffer retval = new StringBuffer();
char ch;
for (int i = 0; i < str.length(); i++) {
switch (str.charAt(i))
{
case 0 :
continue;
case '\b':
retval.append("\\b");
continue;
case '\t':
retval.append("\\t");
continue;
case '\n':
retval.append("\\n");
continue;
case '\f':
retval.append("\\f");
continue;
case '\r':
retval.append("\\r");
continue;
case '\"':
retval.append("\\\"");
continue;
case '\'':
retval.append("\\\'");
continue;
case '\\':
retval.append("\\\\");
continue;
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}
continue;
}
}
return retval.toString();
}
}

View File

@ -0,0 +1,95 @@
package org.apache.lucene.analysis.standard;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.*;
import java.io.Reader;
import java.util.Hashtable;
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}. */
public final class StandardAnalyzer extends Analyzer {
private Hashtable stopTable;
/** An array containing some common English words that are not usually useful
for searching. */
public static final String[] STOP_WORDS = {
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
/** Builds an analyzer. */
public StandardAnalyzer() {
this(STOP_WORDS);
}
/** Builds an analyzer with the given stop words. */
public StandardAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords);
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
* StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopTable);
return result;
}
}

View File

@ -0,0 +1,106 @@
package org.apache.lucene.analysis.standard;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.*;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
public final class StandardFilter extends TokenFilter
implements StandardTokenizerConstants {
/** Construct filtering <i>in</i>. */
public StandardFilter(TokenStream in) {
input = in;
}
private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];
private static final String ACRONYM_TYPE = tokenImage[ACRONYM];
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
org.apache.lucene.analysis.Token t = input.next();
if (t == null)
return null;
String text = t.termText();
String type = t.type();
if (type == APOSTROPHE_TYPE && // remove 's
(text.endsWith("'s") || text.endsWith("'S"))) {
return new org.apache.lucene.analysis.Token
(text.substring(0,text.length()-2),
t.startOffset(), t.endOffset(), type);
} else if (type == ACRONYM_TYPE) { // remove dots
StringBuffer trimmed = new StringBuffer();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c != '.')
trimmed.append(c);
}
return new org.apache.lucene.analysis.Token
(trimmed.toString(), t.startOffset(), t.endOffset(), type);
} else {
return t;
}
}
}

View File

@ -0,0 +1,197 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
options {
STATIC = false;
//IGNORE_CASE = true;
//BUILD_PARSER = false;
//UNICODE_INPUT = true;
USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(StandardTokenizer)
package org.apache.lucene.analysis.standard;
import java.io.*;
/** A grammar-based tokenizer constructed with JavaCC.
*
* <p> This should be a good tokenizer for most European-language documents.
*
* <p>Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*/
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
/** Constructs a tokenizer for this Reader. */
public StandardTokenizer(Reader reader) {
this(new FastCharStream(reader));
this.input = reader;
}
}
PARSER_END(StandardTokenizer)
TOKEN : { // token patterns
// basic word: a sequence of digits & letters
<ALPHANUM: (<LETTER>|<DIGIT>)+ >
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possesives
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
// company names like AT&T and Excite@Home.
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
// email addresses
| <EMAIL: <ALPHANUM> "@" <ALPHANUM> ("." <ALPHANUM>)+ >
// hostname
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
| <HAS_DIGIT> <P> <ALPHANUM>
| <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
| <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
| <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
| <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
)
>
| <#P: ("_"|"-"|"/"|"."|",") >
| <#HAS_DIGIT: // at least one digit
(<LETTER>|<DIGIT>)*
<DIGIT>
(<LETTER>|<DIGIT>)*
>
| < #ALPHA: (<LETTER>)+>
| < #LETTER: // unicode letters
[
"\u0041"-"\u005a",
"\u0061"-"\u007a",
"\u00c0"-"\u00d6",
"\u00d8"-"\u00f6",
"\u00f8"-"\u00ff",
"\u0100"-"\u1fff",
"\u3040"-"\u318f",
"\u3300"-"\u337f",
"\u3400"-"\u3d2d",
"\u4e00"-"\u9fff",
"\uf900"-"\ufaff"
]
>
| < #DIGIT: // unicode digits
[
"\u0030"-"\u0039",
"\u0660"-"\u0669",
"\u06f0"-"\u06f9",
"\u0966"-"\u096f",
"\u09e6"-"\u09ef",
"\u0a66"-"\u0a6f",
"\u0ae6"-"\u0aef",
"\u0b66"-"\u0b6f",
"\u0be7"-"\u0bef",
"\u0c66"-"\u0c6f",
"\u0ce6"-"\u0cef",
"\u0d66"-"\u0d6f",
"\u0e50"-"\u0e59",
"\u0ed0"-"\u0ed9",
"\u1040"-"\u1049"
]
>
}
SKIP : { // skip unrecognized chars
<NOISE: ~[] >
}
/** Returns the next token in the stream, or null at EOS.
* <p>The returned token's type is set to an element of {@link
* StandardTokenizerConstants.tokenImage}.
*/
org.apache.lucene.analysis.Token next() throws IOException :
{
Token token = null;
}
{
( token = <ALPHANUM> |
token = <APOSTROPHE> |
token = <ACRONYM> |
token = <COMPANY> |
token = <EMAIL> |
token = <HOST> |
token = <NUM> |
token = <EOF>
)
{
if (token.kind == EOF) {
return null;
} else {
return
new org.apache.lucene.analysis.Token(token.image,
token.beginColumn,token.endColumn,
tokenImage[token.kind]);
}
}
}

View File

@ -0,0 +1,15 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doug Cutting">
</head>
<body>
A grammar-based tokenizer constructed with JavaCC.
<p>Note that JavaCC defines lots of public, classes, methods and fields
that do not need to be public.&nbsp; These clutter the documentation.&nbsp;
Sorry.
<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>org.apache.lucene.analysis.Token</tt>
must always be fully qualified in sourced code in this package.
</body>
</html>

View File

@ -0,0 +1,109 @@
package org.apache.lucene.document;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Date;
/** Provides support for converting dates to strings and vice-versa. The
* strings are structured so that lexicographic sorting orders by date. This
* makes them suitable for use as field values and search terms. */
public class DateField {
private DateField() {};
// make date strings long enough to last a millenium
private static int DATE_LEN = Long.toString(1000L*365*24*60*60*1000,
Character.MAX_RADIX).length();
public static String MIN_DATE_STRING() {
return timeToString(0);
}
public static String MAX_DATE_STRING() {
char[] buffer = new char[DATE_LEN];
char c = Character.forDigit(Character.MAX_RADIX-1, Character.MAX_RADIX);
for (int i = 0 ; i < DATE_LEN; i++)
buffer[i] = c;
return new String(buffer);
}
/** Converts a Date to a string suitable for indexing. */
public static String dateToString(Date date) {
return timeToString(date.getTime());
}
/** Converts a millisecond time to a string suitable for indexing. */
public static String timeToString(long time) {
if (time < 0)
throw new RuntimeException("time too early");
String s = Long.toString(time, Character.MAX_RADIX);
if (s.length() > DATE_LEN)
throw new RuntimeException("time too late");
while (s.length() < DATE_LEN)
s = "0" + s; // pad with leading zeros
return s;
}
/** Converts a string-encoded date into a millisecond time. */
public static long stringToTime(String s) {
return Long.parseLong(s, Character.MAX_RADIX);
}
/** Converts a string-encoded date into a Date object. */
public static Date stringToDate(String s) {
return new Date(stringToTime(s));
}
}

View File

@ -0,0 +1,145 @@
package org.apache.lucene.document;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Enumeration;
/** Documents are the unit of indexing and search.
*
* A Document is a set of fields. Each field has a name and a textual value.
* A field may be stored with the document, in which case it is returned with
* search hits on the document. Thus each document should typically contain
* stored fields which uniquely identify it.
* */
public final class Document {
DocumentFieldList fieldList = null;
/** Constructs a new document with no fields. */
public Document() {}
/** Adds a field to a document. Several fields may be added with
* the same name. In this case, if the fields are indexed, their text is
* treated as though appended for the purposes of search. */
public final void add(Field field) {
fieldList = new DocumentFieldList(field, fieldList);
}
/** Returns a field with the given name if any exist in this document, or
null. If multiple fields may exist with this name, this method returns the
last added such added. */
public final Field getField(String name) {
for (DocumentFieldList list = fieldList; list != null; list = list.next)
if (list.field.name().equals(name))
return list.field;
return null;
}
/** Returns the string value of the field with the given name if any exist in
this document, or null. If multiple fields may exist with this name, this
method returns the last added such added. */
public final String get(String name) {
Field field = getField(name);
if (field != null)
return field.stringValue();
else
return null;
}
/** Returns an Enumeration of all the fields in a document. */
public final Enumeration fields() {
return new DocumentFieldEnumeration(this);
}
/** Prints the fields of a document for human consumption. */
public final String toString() {
StringBuffer buffer = new StringBuffer();
buffer.append("Document<");
for (DocumentFieldList list = fieldList; list != null; list = list.next) {
buffer.append(list.field.toString());
if (list.next != null)
buffer.append(" ");
}
buffer.append(">");
return buffer.toString();
}
}
final class DocumentFieldList {
DocumentFieldList(Field f, DocumentFieldList n) {
field = f;
next = n;
}
Field field;
DocumentFieldList next;
}
final class DocumentFieldEnumeration implements Enumeration {
DocumentFieldList fields;
DocumentFieldEnumeration(Document d) {
fields = d.fieldList;
}
public final boolean hasMoreElements() {
return fields == null ? false : true;
}
public final Object nextElement() {
Field result = fields.field;
fields = fields.next;
return result;
}
}

View File

@ -0,0 +1,169 @@
package org.apache.lucene.document;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/**
A field is a section of a Document. Each field has two parts, a name and a
value. Values may be free text, provided as a String or as a Reader, or they
may be atomic keywords, which are not further processed. Such keywords may
be used to represent dates, urls, etc. Fields are optionally stored in the
index, so that they may be returned with hits on the document.
*/
public final class Field {
private String name = "body";
private String stringValue = null;
private Reader readerValue = null;
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
/** Constructs a String-valued Field that is not tokenized, but is indexed
and stored. Useful for non-text fields, e.g. date or url. */
public static final Field Keyword(String name, String value) {
return new Field(name, value, true, true, false);
}
/** Constructs a String-valued Field that is not tokenized or indexed,
but is stored in the index, for return with hits. */
public static final Field UnIndexed(String name, String value) {
return new Field(name, value, true, false, false);
}
/** Constructs a String-valued Field that is tokenized and indexed,
and is stored in the index, for return with hits. Useful for short text
fields, like "title" or "subject". */
public static final Field Text(String name, String value) {
return new Field(name, value, true, true, true);
}
/** Constructs a String-valued Field that is tokenized and indexed,
but that is not stored in the index. */
public static final Field UnStored(String name, String value) {
return new Field(name, value, false, true, true);
}
/** Constructs a Reader-valued Field that is tokenized and indexed, but is
not stored in the index verbatim. Useful for longer text fields, like
"body". */
public static final Field Text(String name, Reader value) {
return new Field(name, value);
}
/** The name of the field (e.g., "date", "subject", "title", "body", etc.)
as an interned string. */
public String name() { return name; }
/** The value of the field as a String, or null. If null, the Reader value
is used. Exactly one of stringValue() and readerValue() must be set. */
public String stringValue() { return stringValue; }
/** The value of the field as a Reader, or null. If null, the String value
is used. Exactly one of stringValue() and readerValue() must be set. */
public Reader readerValue() { return readerValue; }
public Field(String name, String string,
boolean store, boolean index, boolean token) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (string == null)
throw new IllegalArgumentException("value cannot be null");
this.name = name.intern(); // field names are interned
this.stringValue = string;
this.isStored = store;
this.isIndexed = index;
this.isTokenized = token;
}
Field(String name, Reader reader) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (reader == null)
throw new IllegalArgumentException("value cannot be null");
this.name = name.intern(); // field names are interned
this.readerValue = reader;
}
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is
Reader-valued. */
public final boolean isStored() { return isStored; }
/** True iff the value of the field is to be indexed, so that it may be
searched on. */
public final boolean isIndexed() { return isIndexed; }
/** True iff the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
public final boolean isTokenized() { return isTokenized; }
/** Prints a Field for human consumption. */
public final String toString() {
if (isStored && isIndexed && !isTokenized)
return "Keyword<" + name + ":" + stringValue + ">";
else if (isStored && !isIndexed && !isTokenized)
return "Unindexed<" + name + ":" + stringValue + ">";
else if (isStored && isIndexed && isTokenized && stringValue!=null)
return "Text<" + name + ":" + stringValue + ">";
else if (!isStored && isIndexed && isTokenized && readerValue!=null)
return "Text<" + name + ":" + readerValue + ">";
else
return super.toString();
}
}

View File

@ -0,0 +1,2 @@
# sub-directory makefile for lucene
include ../rules.mk

View File

@ -0,0 +1,10 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doug Cutting">
</head>
<body>
The Document abstraction.
</body>
</html>

View File

@ -0,0 +1,336 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Hashtable;
import java.util.Enumeration;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.search.Similarity;
final class DocumentWriter {
private Analyzer analyzer;
private Directory directory;
private FieldInfos fieldInfos;
private int maxFieldLength;
DocumentWriter(Directory d, Analyzer a, int mfl) {
directory = d;
analyzer = a;
maxFieldLength = mfl;
}
final void addDocument(String segment, Document doc)
throws IOException {
// write field names
fieldInfos = new FieldInfos();
fieldInfos.add(doc);
fieldInfos.write(directory, segment + ".fnm");
// write field values
FieldsWriter fieldsWriter =
new FieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.addDocument(doc);
} finally {
fieldsWriter.close();
}
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
invertDocument(doc);
// sort postingTable into an array
Posting[] postings = sortPostingTable();
/*
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
System.out.print(posting.term);
System.out.print(" freq=" + posting.freq);
System.out.print(" pos=");
System.out.print(posting.positions[0]);
for (int j = 1; j < posting.freq; j++)
System.out.print("," + posting.positions[j]);
System.out.println("");
}
*/
// write postings
writePostings(postings, segment);
// write norms of indexed fields
writeNorms(doc, segment);
}
// Keys are Terms, values are Postings.
// Used to buffer a document before it is written to the index.
private final Hashtable postingTable = new Hashtable();
private int[] fieldLengths;
// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
throws IOException {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
String fieldName = field.name();
int fieldNumber = fieldInfos.fieldNumber(fieldName);
int position = fieldLengths[fieldNumber]; // position in field
if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field
addPosition(fieldName, field.stringValue(), position++);
} else {
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
else if (field.stringValue() != null)
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
for (Token t = stream.next(); t != null; t = stream.next()) {
addPosition(fieldName, t.termText(), position++);
if (position > maxFieldLength) break;
}
} finally {
stream.close();
}
}
fieldLengths[fieldNumber] = position; // save field length
}
}
}
private final Term termBuffer = new Term("", ""); // avoid consing
private final void addPosition(String field, String text, int position) {
termBuffer.set(field, text);
Posting ti = (Posting)postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
if (ti.positions.length == freq) { // positions array is full
int[] newPositions = new int[freq * 2]; // double size
int[] positions = ti.positions;
for (int i = 0; i < freq; i++) // copy old positions to new
newPositions[i] = positions[i];
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
ti.freq = freq + 1; // update frequency
}
else { // word not seen before
Term term = new Term(field, text, false);
postingTable.put(term, new Posting(term, position));
}
}
private final Posting[] sortPostingTable() {
// copy postingTable into an array
Posting[] array = new Posting[postingTable.size()];
Enumeration postings = postingTable.elements();
for (int i = 0; postings.hasMoreElements(); i++)
array[i] = (Posting)postings.nextElement();
// sort the array
quickSort(array, 0, array.length - 1);
return array;
}
static private final void quickSort(Posting[] postings, int lo, int hi) {
if(lo >= hi)
return;
int mid = (lo + hi) / 2;
if(postings[lo].term.compareTo(postings[mid].term) > 0) {
Posting tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if(postings[mid].term.compareTo(postings[hi].term) > 0) {
Posting tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if(postings[lo].term.compareTo(postings[mid].term) > 0) {
Posting tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return;
Term partition = postings[mid].term;
for( ;; ) {
while(postings[right].term.compareTo(partition) > 0)
--right;
while(left < right && postings[left].term.compareTo(partition) <= 0)
++left;
if(left < right) {
Posting tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
} else {
break;
}
}
quickSort(postings, lo, left);
quickSort(postings, left + 1, hi);
}
private final void writePostings(Posting[] postings, String segment)
throws IOException {
OutputStream freq = null, prox = null;
TermInfosWriter tis = null;
try {
freq = directory.createFile(segment + ".frq");
prox = directory.createFile(segment + ".prx");
tis = new TermInfosWriter(directory, segment, fieldInfos);
TermInfo ti = new TermInfo();
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
// add an entry to the dictionary with pointers to prox and freq files
ti.set(1, freq.getFilePointer(), prox.getFilePointer());
tis.add(posting.term, ti);
// add an entry to the freq file
int f = posting.freq;
if (f == 1) // optimize freq=1
freq.writeVInt(1); // set low bit of doc num.
else {
freq.writeVInt(0); // the document number
freq.writeVInt(f); // frequency in doc
}
int lastPosition = 0; // write positions
int[] positions = posting.positions;
for (int j = 0; j < f; j++) { // use delta-encoding
int position = positions[j];
prox.writeVInt(position - lastPosition);
lastPosition = position;
}
}
}
finally {
if (freq != null) freq.close();
if (prox != null) prox.close();
if (tis != null) tis.close();
}
}
private final void writeNorms(Document doc, String segment)
throws IOException {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
if (field.isIndexed()) {
int fieldNumber = fieldInfos.fieldNumber(field.name());
OutputStream norm = directory.createFile(segment + ".f" + fieldNumber);
try {
norm.writeByte(Similarity.norm(fieldLengths[fieldNumber]));
} finally {
norm.close();
}
}
}
}
}
final class Posting { // info about a Term in a doc
Term term; // the Term
int freq; // its frequency in doc
int[] positions; // positions it occurs at
Posting(Term t, int position) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
final class FieldInfo {
String name;
boolean isIndexed;
int number;
FieldInfo(String na, boolean tk, int nu) {
name = na;
isIndexed = tk;
number = nu;
}
}

View File

@ -0,0 +1,167 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Hashtable;
import java.util.Vector;
import java.util.Enumeration;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.InputStream;
final class FieldInfos {
private Vector byNumber = new Vector();
private Hashtable byName = new Hashtable();
FieldInfos() {
add("", false);
}
FieldInfos(Directory d, String name) throws IOException {
InputStream input = d.openFile(name);
try {
read(input);
} finally {
input.close();
}
}
/** Adds field info for a Document. */
final void add(Document doc) {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
add(field.name(), field.isIndexed());
}
}
/** Merges in information from another FieldInfos. */
final void add(FieldInfos other) {
for (int i = 0; i < other.size(); i++) {
FieldInfo fi = other.fieldInfo(i);
add(fi.name, fi.isIndexed);
}
}
private final void add(String name, boolean isIndexed) {
FieldInfo fi = fieldInfo(name);
if (fi == null)
addInternal(name, isIndexed);
else if (fi.isIndexed != isIndexed)
throw new IllegalStateException("field " + name +
(fi.isIndexed ? " must" : " cannot") +
" be an indexed field.");
}
private final void addInternal(String name, boolean isIndexed) {
FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size());
byNumber.addElement(fi);
byName.put(name, fi);
}
final int fieldNumber(String fieldName) {
FieldInfo fi = fieldInfo(fieldName);
if (fi != null)
return fi.number;
else
return -1;
}
final FieldInfo fieldInfo(String fieldName) {
return (FieldInfo)byName.get(fieldName);
}
final String fieldName(int fieldNumber) {
return fieldInfo(fieldNumber).name;
}
final FieldInfo fieldInfo(int fieldNumber) {
return (FieldInfo)byNumber.elementAt(fieldNumber);
}
final int size() {
return byNumber.size();
}
final void write(Directory d, String name) throws IOException {
OutputStream output = d.createFile(name);
try {
write(output);
} finally {
output.close();
}
}
final void write(OutputStream output) throws IOException {
output.writeVInt(size());
for (int i = 0; i < size(); i++) {
FieldInfo fi = fieldInfo(i);
output.writeString(fi.name);
output.writeByte((byte)(fi.isIndexed ? 1 : 0));
}
}
private final void read(InputStream input) throws IOException {
int size = input.readVInt();
for (int i = 0; i < size; i++)
addInternal(input.readString().intern(),
input.readByte() != 0);
}
}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Enumeration;
import java.util.Hashtable;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
final class FieldsReader {
private FieldInfos fieldInfos;
private InputStream fieldsStream;
private InputStream indexStream;
private int size;
FieldsReader(Directory d, String segment, FieldInfos fn)
throws IOException {
fieldInfos = fn;
fieldsStream = d.openFile(segment + ".fdt");
indexStream = d.openFile(segment + ".fdx");
size = (int)indexStream.length() / 8;
}
final void close() throws IOException {
fieldsStream.close();
indexStream.close();
}
final int size() {
return size;
}
final Document doc(int n) throws IOException {
indexStream.seek(n * 8L);
long position = indexStream.readLong();
fieldsStream.seek(position);
Document doc = new Document();
int numFields = fieldsStream.readVInt();
for (int i = 0; i < numFields; i++) {
int fieldNumber = fieldsStream.readVInt();
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
byte bits = fieldsStream.readByte();
doc.add(new Field(fi.name, // name
fieldsStream.readString(), // read value
true, // stored
fi.isIndexed, // indexed
(bits & 1) != 0)); // tokenized
}
return doc;
}
}

View File

@ -0,0 +1,110 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Enumeration;
import java.util.Hashtable;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
final class FieldsWriter {
private FieldInfos fieldInfos;
private OutputStream fieldsStream;
private OutputStream indexStream;
FieldsWriter(Directory d, String segment, FieldInfos fn)
throws IOException {
fieldInfos = fn;
fieldsStream = d.createFile(segment + ".fdt");
indexStream = d.createFile(segment + ".fdx");
}
final void close() throws IOException {
fieldsStream.close();
indexStream.close();
}
final void addDocument(Document doc) throws IOException {
indexStream.writeLong(fieldsStream.getFilePointer());
int storedCount = 0;
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
if (field.isStored())
storedCount++;
}
fieldsStream.writeVInt(storedCount);
fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
if (field.isStored()) {
fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
byte bits = 0;
if (field.isTokenized())
bits |= 1;
fieldsStream.writeByte(bits);
fieldsStream.writeString(field.stringValue());
}
}
}
}

View File

@ -0,0 +1,215 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Document;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
so that any subclass which implements it is searchable.
<p> Concrete subclasses of IndexReader are usually constructed with a call to
the static method {@link #open}.
<p> For efficiency, in this API documents are often referred to via
<it>document numbers</it>, non-negative integers which each name a unique
document in the index. These document numbers are ephemeral--they may change
as documents are added to and deleted from an index. Clients should thus not
rely on a given document having the same number between sessions. */
abstract public class IndexReader {
protected IndexReader() {};
/** Returns an IndexReader reading the index in an FSDirectory in the named
path. */
public static IndexReader open(String path) throws IOException {
return open(FSDirectory.getDirectory(path, false));
}
/** Returns an IndexReader reading the index in an FSDirectory in the named
path. */
public static IndexReader open(File path) throws IOException {
return open(FSDirectory.getDirectory(path, false));
}
/** Returns an IndexReader reading the index in the given Directory. */
public static IndexReader open(Directory directory) throws IOException {
synchronized (directory) {
SegmentInfos infos = new SegmentInfos();
infos.read(directory);
if (infos.size() == 1) // index is optimized
return new SegmentReader(infos.info(0), true);
SegmentReader[] readers = new SegmentReader[infos.size()];
for (int i = 0; i < infos.size(); i++)
readers[i] = new SegmentReader(infos.info(i), i == infos.size() - 1);
return new SegmentsReader(readers);
}
}
/** Returns the time the index in the named directory was last modified. */
public static long lastModified(String directory) throws IOException {
return lastModified(new File(directory));
}
/** Returns the time the index in the named directory was last modified. */
public static long lastModified(File directory) throws IOException {
return FSDirectory.fileModified(directory, "segments");
}
/** Returns the time the index in this directory was last modified. */
public static long lastModified(Directory directory) throws IOException {
return directory.fileModified("segments");
}
/** Returns the number of documents in this index. */
abstract public int numDocs();
/** Returns one greater than the largest possible document number.
This may be used to, e.g., determine how big to allocate an array which
will have an element for every document number in an index.
*/
abstract public int maxDoc();
/** Returns the stored fields of the <code>n</code><sup>th</sup>
<code>Document</code> in this index. */
abstract public Document document(int n) throws IOException;
/** Returns true if document <i>n</i> has been deleted */
abstract public boolean isDeleted(int n);
/** Returns the byte-encoded normalization factor for the named field of
every document. This is used by the search code to score documents.
@see org.apache.lucene.search.Similarity#norm
*/
abstract public byte[] norms(String field) throws IOException;
/** Returns an enumeration of all the terms in the index.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
*/
abstract public TermEnum terms() throws IOException;
/** Returns an enumeration of all terms after a given term.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
*/
abstract public TermEnum terms(Term t) throws IOException;
/** Returns the number of documents containing the term <code>t</code>. */
abstract public int docFreq(Term t) throws IOException;
/** Returns an enumeration of all the documents which contain
<code>Term</code>. For each document, the document number, the frequency of
the term in that document is also provided, for use in search scoring.
Thus, this method implements the mapping:
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
</ul>
<p>The enumeration is ordered by document number. Each document number
is greater than all that precede it in the enumeration. */
abstract public TermDocs termDocs(Term t) throws IOException;
/** Returns an enumeration of all the documents which contain
<code>Term</code>. For each document, in addition to the document number
and frequency of the term in that document, a list of all of the ordinal
positions of the term in the document is available. Thus, this method
implements the mapping:
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
&lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
pos<sub>freq-1</sub>&gt;
&gt;<sup>*</sup>
</ul>
<p> This positional information faciliates phrase and proximity searching.
<p>The enumeration is ordered by document number. Each document number is
greater than all that precede it in the enumeration. */
abstract public TermPositions termPositions(Term t) throws IOException;
/** Deletes the document numbered <code>docNum</code>. Once a document is
deleted it will not appear in TermDocs or TermPostitions enumerations.
Attempts to read its field with the {@link #document}
method will result in an error. The presence of this document may still be
reflected in the {@link #docFreq} statistic, though
this will be corrected eventually as the index is further modified. */
abstract public void delete(int docNum) throws IOException;
/** Deletes all documents containing <code>term</code>.
This is useful if one uses a document field to hold a unique ID string for
the document. Then to delete such a document, one merely constructs a
term with the appropriate field and the unique ID string as its text and
passes it to this method. Returns the number of documents deleted. */
public final int delete(Term term) throws IOException {
TermDocs docs = termDocs(term);
if ( docs == null ) return 0;
int n = 0;
try {
while (docs.next()) {
delete(docs.doc());
n++;
}
} finally {
docs.close();
}
return n;
}
/** Closes files associated with this index.
Also saves any new deletions to disk.
No other methods should be called after this has been called. */
abstract public void close() throws IOException;
}

View File

@ -0,0 +1,385 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.io.File;
import java.io.PrintStream;
import java.util.Vector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.Analyzer;
/**
An IndexWriter creates and maintains an index.
The third argument to the <a href="#IndexWriter"><b>constructor</b></a>
determines whether a new index is created, or whether an existing index is
opened for the addition of new documents.
In either case, documents are added with the <a
href="#addDocument"><b>addDocument</b></a> method. When finished adding
documents, <a href="#close"><b>close</b></a> should be called.
If an index will not have more documents added for a while and optimal search
performance is desired, then the <a href="#optimize"><b>optimize</b></a>
method should be called before the index is closed.
*/
public final class IndexWriter {
private Directory directory; // where this index resides
private Analyzer analyzer; // how to analyze text
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
private final Directory ramDirectory = new RAMDirectory(); // for temp segs
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
be analyzed with <code>a</code>. If <code>create</code> is true, then a
new, empty index will be created in <code>d</code>, replacing the index
already there, if any. */
public IndexWriter(String path, Analyzer a, boolean create)
throws IOException {
this(FSDirectory.getDirectory(path, create), a, create);
}
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
be analyzed with <code>a</code>. If <code>create</code> is true, then a
new, empty index will be created in <code>d</code>, replacing the index
already there, if any. */
public IndexWriter(File path, Analyzer a, boolean create)
throws IOException {
this(FSDirectory.getDirectory(path, create), a, create);
}
/** Constructs an IndexWriter for the index in <code>d</code>. Text will be
analyzed with <code>a</code>. If <code>create</code> is true, then a new,
empty index will be created in <code>d</code>, replacing the index already
there, if any. */
public IndexWriter(Directory d, Analyzer a, boolean create)
throws IOException {
directory = d;
analyzer = a;
synchronized (directory) {
if (create)
segmentInfos.write(directory);
else
segmentInfos.read(directory);
}
}
/** Flushes all changes to an index, closes all associated files, and closes
the directory that the index is stored in. */
public final synchronized void close() throws IOException {
flushRamSegments();
ramDirectory.close();
directory.close();
}
/** Returns the number of documents currently in this index. */
public final synchronized int docCount() {
int count = 0;
for (int i = 0; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
count += si.docCount;
}
return count;
}
/** The maximum number of terms that will be indexed for a single field in a
document. This limits the amount of memory required for indexing, so that
collections with very large files will not crash the indexing process by
running out of memory.
<p>By default, no more than 10,000 terms will be indexed for a field. */
public int maxFieldLength = 10000;
/** Adds a document to this index.*/
public final void addDocument(Document doc) throws IOException {
DocumentWriter dw =
new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
String segmentName = newSegmentName();
dw.addDocument(segmentName, doc);
synchronized (this) {
segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
maybeMergeSegments();
}
}
private final synchronized String newSegmentName() {
return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}
/** Determines how often segment indexes are merged by addDocument(). With
* smaller values, less RAM is used while indexing, and searches on
* unoptimized indexes are faster, but indexing speed is slower. With larger
* values more RAM is used while indexing and searches on unoptimized indexes
* are slower, but indexing is faster. Thus larger values (> 10) are best
* for batched index creation, and smaller values (< 10) for indexes that are
* interactively maintained.
*
* <p>This must never be less than 2. The default value is 10.*/
public int mergeFactor = 10;
/** Determines the largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* <p>The default value is {@link Integer#MAX_VALUE}. */
public int maxMergeDocs = Integer.MAX_VALUE;
/** If non-null, information about merges will be printed to this. */
public PrintStream infoStream = null;
/** Merges all segments together into a single segment, optimizing an index
for search. */
public final synchronized void optimize() throws IOException {
flushRamSegments();
while (segmentInfos.size() > 1 ||
(segmentInfos.size() == 1 &&
SegmentReader.hasDeletions(segmentInfos.info(0)))){
int minSegment = segmentInfos.size() - mergeFactor;
mergeSegments(minSegment < 0 ? 0 : minSegment);
}
}
/** Merges all segments from an array of indexes into this index.
*
* <p>This may be used to parallelize batch indexing. A large document
* collection can be broken into sub-collections. Each sub-collection can be
* indexed in parallel, on a different thread, process or machine. The
* complete index can then be created by merging sub-collection indexes
* with this method.
*
* <p>After this completes, the index is optimized. */
public final synchronized void addIndexes(Directory[] dirs)
throws IOException {
optimize(); // start with zero or 1 seg
int minSegment = segmentInfos.size();
int segmentsAddedSinceMerge = 0;
for (int i = 0; i < dirs.length; i++) {
SegmentInfos sis = new SegmentInfos(); // read infos from dir
sis.read(dirs[i]);
for (int j = 0; j < sis.size(); j++) {
segmentInfos.addElement(sis.info(j)); // add each info
// merge whenever mergeFactor segments have been added
if (++segmentsAddedSinceMerge == mergeFactor) {
mergeSegments(minSegment++, false);
segmentsAddedSinceMerge = 0;
}
}
}
optimize(); // final cleanup
}
/** Merges all RAM-resident segments. */
private final void flushRamSegments() throws IOException {
int minSegment = segmentInfos.size()-1;
int docCount = 0;
while (minSegment >= 0 &&
(segmentInfos.info(minSegment)).dir == ramDirectory) {
docCount += segmentInfos.info(minSegment).docCount;
minSegment--;
}
if (minSegment < 0 || // add one FS segment?
(docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
!(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
minSegment++;
if (minSegment >= segmentInfos.size())
return; // none to merge
mergeSegments(minSegment);
}
/** Incremental segment merger. */
private final void maybeMergeSegments() throws IOException {
long targetMergeDocs = mergeFactor;
while (targetMergeDocs <= maxMergeDocs) {
// find segments smaller than current target size
int minSegment = segmentInfos.size();
int mergeDocs = 0;
while (--minSegment >= 0) {
SegmentInfo si = segmentInfos.info(minSegment);
if (si.docCount >= targetMergeDocs)
break;
mergeDocs += si.docCount;
}
if (mergeDocs >= targetMergeDocs) // found a merge to do
mergeSegments(minSegment+1);
else
break;
targetMergeDocs *= mergeFactor; // increase target size
}
}
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
and pushes the merged index onto the top of the segmentInfos stack. */
private final void mergeSegments(int minSegment) throws IOException {
mergeSegments(minSegment, true);
}
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
and pushes the merged index onto the top of the segmentInfos stack. */
private final void mergeSegments(int minSegment, boolean delete)
throws IOException {
String mergedName = newSegmentName();
int mergedDocCount = 0;
if (infoStream != null) infoStream.print("merging segments");
SegmentMerger merger = new SegmentMerger(directory, mergedName);
Vector segmentsToDelete = new Vector();
for (int i = minSegment; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
if (infoStream != null)
infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
SegmentReader reader = new SegmentReader(si);
merger.add(reader);
if (delete)
segmentsToDelete.addElement(reader); // queue for deletion
mergedDocCount += si.docCount;
}
if (infoStream != null) {
infoStream.println();
infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
}
merger.merge();
segmentInfos.setSize(minSegment); // pop old infos & add new
segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
directory));
synchronized (directory) {
segmentInfos.write(directory); // commit before deleting
deleteSegments(segmentsToDelete); // delete now-unused segments
}
}
/* Some operating systems (e.g. Windows) don't permit a file to be deleted
while it is opened for read (e.g. by another process or thread). So we
assume that when a delete fails it is because the file is open in another
process, and queue the file for subsequent deletion. */
private final void deleteSegments(Vector segments) throws IOException {
Vector deletable = new Vector();
deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
for (int i = 0; i < segments.size(); i++) {
SegmentReader reader = (SegmentReader)segments.elementAt(i);
if (reader.directory == this.directory)
deleteFiles(reader.files(), deletable); // try to delete our files
else
deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files
}
writeDeleteableFiles(deletable); // note files we can't delete
}
private final void deleteFiles(Vector files, Directory directory)
throws IOException {
for (int i = 0; i < files.size(); i++)
directory.deleteFile((String)files.elementAt(i));
}
private final void deleteFiles(Vector files, Vector deletable)
throws IOException {
for (int i = 0; i < files.size(); i++) {
String file = (String)files.elementAt(i);
try {
directory.deleteFile(file); // try to delete each file
} catch (IOException e) { // if delete fails
if (directory.fileExists(file)) {
if (infoStream != null)
infoStream.println(e.getMessage() + "; Will re-try later.");
deletable.addElement(file); // add to deletable
}
}
}
}
private final Vector readDeleteableFiles() throws IOException {
Vector result = new Vector();
if (!directory.fileExists("deletable"))
return result;
InputStream input = directory.openFile("deletable");
try {
for (int i = input.readInt(); i > 0; i--) // read file names
result.addElement(input.readString());
} finally {
input.close();
}
return result;
}
private final void writeDeleteableFiles(Vector files) throws IOException {
OutputStream output = directory.createFile("deleteable.new");
try {
output.writeInt(files.size());
for (int i = 0; i < files.size(); i++)
output.writeString((String)files.elementAt(i));
} finally {
output.close();
}
directory.renameFile("deleteable.new", "deletable");
}
}

View File

@ -0,0 +1,2 @@
# sub-directory makefile for lucene
include ../rules.mk

View File

@ -0,0 +1,69 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.store.Directory;
final class SegmentInfo {
public String name; // unique name in dir
public int docCount; // number of docs in seg
public Directory dir; // where segment resides
public SegmentInfo(String name, int docCount, Directory dir) {
this.name = name;
this.docCount = docCount;
this.dir = dir;
}
}

View File

@ -0,0 +1,101 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Vector;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
final class SegmentInfos extends Vector {
public int counter = 0; // used to name new segments
public final SegmentInfo info(int i) {
return (SegmentInfo)elementAt(i);
}
public final void read(Directory directory) throws IOException {
InputStream input = directory.openFile("segments");
try {
counter = input.readInt(); // read counter
for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
SegmentInfo si = new SegmentInfo(input.readString(), input.readInt(),
directory);
addElement(si);
}
} finally {
input.close();
}
}
public final void write(Directory directory) throws IOException {
OutputStream output = directory.createFile("segments.new");
try {
output.writeInt(counter); // write counter
output.writeInt(size()); // write infos
for (int i = 0; i < size(); i++) {
SegmentInfo si = info(i);
output.writeString(si.name);
output.writeInt(si.docCount);
}
} finally {
output.close();
}
// install new segment info
directory.renameFile("segments.new", "segments");
}
}

View File

@ -0,0 +1,106 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.util.BitVector;
final class SegmentMergeInfo {
Term term;
int base;
SegmentTermEnum termEnum;
SegmentReader reader;
SegmentTermPositions postings;
int[] docMap = null; // maps around deleted docs
SegmentMergeInfo(int b, SegmentTermEnum te, SegmentReader r)
throws IOException {
base = b;
reader = r;
termEnum = te;
term = te.term();
postings = new SegmentTermPositions(r);
if (reader.deletedDocs != null) {
// build array which maps document numbers around deletions
BitVector deletedDocs = reader.deletedDocs;
int maxDoc = reader.maxDoc();
docMap = new int[maxDoc];
int j = 0;
for (int i = 0; i < maxDoc; i++) {
if (deletedDocs.get(i))
docMap[i] = -1;
else
docMap[i] = j++;
}
}
}
final boolean next() throws IOException {
if (termEnum.next()) {
term = termEnum.term();
return true;
} else {
term = null;
return false;
}
}
final void close() throws IOException {
termEnum.close();
postings.close();
}
}

View File

@ -0,0 +1,80 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.util.PriorityQueue;
final class SegmentMergeQueue extends PriorityQueue {
SegmentMergeQueue(int size) {
initialize(size);
}
protected final boolean lessThan(Object a, Object b) {
SegmentMergeInfo stiA = (SegmentMergeInfo)a;
SegmentMergeInfo stiB = (SegmentMergeInfo)b;
int comparison = stiA.term.compareTo(stiB.term);
if (comparison == 0)
return stiA.base < stiB.base;
else
return comparison < 0;
}
final void close() throws IOException {
while (top() != null)
((SegmentMergeInfo)pop()).close();
}
}

View File

@ -0,0 +1,275 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Vector;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BitVector;
final class SegmentMerger {
private Directory directory;
private String segment;
private Vector readers = new Vector();
private FieldInfos fieldInfos;
SegmentMerger(Directory dir, String name) {
directory = dir;
segment = name;
}
final void add(SegmentReader reader) {
readers.addElement(reader);
}
final SegmentReader segmentReader(int i) {
return (SegmentReader)readers.elementAt(i);
}
final void merge() throws IOException {
try {
mergeFields();
mergeTerms();
mergeNorms();
} finally {
for (int i = 0; i < readers.size(); i++) { // close readers
SegmentReader reader = (SegmentReader)readers.elementAt(i);
reader.close();
}
}
}
private final void mergeFields() throws IOException {
fieldInfos = new FieldInfos(); // merge field names
for (int i = 0; i < readers.size(); i++) {
SegmentReader reader = (SegmentReader)readers.elementAt(i);
fieldInfos.add(reader.fieldInfos);
}
fieldInfos.write(directory, segment + ".fnm");
FieldsWriter fieldsWriter = // merge field values
new FieldsWriter(directory, segment, fieldInfos);
try {
for (int i = 0; i < readers.size(); i++) {
SegmentReader reader = (SegmentReader)readers.elementAt(i);
BitVector deletedDocs = reader.deletedDocs;
int maxDoc = reader.maxDoc();
for (int j = 0; j < maxDoc; j++)
if (deletedDocs == null || !deletedDocs.get(j)) // skip deleted docs
fieldsWriter.addDocument(reader.document(j));
}
} finally {
fieldsWriter.close();
}
}
private OutputStream freqOutput = null;
private OutputStream proxOutput = null;
private TermInfosWriter termInfosWriter = null;
private SegmentMergeQueue queue = null;
private final void mergeTerms() throws IOException {
try {
freqOutput = directory.createFile(segment + ".frq");
proxOutput = directory.createFile(segment + ".prx");
termInfosWriter =
new TermInfosWriter(directory, segment, fieldInfos);
mergeTermInfos();
} finally {
if (freqOutput != null) freqOutput.close();
if (proxOutput != null) proxOutput.close();
if (termInfosWriter != null) termInfosWriter.close();
if (queue != null) queue.close();
}
}
private final void mergeTermInfos() throws IOException {
queue = new SegmentMergeQueue(readers.size());
int base = 0;
for (int i = 0; i < readers.size(); i++) {
SegmentReader reader = (SegmentReader)readers.elementAt(i);
SegmentTermEnum termEnum = (SegmentTermEnum)reader.terms();
SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
base += reader.numDocs();
if (smi.next())
queue.put(smi); // initialize queue
else
smi.close();
}
SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
while (queue.size() > 0) {
int matchSize = 0; // pop matching terms
match[matchSize++] = (SegmentMergeInfo)queue.pop();
Term term = match[0].term;
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
while (top != null && term.compareTo(top.term) == 0) {
match[matchSize++] = (SegmentMergeInfo)queue.pop();
top = (SegmentMergeInfo)queue.top();
}
mergeTermInfo(match, matchSize); // add new TermInfo
while (matchSize > 0) {
SegmentMergeInfo smi = match[--matchSize];
if (smi.next())
queue.put(smi); // restore queue
else
smi.close(); // done with a segment
}
}
}
private final TermInfo termInfo = new TermInfo(); // minimize consing
private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
throws IOException {
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
int df = appendPostings(smis, n); // append posting data
if (df > 0) {
// add an entry to the dictionary with pointers to prox and freq files
termInfo.set(df, freqPointer, proxPointer);
termInfosWriter.add(smis[0].term, termInfo);
}
}
private final int appendPostings(SegmentMergeInfo[] smis, int n)
throws IOException {
int lastDoc = 0;
int df = 0; // number of docs w/ term
for (int i = 0; i < n; i++) {
SegmentMergeInfo smi = smis[i];
SegmentTermPositions postings = smi.postings;
int base = smi.base;
int[] docMap = smi.docMap;
smi.termEnum.termInfo(termInfo);
postings.seek(termInfo);
while (postings.next()) {
int doc;
if (docMap == null)
doc = base + postings.doc; // no deletions
else
doc = base + docMap[postings.doc]; // re-map around deletions
if (doc < lastDoc)
throw new IllegalStateException("docs out of order");
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
lastDoc = doc;
int freq = postings.freq;
if (freq == 1) {
freqOutput.writeVInt(docCode | 1); // write doc & freq=1
} else {
freqOutput.writeVInt(docCode); // write doc
freqOutput.writeVInt(freq); // write frequency in doc
}
int lastPosition = 0; // write position deltas
for (int j = 0; j < freq; j++) {
int position = postings.nextPosition();
proxOutput.writeVInt(position - lastPosition);
lastPosition = position;
}
df++;
}
}
return df;
}
private final void mergeNorms() throws IOException {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
OutputStream output = directory.createFile(segment + ".f" + i);
try {
for (int j = 0; j < readers.size(); j++) {
SegmentReader reader = (SegmentReader)readers.elementAt(j);
BitVector deletedDocs = reader.deletedDocs;
InputStream input = reader.normStream(fi.name);
int maxDoc = reader.maxDoc();
try {
for (int k = 0; k < maxDoc; k++) {
byte norm = input != null ? input.readByte() : (byte)0;
if (deletedDocs == null || !deletedDocs.get(k))
output.writeByte(norm);
}
} finally {
if (input != null)
input.close();
}
}
} finally {
output.close();
}
}
}
}
}

View File

@ -0,0 +1,284 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.Vector;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.document.Document;
final class SegmentReader extends IndexReader {
Directory directory;
private boolean closeDirectory = false;
private String segment;
FieldInfos fieldInfos;
private FieldsReader fieldsReader;
TermInfosReader tis;
BitVector deletedDocs = null;
private boolean deletedDocsDirty = false;
private InputStream freqStream;
private InputStream proxStream;
private static class Norm {
public Norm(InputStream in) { this.in = in; }
public InputStream in;
public byte[] bytes;
}
private Hashtable norms = new Hashtable();
SegmentReader(SegmentInfo si, boolean closeDir)
throws IOException {
this(si);
closeDirectory = closeDir;
}
SegmentReader(SegmentInfo si)
throws IOException {
directory = si.dir;
segment = si.name;
fieldInfos = new FieldInfos(directory, segment + ".fnm");
fieldsReader = new FieldsReader(directory, segment, fieldInfos);
tis = new TermInfosReader(directory, segment, fieldInfos);
if (hasDeletions(si))
deletedDocs = new BitVector(directory, segment + ".del");
// make sure that all index files have been read or are kept open
// so that if an index update removes them we'll still have them
freqStream = directory.openFile(segment + ".frq");
proxStream = directory.openFile(segment + ".prx");
openNorms();
}
public final synchronized void close() throws IOException {
if (deletedDocsDirty) {
synchronized (directory) {
deletedDocs.write(directory, segment + ".tmp");
directory.renameFile(segment + ".tmp", segment + ".del");
}
deletedDocsDirty = false;
}
fieldsReader.close();
tis.close();
if (freqStream != null)
freqStream.close();
if (proxStream != null)
proxStream.close();
closeNorms();
if (closeDirectory)
directory.close();
}
final static boolean hasDeletions(SegmentInfo si) throws IOException {
return si.dir.fileExists(si.name + ".del");
}
public final synchronized void delete(int docNum) throws IOException {
if (deletedDocs == null)
deletedDocs = new BitVector(maxDoc());
deletedDocsDirty = true;
deletedDocs.set(docNum);
}
final Vector files() throws IOException {
Vector files = new Vector(16);
files.addElement(segment + ".fnm");
files.addElement(segment + ".fdx");
files.addElement(segment + ".fdt");
files.addElement(segment + ".tii");
files.addElement(segment + ".tis");
files.addElement(segment + ".frq");
files.addElement(segment + ".prx");
if (directory.fileExists(segment + ".del"))
files.addElement(segment + ".del");
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed)
files.addElement(segment + ".f" + i);
}
return files;
}
public final TermEnum terms() throws IOException {
return tis.terms();
}
public final TermEnum terms(Term t) throws IOException {
return tis.terms(t);
}
public final synchronized Document document(int n) throws IOException {
if (isDeleted(n))
throw new IllegalArgumentException
("attempt to access a deleted document");
return fieldsReader.doc(n);
}
public final synchronized boolean isDeleted(int n) {
return (deletedDocs != null && deletedDocs.get(n));
}
public final TermDocs termDocs(Term t) throws IOException {
TermInfo ti = tis.get(t);
if (ti != null)
return new SegmentTermDocs(this, ti);
else
return null;
}
final InputStream getFreqStream () {
return (InputStream)freqStream.clone();
}
public final TermPositions termPositions(Term t) throws IOException {
TermInfo ti = tis.get(t);
if (ti != null)
return new SegmentTermPositions(this, ti);
else
return null;
}
final InputStream getProxStream () {
return (InputStream)proxStream.clone();
}
public final int docFreq(Term t) throws IOException {
TermInfo ti = tis.get(t);
if (ti != null)
return ti.docFreq;
else
return 0;
}
public final int numDocs() {
int n = maxDoc();
if (deletedDocs != null)
n -= deletedDocs.count();
return n;
}
public final int maxDoc() {
return fieldsReader.size();
}
public final byte[] norms(String field) throws IOException {
Norm norm = (Norm)norms.get(field);
if (norm == null)
return null;
if (norm.bytes == null) {
byte[] bytes = new byte[maxDoc()];
norms(field, bytes, 0);
norm.bytes = bytes;
}
return norm.bytes;
}
final void norms(String field, byte[] bytes, int offset) throws IOException {
InputStream normStream = normStream(field);
if (normStream == null)
return; // use zeros in array
try {
normStream.readBytes(bytes, offset, maxDoc());
} finally {
normStream.close();
}
}
final InputStream normStream(String field) throws IOException {
Norm norm = (Norm)norms.get(field);
if (norm == null)
return null;
InputStream result = (InputStream)norm.in.clone();
result.seek(0);
return result;
}
private final void openNorms() throws IOException {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed)
norms.put(fi.name,
new Norm(directory.openFile(segment + ".f" + fi.number)));
}
}
private final void closeNorms() throws IOException {
synchronized (norms) {
Enumeration enum = norms.elements();
while (enum.hasMoreElements()) {
Norm norm = (Norm)enum.nextElement();
norm.in.close();
}
}
}
}

View File

@ -0,0 +1,150 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.store.InputStream;
class SegmentTermDocs implements TermDocs {
protected SegmentReader parent;
private InputStream freqStream;
private int freqCount;
private BitVector deletedDocs;
int doc = 0;
int freq;
SegmentTermDocs(SegmentReader p) throws IOException {
parent = p;
freqStream = parent.getFreqStream();
deletedDocs = parent.deletedDocs;
}
SegmentTermDocs(SegmentReader p, TermInfo ti) throws IOException {
this(p);
seek(ti);
}
void seek(TermInfo ti) throws IOException {
freqCount = ti.docFreq;
doc = 0;
freqStream.seek(ti.freqPointer);
}
public void close() throws IOException {
freqStream.close();
}
public final int doc() { return doc; }
public final int freq() { return freq; }
protected void skippingDoc() throws IOException {
}
public boolean next() throws IOException {
while (true) {
if (freqCount == 0)
return false;
int docCode = freqStream.readVInt();
doc += docCode >>> 1; // shift off low bit
if ((docCode & 1) != 0) // if low bit is set
freq = 1; // freq is one
else
freq = freqStream.readVInt(); // else read freq
freqCount--;
if (deletedDocs == null || !deletedDocs.get(doc))
break;
skippingDoc();
}
return true;
}
/** Optimized implementation. */
public int read(final int[] docs, final int[] freqs)
throws IOException {
final int end = docs.length;
int i = 0;
while (i < end && freqCount > 0) {
// manually inlined call to next() for speed
final int docCode = freqStream.readVInt();
doc += docCode >>> 1; // shift off low bit
if ((docCode & 1) != 0) // if low bit is set
freq = 1; // freq is one
else
freq = freqStream.readVInt(); // else read freq
freqCount--;
if (deletedDocs == null || !deletedDocs.get(doc)) {
docs[i] = doc;
freqs[i] = freq;
++i;
}
}
return i;
}
/** As yet unoptimized implementation. */
public boolean skipTo(int target) throws IOException {
do {
if (!next())
return false;
} while (target > doc);
return true;
}
}

View File

@ -0,0 +1,184 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.store.InputStream;
final class SegmentTermEnum extends TermEnum implements Cloneable {
private InputStream input;
private FieldInfos fieldInfos;
int size;
int position = -1;
private Term term = new Term("", "");
private TermInfo termInfo = new TermInfo();
boolean isIndex = false;
long indexPointer = 0;
Term prev;
private char[] buffer = {};
SegmentTermEnum(InputStream i, FieldInfos fis, boolean isi)
throws IOException {
input = i;
fieldInfos = fis;
size = input.readInt();
isIndex = isi;
}
protected Object clone() {
SegmentTermEnum clone = null;
try {
clone = (SegmentTermEnum)super.clone();
} catch (CloneNotSupportedException e) {}
clone.input = (InputStream)input.clone();
clone.termInfo = new TermInfo(termInfo);
clone.growBuffer(term.text.length());
return clone;
}
final void seek(long pointer, int p, Term t, TermInfo ti)
throws IOException {
input.seek(pointer);
position = p;
term = t;
prev = null;
termInfo.set(ti);
growBuffer(term.text.length()); // copy term text into buffer
}
/** Increments the enumeration to the next element. True if one exists.*/
public final boolean next() throws IOException {
if (position++ >= size-1) {
term = null;
return false;
}
prev = term;
term = readTerm();
termInfo.docFreq = input.readVInt(); // read doc freq
termInfo.freqPointer += input.readVLong(); // read freq pointer
termInfo.proxPointer += input.readVLong(); // read prox pointer
if (isIndex)
indexPointer += input.readVLong(); // read index pointer
return true;
}
private final Term readTerm() throws IOException {
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
if (buffer.length < totalLength)
growBuffer(totalLength);
input.readChars(buffer, start, length);
return new Term(fieldInfos.fieldName(input.readVInt()),
new String(buffer, 0, totalLength), false);
}
private final void growBuffer(int length) {
buffer = new char[length];
for (int i = 0; i < term.text.length(); i++) // copy contents
buffer[i] = term.text.charAt(i);
}
/** Returns the current Term in the enumeration.
Initially invalid, valid after next() called for the first time.*/
public final Term term() {
return term;
}
/** Returns the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final TermInfo termInfo() {
return new TermInfo(termInfo);
}
/** Sets the argument to the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final void termInfo(TermInfo ti) {
ti.set(termInfo);
}
/** Returns the docFreq from the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
public final int docFreq() {
return termInfo.docFreq;
}
/* Returns the freqPointer from the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final long freqPointer() {
return termInfo.freqPointer;
}
/* Returns the proxPointer from the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final long proxPointer() {
return termInfo.proxPointer;
}
/** Closes the enumeration to further activity, freeing resources. */
public final void close() throws IOException {
input.close();
}
}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.store.InputStream;
final class SegmentTermPositions
extends SegmentTermDocs implements TermPositions {
private InputStream proxStream;
private int proxCount;
private int position;
SegmentTermPositions(SegmentReader p) throws IOException {
super(p);
proxStream = parent.getProxStream();
}
SegmentTermPositions(SegmentReader p, TermInfo ti)
throws IOException {
this(p);
seek(ti);
}
final void seek(TermInfo ti) throws IOException {
super.seek(ti);
proxStream.seek(ti.proxPointer);
}
public final void close() throws IOException {
super.close();
proxStream.close();
}
public final int nextPosition() throws IOException {
proxCount--;
return position += proxStream.readVInt();
}
protected final void skippingDoc() throws IOException {
for (int f = freq; f > 0; f--) // skip all positions
proxStream.readVInt();
}
public final boolean next() throws IOException {
for (int f = proxCount; f > 0; f--) // skip unread positions
proxStream.readVInt();
if (super.next()) { // run super
proxCount = freq; // note frequency
position = 0; // reset position
return true;
}
return false;
}
public final int read(final int[] docs, final int[] freqs)
throws IOException {
throw new RuntimeException();
}
}

View File

@ -0,0 +1,329 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Hashtable;
import org.apache.lucene.store.Directory;
import org.apache.lucene.document.Document;
final class SegmentsReader extends IndexReader {
protected SegmentReader[] readers;
protected int[] starts; // 1st docno for each segment
private Hashtable normsCache = new Hashtable();
private int maxDoc = 0;
private int numDocs = -1;
SegmentsReader(SegmentReader[] r) throws IOException {
readers = r;
starts = new int[readers.length + 1]; // build starts array
for (int i = 0; i < readers.length; i++) {
starts[i] = maxDoc;
maxDoc += readers[i].maxDoc(); // compute maxDocs
}
starts[readers.length] = maxDoc;
}
public final int numDocs() {
if (numDocs == -1) { // check cache
int n = 0; // cache miss--recompute
for (int i = 0; i < readers.length; i++)
n += readers[i].numDocs(); // sum from readers
numDocs = n;
}
return numDocs;
}
public final int maxDoc() {
return maxDoc;
}
public final Document document(int n) throws IOException {
int i = readerIndex(n); // find segment num
return readers[i].document(n - starts[i]); // dispatch to segment reader
}
public final boolean isDeleted(int n) {
int i = readerIndex(n); // find segment num
return readers[i].isDeleted(n - starts[i]); // dispatch to segment reader
}
public final void delete(int n) throws IOException {
numDocs = -1; // invalidate cache
int i = readerIndex(n); // find segment num
readers[i].delete(n - starts[i]); // dispatch to segment reader
}
private final int readerIndex(int n) { // find reader for doc n:
int lo = 0; // search starts array
int hi = readers.length - 1; // for first element less
// than n, return its index
while (hi >= lo) {
int mid = (lo + hi) >> 1;
int midValue = starts[mid];
if (n < midValue)
hi = mid - 1;
else if (n > midValue)
lo = mid + 1;
else
return mid;
}
return hi;
}
public final synchronized byte[] norms(String field) throws IOException {
byte[] bytes = (byte[])normsCache.get(field);
if (bytes != null)
return bytes; // cache hit
bytes = new byte[maxDoc()];
for (int i = 0; i < readers.length; i++)
readers[i].norms(field, bytes, starts[i]);
normsCache.put(field, bytes); // update cache
return bytes;
}
public final TermEnum terms() throws IOException {
return new SegmentsTermEnum(readers, starts, null);
}
public final TermEnum terms(Term term) throws IOException {
return new SegmentsTermEnum(readers, starts, term);
}
public final int docFreq(Term t) throws IOException {
int total = 0; // sum freqs in segments
for (int i = 0; i < readers.length; i++)
total += readers[i].docFreq(t);
return total;
}
public final TermDocs termDocs(Term term) throws IOException {
return new SegmentsTermDocs(readers, starts, term);
}
public final TermPositions termPositions(Term term) throws IOException {
return new SegmentsTermPositions(readers, starts, term);
}
public final void close() throws IOException {
for (int i = 0; i < readers.length; i++)
readers[i].close();
}
}
class SegmentsTermEnum extends TermEnum {
private SegmentMergeQueue queue;
private Term term;
private int docFreq;
SegmentsTermEnum(SegmentReader[] readers, int[] starts, Term t)
throws IOException {
queue = new SegmentMergeQueue(readers.length);
for (int i = 0; i < readers.length; i++) {
SegmentReader reader = readers[i];
SegmentTermEnum termEnum;
if (t != null) {
termEnum = (SegmentTermEnum)reader.terms(t);
} else
termEnum = (SegmentTermEnum)reader.terms();
SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
if (t == null ? smi.next() : termEnum.term() != null)
queue.put(smi); // initialize queue
else
smi.close();
}
if (t != null && queue.size() > 0) {
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
term = top.termEnum.term();
docFreq = top.termEnum.docFreq();
}
}
public final boolean next() throws IOException {
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
if (top == null) {
term = null;
return false;
}
term = top.term;
docFreq = 0;
while (top != null && term.compareTo(top.term) == 0) {
queue.pop();
docFreq += top.termEnum.docFreq(); // increment freq
if (top.next())
queue.put(top); // restore queue
else
top.close(); // done with a segment
top = (SegmentMergeInfo)queue.top();
}
return true;
}
public final Term term() {
return term;
}
public final int docFreq() {
return docFreq;
}
public final void close() throws IOException {
queue.close();
}
}
class SegmentsTermDocs implements TermDocs {
protected SegmentReader[] readers;
protected int[] starts;
protected Term term;
protected int base = 0;
protected int pointer = 0;
SegmentsTermDocs(SegmentReader[] r, int[] s, Term t) {
readers = r;
starts = s;
term = t;
}
protected SegmentTermDocs current;
public final int doc() {
return base + current.doc;
}
public final int freq() {
return current.freq;
}
public final boolean next() throws IOException {
if (current != null && current.next()) {
return true;
} else if (pointer < readers.length) {
if (current != null)
current.close();
base = starts[pointer];
current = termDocs(readers[pointer++]);
return next();
} else
return false;
}
/** Optimized implementation. */
public final int read(final int[] docs, final int[] freqs)
throws IOException {
while (true) {
while (current == null) {
if (pointer < readers.length) { // try next segment
base = starts[pointer];
current = termDocs(readers[pointer++]);
} else {
return 0;
}
}
int end = current.read(docs, freqs);
if (end == 0) { // none left in segment
current.close();
current = null;
} else { // got some
final int b = base; // adjust doc numbers
for (int i = 0; i < end; i++)
docs[i] += b;
return end;
}
}
}
/** As yet unoptimized implementation. */
public boolean skipTo(int target) throws IOException {
do {
if (!next())
return false;
} while (target > doc());
return true;
}
protected SegmentTermDocs termDocs(SegmentReader reader)
throws IOException {
return (SegmentTermDocs)reader.termDocs(term);
}
public final void close() throws IOException {
if (current != null)
current.close();
}
}
class SegmentsTermPositions extends SegmentsTermDocs implements TermPositions {
SegmentsTermPositions(SegmentReader[] r, int[] s, Term t) {
super(r,s,t);
}
protected final SegmentTermDocs termDocs(SegmentReader reader)
throws IOException {
return (SegmentTermDocs)reader.termPositions(term);
}
public final int nextPosition() throws IOException {
return ((SegmentTermPositions)current).nextPosition();
}
}

View File

@ -0,0 +1,122 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
A Term represents a word from text. This is the unit of search. It is
composed of two elements, the text of the word, as a string, and the name of
the field that the text occured in, an interned string.
Note that terms may represent more than words from text fields, but also
things like dates, email addresses, urls, etc. */
public final class Term {
String field;
String text;
/** Constructs a Term with the given field and text. */
public Term(String fld, String txt) {
this(fld, txt, true);
}
Term(String fld, String txt, boolean intern) {
field = intern ? fld.intern() : fld; // field names are interned
text = txt; // unless already known to be
}
/** Returns the field of this term, an interned string. The field indicates
the part of a document which this term came from. */
public final String field() { return field; }
/** Returns the text of this term. In the case of words, this is simply the
text of the word. In the case of dates and other types, this is an
encoding of the object as a string. */
public final String text() { return text; }
/** Compares two terms, returning true iff they have the same
field and text. */
public final boolean equals(Object o) {
if (o == null)
return false;
Term other = (Term)o;
return field == other.field && text.equals(other.text);
}
/** Combines the hashCode() of the field and the text. */
public final int hashCode() {
return field.hashCode() + text.hashCode();
}
/** Compares two terms, returning an integer which is less than zero iff this
term belongs after the argument, equal zero iff this term is equal to the
argument, and greater than zero iff this term belongs after the argument.
The ordering of terms is first by field, then by text.*/
public final int compareTo(Term other) {
if (field == other.field) // fields are interned
return text.compareTo(other.text);
else
return field.compareTo(other.field);
}
/** Resets the field and text of a Term. */
final void set(String fld, String txt) {
field = fld;
text = txt;
}
public final String toString() {
return "Term<" + field + ":" + text + ">";
}
}

View File

@ -0,0 +1,110 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
/** TermDocs provides an interface for enumerating &lt;document, frequency&gt;
pairs for a term. <p> The document portion names each document containing
the term. Documents are indicated by number. The frequency portion gives
the number of times the term occurred in each document. <p> The pairs are
ordered by document number.
@see IndexReader#termDocs
*/
public interface TermDocs {
/** Returns the current document number. <p> This is invalid until {@link
#next()} is called for the first time.*/
public int doc();
/** Returns the frequency of the term within the current document. <p> This
is invalid until {@link #next()} is called for the first time.*/
public int freq();
/** Moves to the next pair in the enumeration. <p> Returns true iff there is
such a next pair in the enumeration. */
public boolean next() throws IOException;
/** Attempts to read multiple entries from the enumeration, up to length of
* <i>docs</i>. Document numbers are stored in <i>docs</i>, and term
* frequencies are stored in <i>freqs</i>. The <i>freqs</i> array must be as
* long as the <i>docs</i> array.
*
* <p>Returns the number of entries read. Zero is only returned when the
* stream has been exhausted. */
public int read(int[] docs, int[] freqs) throws IOException;
/** Skips entries to the first beyond the current whose document number is
* greater than or equal to <i>target</i>. <p>Returns true iff there is such
* an entry. <p>Behaves as if written: <pre>
* public boolean skipTo(int target) {
* do {
* if (!next())
* return false;
* } while (target > doc());
* return true;
* }
* </pre>
* Some implementations are considerably more efficient than that.
*/
public boolean skipTo(int target) throws IOException;
/** Frees associated resources. */
public void close() throws IOException;
}

View File

@ -0,0 +1,78 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
/** Abstract class for enumerating terms.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public abstract class TermEnum {
/** Increments the enumeration to the next element. True if one exists.*/
abstract public boolean next() throws IOException;
/** Returns the current Term in the enumeration.
Initially invalid, valid after next() called for the first time.*/
abstract public Term term();
/** Returns the docFreq of the current Term in the enumeration.
Initially invalid, valid after next() called for the first time.*/
abstract public int docFreq();
/** Closes the enumeration to further activity, freeing resources. */
abstract public void close() throws IOException;
}

View File

@ -0,0 +1,91 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/** A TermInfo is the record of information stored for a term.*/
final class TermInfo {
/** The number of documents which contain the term. */
int docFreq = 0;
long freqPointer = 0;
long proxPointer = 0;
TermInfo() {}
TermInfo(int df, long fp, long pp) {
docFreq = df;
freqPointer = fp;
proxPointer = pp;
}
TermInfo(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
}
final void set(int df, long fp, long pp) {
docFreq = df;
freqPointer = fp;
proxPointer = pp;
}
final void set(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
}
}

View File

@ -0,0 +1,222 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
* Directory. Pairs are accessed either by Term or by ordinal position the
* set. */
final class TermInfosReader {
private Directory directory;
private String segment;
private FieldInfos fieldInfos;
private SegmentTermEnum enum;
private int size;
TermInfosReader(Directory dir, String seg, FieldInfos fis)
throws IOException {
directory = dir;
segment = seg;
fieldInfos = fis;
enum = new SegmentTermEnum(directory.openFile(segment + ".tis"),
fieldInfos, false);
size = enum.size;
readIndex();
}
final void close() throws IOException {
if (enum != null)
enum.close();
}
/** Returns the number of term/value pairs in the set. */
final int size() {
return size;
}
Term[] indexTerms = null;
TermInfo[] indexInfos;
long[] indexPointers;
private final void readIndex() throws IOException {
SegmentTermEnum indexEnum =
new SegmentTermEnum(directory.openFile(segment + ".tii"),
fieldInfos, true);
try {
int indexSize = indexEnum.size;
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
indexPointers = new long[indexSize];
for (int i = 0; indexEnum.next(); i++) {
indexTerms[i] = indexEnum.term();
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
}
} finally {
indexEnum.close();
}
}
/** Returns the offset of the greatest index entry which is less than term.*/
private final int getIndexOffset(Term term) throws IOException {
int lo = 0; // binary search indexTerms[]
int hi = indexTerms.length - 1;
while (hi >= lo) {
int mid = (lo + hi) >> 1;
int delta = term.compareTo(indexTerms[mid]);
if (delta < 0)
hi = mid - 1;
else if (delta > 0)
lo = mid + 1;
else
return mid;
}
return hi;
}
private final void seekEnum(int indexOffset) throws IOException {
enum.seek(indexPointers[indexOffset],
(indexOffset * TermInfosWriter.INDEX_INTERVAL) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]);
}
/** Returns the TermInfo for a Term in the set, or null. */
final synchronized TermInfo get(Term term) throws IOException {
if (size == 0) return null;
// optimize sequential access: first try scanning cached enum w/o seeking
if (enum.term() != null // term is at or past current
&& ((enum.prev != null && term.compareTo(enum.prev) > 0)
|| term.compareTo(enum.term()) >= 0)) {
int enumOffset = (enum.position/TermInfosWriter.INDEX_INTERVAL)+1;
if (indexTerms.length == enumOffset // but before end of block
|| term.compareTo(indexTerms[enumOffset]) < 0)
return scanEnum(term); // no need to seek
}
// random-access: must seek
seekEnum(getIndexOffset(term));
return scanEnum(term);
}
/** Scans within block for matching term. */
private final TermInfo scanEnum(Term term) throws IOException {
while (term.compareTo(enum.term()) > 0 && enum.next()) {}
if (enum.term() != null && term.compareTo(enum.term()) == 0)
return enum.termInfo();
else
return null;
}
/** Returns the nth term in the set. */
final synchronized Term get(int position) throws IOException {
if (size == 0) return null;
if (enum != null && enum.term() != null && position >= enum.position &&
position < (enum.position + TermInfosWriter.INDEX_INTERVAL))
return scanEnum(position); // can avoid seek
seekEnum(position / TermInfosWriter.INDEX_INTERVAL); // must seek
return scanEnum(position);
}
private final Term scanEnum(int position) throws IOException {
while(enum.position < position)
if (!enum.next())
return null;
return enum.term();
}
/** Returns the position of a Term in the set or -1. */
final synchronized int getPosition(Term term) throws IOException {
if (size == 0) return -1;
int indexOffset = getIndexOffset(term);
seekEnum(indexOffset);
while(term.compareTo(enum.term()) > 0 && enum.next()) {}
if (term.compareTo(enum.term()) == 0)
return enum.position;
else
return -1;
}
/** Returns an enumeration of all the Terms and TermInfos in the set. */
final synchronized SegmentTermEnum terms() throws IOException {
if (enum.position != -1) // if not at start
seekEnum(0); // reset to start
return (SegmentTermEnum)enum.clone();
}
/** Returns an enumeration of terms starting at or after the named term. */
final synchronized SegmentTermEnum terms(Term term) throws IOException {
get(term); // seek enum to term
return (SegmentTermEnum)enum.clone();
}
}

View File

@ -0,0 +1,159 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.Directory;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
Directory. A TermInfos can be written once, in order. */
final class TermInfosWriter {
private FieldInfos fieldInfos;
private OutputStream output;
private Term lastTerm = new Term("", "");
private TermInfo lastTi = new TermInfo();
private int size = 0;
static final int INDEX_INTERVAL = 128;
private long lastIndexPointer = 0;
private boolean isIndex = false;
private TermInfosWriter other = null;
TermInfosWriter(Directory directory, String segment, FieldInfos fis)
throws IOException, SecurityException {
initialize(directory, segment, fis, false);
other = new TermInfosWriter(directory, segment, fis, true);
other.other = this;
}
private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
boolean isIndex) throws IOException {
initialize(directory, segment, fis, isIndex);
}
private void initialize(Directory directory, String segment, FieldInfos fis,
boolean isi) throws IOException {
fieldInfos = fis;
isIndex = isi;
output = directory.createFile(segment + (isIndex ? ".tii" : ".tis"));
output.writeInt(0); // leave space for size
}
/** Adds a new <Term, TermInfo> pair to the set.
Term must be lexicographically greater than all previous Terms added.
TermInfo pointers must be positive and greater than all previous.*/
final void add(Term term, TermInfo ti)
throws IOException, SecurityException {
if (!isIndex && term.compareTo(lastTerm) <= 0)
throw new IOException("term out of order");
if (ti.freqPointer < lastTi.freqPointer)
throw new IOException("freqPointer out of order");
if (ti.proxPointer < lastTi.proxPointer)
throw new IOException("proxPointer out of order");
if (!isIndex && size % INDEX_INTERVAL == 0)
other.add(lastTerm, lastTi); // add an index term
writeTerm(term); // write term
output.writeVInt(ti.docFreq); // write doc freq
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
output.writeVLong(ti.proxPointer - lastTi.proxPointer);
if (isIndex) {
output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
lastIndexPointer = other.output.getFilePointer(); // write pointer
}
lastTi.set(ti);
size++;
}
private final void writeTerm(Term term)
throws IOException {
int start = stringDifference(lastTerm.text, term.text);
int length = term.text.length() - start;
output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length
output.writeChars(term.text, start, length); // write delta chars
output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num
lastTerm = term;
}
private static final int stringDifference(String s1, String s2) {
int len1 = s1.length();
int len2 = s2.length();
int len = len1 < len2 ? len1 : len2;
for (int i = 0; i < len; i++)
if (s1.charAt(i) != s2.charAt(i))
return i;
return len;
}
/** Called to complete TermInfos creation. */
final void close() throws IOException, SecurityException {
output.seek(0); // write size at start
output.writeInt(size);
output.close();
if (!isIndex)
other.close();
}
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
/** TermPositions provides an interface for enumerating the &lt;document,
frequency, &lt;position&gt;* &gt; tuples for a term. <p> The document and
frequency are as for a TermDocs. The positions portion lists the ordinal
positions of each occurence of a term in a document.
@see IndexReader#termPositions
*/
public interface TermPositions extends TermDocs {
/** Returns next position in the current document. It is an error to call
this more than {@link #freq()} times
without calling {@link #next()}<p> This is
invalid until {@link #next()} is called for
the first time.*/
public int nextPosition() throws IOException;
}

View File

@ -0,0 +1,10 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doug Cutting">
</head>
<body>
Code to maintain and access indices.
</body>
</html>

View File

@ -0,0 +1,8 @@
Name: com/lucene
Specification-Title: Lucene Search Engine
Specification-Version: $Name$
Specification-Vendor: Lucene
Implementation-Title: com.lucene
Implementation-Version: $Name$ $Date$
Implementation-Vendor: Lucene

View File

@ -0,0 +1,6 @@
QueryParser.java
TokenMgrError.java
ParseException.java
Token.java
TokenManager.java
QueryParserConstants.java

View File

@ -0,0 +1,2 @@
# sub-directory makefile for lucene
include ../rules.mk

View File

@ -0,0 +1,366 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
options {
STATIC= false;
}
PARSER_BEGIN(QueryParser)
package org.apache.lucene.queryParser;
import java.util.Vector;
import java.io.*;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.*;
import org.apache.lucene.search.*;
/**
* This class is generated by JavaCC. The only method that clients should need
* to call is <a href="#parse">parse()</a>.
*
* The syntax for query strings is as follows:
* A Query is a series of clauses.
* A clause may be prefixed by:
* <ul>
* <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating
* that the clause is required or prohibited respectively; or
* <li> a term followed by a colon, indicating the field to be searched.
* This enables one to construct queries which search multiple fields.
* </ul>
*
* A clause may be either a:
* <ul>
* <li> a term, indicating all the documents that contain this term; or
* <li> a nested query, enclosed in parentheses. Note that this may be used
* with a <code>+</code>/<code>-</code> prefix to require any of a set of
* terms.
* </ul>
*
* Thus, in BNF, the query grammar is:
* <pre>
* Query ::= ( Clause )*
* Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
* </pre>
*/
public class QueryParser {
/** Parses a query string, returning a
* <a href="lucene.search.Query.html">Query</a>.
* @param query the query string to be parsed.
* @param field the default field for query terms.
* @param analyzer used to find terms in the query text.
*/
static public Query parse(String query, String field, Analyzer analyzer)
throws ParseException {
QueryParser parser = new QueryParser(field, analyzer);
return parser.parse(query);
}
Analyzer analyzer;
String field;
int phraseSlop = 0;
/** Constructs a query parser.
* @param field the default field for query terms.
* @param analyzer used to find terms in the query text.
*/
public QueryParser(String f, Analyzer a) {
this(new StringReader(""));
analyzer = a;
field = f;
}
/** Parses a query string, returning a
* <a href="lucene.search.Query.html">Query</a>.
* @param query the query string to be parsed.
*/
public Query parse(String query) throws ParseException {
ReInit(new StringReader(query));
return Query(field);
}
/** Sets the default slop for phrases. If zero, then exact phrase matches
are required. Zero by default. */
public void setPhraseSlop(int s) { phraseSlop = s; }
/** Gets the default slop for phrases. */
public int getPhraseSlop() { return phraseSlop; }
private void addClause(Vector clauses, int conj, int mods,
Query q) {
boolean required, prohibited;
// If this term is introduced by AND, make the preceding term required,
// unless it's already prohibited
if (conj == CONJ_AND) {
BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
if (!c.prohibited)
c.required = true;
}
// We might have been passed a null query; the term might have been
// filtered away by the analyzer.
if (q == null)
return;
// We set REQUIRED if we're introduced by AND or +; PROHIBITED if
// introduced by NOT or -; make sure not to set both.
prohibited = (mods == MOD_NOT);
required = (mods == MOD_REQ);
if (conj == CONJ_AND && !prohibited)
required = true;
clauses.addElement(new BooleanClause(q, required, prohibited));
}
private Query getFieldQuery(String field, Analyzer analyzer, String queryText) {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
org.apache.lucene.analysis.Token t;
while (true) {
try {
t = source.next();
}
catch (IOException e) {
t = null;
}
if (t == null)
break;
v.addElement(t.termText());
}
if (v.size() == 0)
return null;
else if (v.size() == 1)
return new TermQuery(new Term(field, (String) v.elementAt(0)));
else {
PhraseQuery q = new PhraseQuery();
q.setSlop(phraseSlop);
for (int i=0; i<v.size(); i++) {
q.add(new Term(field, (String) v.elementAt(i)));
}
return q;
}
}
public static void main(String[] args) throws Exception {
QueryParser qp = new QueryParser("field",
new org.apache.lucene.analysis.SimpleAnalyzer());
Query q = qp.parse(args[0]);
System.out.println(q.toString("field"));
}
private static final int CONJ_NONE = 0;
private static final int CONJ_AND = 1;
private static final int CONJ_OR = 2;
private static final int MOD_NONE = 0;
private static final int MOD_NOT = 10;
private static final int MOD_REQ = 11;
}
PARSER_END(QueryParser)
/* ***************** */
/* Token Definitions */
/* ***************** */
<*> TOKEN : {
<#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
| <#_NUM_CHAR: ["0"-"9"] >
| <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
| <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
| <#_WHITESPACE: ( " " | "\t" ) >
| <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
| <#_RESTOFLINE: (~["\r", "\n"])* >
}
<DEFAULT> TOKEN : {
<AND: ("AND" | "&&") >
| <OR: ("OR" | "||") >
| <NOT: ("NOT" | "!") >
| <PLUS: "+" >
| <MINUS: "-" >
| <LPAREN: "(" >
| <RPAREN: ")" >
| <COLON: ":" >
| <CARAT: "^" >
| <STAR: "*" >
| <QUOTED: "\"" (~["\""])+ "\"">
| <NUMBER: (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
| <TERM: <_IDENTIFIER_CHAR>
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~" ] )* >
| <FUZZY: "~" >
| <WILDTERM: <_IDENTIFIER_CHAR>
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~" ] )* <_IDENTIFIER_CHAR>>
}
<DEFAULT> SKIP : {
<<_WHITESPACE>>
}
// * Query ::= ( Clause )*
// * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
int Conjunction() : {
int ret = CONJ_NONE;
}
{
[
<AND> { ret = CONJ_AND; }
| <OR> { ret = CONJ_OR; }
]
{ return ret; }
}
int Modifiers() : {
int ret = MOD_NONE;
}
{
[
<PLUS> { ret = MOD_REQ; }
| <MINUS> { ret = MOD_NOT; }
| <NOT> { ret = MOD_NOT; }
]
{ return ret; }
}
Query Query(String field) :
{
Vector clauses = new Vector();
Query q;
int conj, mods;
}
{
mods=Modifiers() q=Clause(field)
{ addClause(clauses, CONJ_NONE, mods, q); }
(
conj=Conjunction() mods=Modifiers() q=Clause(field)
{ addClause(clauses, conj, mods, q); }
)*
{
BooleanQuery query = new BooleanQuery();
for (int i = 0; i < clauses.size(); i++)
query.add((BooleanClause)clauses.elementAt(i));
return query;
}
}
Query Clause(String field) : {
Query q;
Token fieldToken=null;
}
{
[
LOOKAHEAD(2)
fieldToken=<TERM> <COLON> { field = fieldToken.image; }
]
(
q=Term(field)
| <LPAREN> q=Query(field) <RPAREN>
)
{
return q;
}
}
Query Term(String field) : {
Token term, boost=null;
boolean prefix = false;
boolean wildcard = false;
boolean fuzzy = false;
Query q;
}
{
(
(term=<TERM>|term=<WILDTERM>{wildcard=true;}|term=<NUMBER>)[<STAR>{prefix=true;}|<FUZZY>{fuzzy=true;}][<CARAT> boost=<NUMBER>]
{ if (wildcard)
q = new WildcardQuery(new Term(field, term.image));
else if (prefix)
q = new PrefixQuery(new Term(field, term.image));
else if (fuzzy)
q = new FuzzyQuery(new Term(field, term.image));
else
q = getFieldQuery(field, analyzer, term.image); }
| term=<QUOTED>
{ q = getFieldQuery(field, analyzer,
term.image.substring(1, term.image.length()-1)); }
)
{
if (boost != null) {
float f = (float) 1.0;
try {
f = Float.valueOf(boost.image).floatValue();
}
catch (Exception ignored) { }
if (q instanceof TermQuery)
((TermQuery) q).setBoost(f);
else if (q instanceof PhraseQuery)
((PhraseQuery) q).setBoost(f);
else if (q instanceof MultiTermQuery)
((MultiTermQuery) q).setBoost(f);
}
return q;
}
}

View File

@ -0,0 +1,15 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doug Cutting">
</head>
<body>
A simple query parser implemented with JavaCC.
<p>Note that JavaCC defines lots of public, classes, methods and fields
that do not need to be public.&nbsp; These clutter the documentation.&nbsp;
Sorry.
<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>com.lucene.analysis.Token</tt>
must always be fully qualified in sourced code in this package.
</body>
</html>

View File

@ -0,0 +1,58 @@
# rules to enable the running of "make jar" and the like from any dir..
# directories containing java source code
DIRS = store util document analysis analysis/standard index search queryParser
PACKAGES = $(subst /,.,$(patsubst %,com.lucene.%,$(DIRS)))
ifeq ($(JAVALINK),)
JAVALINK = http://java.sun.com/products/jdk/1.3/docs/api/
endif
# OLDJAVA does not have a -link option
ifeq ($(OLDJAVA),)
JLINK_OPT = -link $(JAVALINK)
JAR_CMD = $(JAR) -cvfm lucene.jar com/lucene/manifest
else
JAR_CMD = $(JAR) -cvf lucene.jar
endif
.PHONY: jar doc demo release
jar: all_classes
cd $(ROOT) && $(JAR_CMD) \
`ls com/lucene/*/*.class` `ls com/lucene/*/*/*.class`
doc: all_classes
if [ -d $(ROOT)/doc/api ]; then rm -rf $(ROOT)/doc/api ;fi
mkdir $(ROOT)/doc/api
$(JAVADOC) -classpath '$(CLASSPATH)' -author -version \
-d $(ROOT)/doc/api $(JLINK_OPT) $(PACKAGES)
demo: all_classes
$(MAKE) -C $(ROOT)/demo/HTMLParser -w
$(MAKE) -C $(ROOT)/demo -w CLASSPATH=..
release: jar demo doc
cd $(ROOT) && tar cvf lucene.tar lucene.jar doc/*.html doc/api \
demo/*.java demo/*.class demo/*.html demo/*.jhtml \
demo/HTMLParser/*.class demo/HTMLParser/*.jj \
demo/HTMLParser/*.java
# make all the Lucene classes
all_classes : TARGET = classes
all_classes : $(DIRS)
.PHONY: $(DIRS)
$(DIRS):
$(MAKE) -C $(ROOT)/com/lucene/$@ -w $(TARGET)
# Removes all generated files from src directories.
src_clean: TARGET = clean
src_clean: $(DIRS) clean
# Removes all generated files.
real_clean: DIRS += demo
real_clean: DIRS += demo/HTMLParser
real_clean: TARGET = clean
real_clean: $(DIRS) clean
cd $(ROOT) && rm -rf lucene.jar lucene.tar doc/api

View File

@ -0,0 +1,128 @@
# GNU make rules for lucene
# determine whether we're on Win32 or Unix
ifeq ($(findstring CYGWIN,$(shell uname)),CYGWIN)
OS = win32
else
OS = unix
endif
# DOS compatibility:
# These should be used in variables that end up in CLASSPATH.
ifeq ($(OS),win32)
SLASH=\\
COLON=;
else
SLASH=/
COLON=:
endif
# ROOT should be set to the root directory of the Lucene package
# hierarchy. This is typically ../../.., as most packages are of the
# form com.lucene.<package>.
ifeq ($(ROOT),)
ROOT = ..$(SLASH)..$(SLASH)..
else
ROOT := $(subst /,$(SLASH),$(ROOT))
endif
#include all the relevant variables
include $(subst $(SLASH),/,$(ROOT))/com/lucene/variables.mk
# directories containing java source code
DIRS = store util document analysis analysis/standard index search queryParser
PACKAGES = $(subst /,.,$(patsubst %,com.lucene.%,$(DIRS)))
ifeq ($(JDK_HOME),)
ifneq ($(JAVA_HOME),)
JDK_HOME=$(JAVA_HOME)
else
ifeq ($(OS),win32)
JDK_HOME = C:/jdk1.3.1
else
JDK_HOME = /usr/local/java/jdk1.3.1
endif
endif
endif
# Location of JavaCC
ifeq ($(JAVACC),)
ifeq ($(OS),win32)
JAVACC = C:/javacc2_0/bin/lib/JavaCC.zip
else
JAVACC = /usr/local/java/javacc2_0/bin/lib/JavaCC.zip
endif
endif
JAVADIR = $(subst \,/,$(JDK_HOME))
# The compiler executable.
ifeq ($(JAVAC),)
JAVAC = $(JAVADIR)/bin/javac
endif
# The java executable
JAVA = $(JAVADIR)/bin/java
# The jar executable
JAR = $(JAVADIR)/bin/jar
# javadoc
JAVADOC = $(JAVADIR)/bin/javadoc
# Options to pass to Java compiler
ifeq ($(JFLAGS),)
JFLAGS = -O
endif
# CLASSPATH
# By default include the Lucene root, and Java's builtin classes
ifeq ($(OLDJAVA),)
export CLASSPATH=$(PREPENDCLASSPATH)$(COLON)$(ROOT)$(COLON)$(JDK_HOME)$(SLASH)jre$(SLASH)lib$(SLASH)rt.jar
else
export CLASSPATH=$(PREPENDCLASSPATH)$(COLON)$(ROOT)$(COLON)$(JDK_HOME)$(SLASH)lib$(SLASH)classes.zip
endif
# JIKESPATH overrides the classpath variable for jikes, so we need to set it
# here to avoid problems with a jikes user
export JIKESPATH=$(CLASSPATH)
## Rules
# Use JAVAC to compile .java files into .class files
%.class : %.java
$(JAVAC) $(JFLAGS) $<
# Compile .jj files to .java with JavaCC
%.java : %.jj
$(JAVA) -classpath '$(CLASSPATH)$(COLON)$(JAVACC)' COM.sun.labs.javacc.Main $<
# Add JavaCC generated files to 'classes' and 'clean' targets.
JJFILES = $(wildcard *.jj)
ifneq ($(JJFILES),)
CLASSES += $(patsubst %.jj,%.class, $(JJFILES))
DIRT += $(patsubst %.jj,%.java, $(JJFILES))
DIRT += $(patsubst %.jj,%Constants.java, $(JJFILES))
DIRT += $(patsubst %.jj,%TokenManager.java, $(JJFILES))
DIRT += Token.java TokenMgrError.java TokenManager.java \
CharStream.java ASCII_CharStream.java ParseException.java
endif
# Don't delete parser's .java file -- it's needed by javadoc.
.PRECIOUS: $(patsubst %.jj,%.java, $(JJFILES))
# Assume all .java files should have a .class file.
CLASSES += $(patsubst %.java,%.class,$(wildcard *.java))
# default rule
classes : $(CLASSES)
# Removes all generated files from the connected src directory.
clean:
rm -f *.class $(DIRT)
# include all the rules for the root directory..
include $(subst $(SLASH),/,$(ROOT))/com/lucene/rootrules.mk

View File

@ -0,0 +1,75 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/** A clause in a BooleanQuery. */
public final class BooleanClause {
/** The query whose matching documents are combined by the boolean query. */
public Query query;
/** If true, documents documents which <i>do not</i>
match this sub-query will <it>not</it> match the boolean query. */
public boolean required = false;
/** If true, documents documents which <i>do</i>
match this sub-query will <it>not</it> match the boolean query. */
public boolean prohibited = false;
/** Constructs a BooleanClause with query <code>q</code>, required
<code>r</code> and prohibited <code>p</code>. */
public BooleanClause(Query q, boolean r, boolean p) {
query = q;
required = r;
prohibited = p;
}
}

View File

@ -0,0 +1,177 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.index.IndexReader;
/** A Query that matches documents matching boolean combinations of other
queries, typically {@link TermQuery}s or {@link PhraseQuery}s.
*/
final public class BooleanQuery extends Query {
private Vector clauses = new Vector();
/** Constructs an empty boolean query. */
public BooleanQuery() {}
/** Adds a clause to a boolean query. Clauses may be:
<ul>
<li><code>required</code> which means that documents which <i>do not</i>
match this sub-query will <it>not</it> match the boolean query;
<li><code>prohibited</code> which means that documents which <i>do</i>
match this sub-query will <it>not</it> match the boolean query; or
<li>neither, in which case matched documents are neither prohibited from
nor required to match the sub-query.
</ul>
It is an error to specify a clause as both <code>required</code> and
<code>prohibited</code>.
*/
public final void add(Query query, boolean required, boolean prohibited) {
clauses.addElement(new BooleanClause(query, required, prohibited));
}
/** Adds a clause to a boolean query. */
public final void add(BooleanClause clause) {
clauses.addElement(clause);
}
void prepare(IndexReader reader) {
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = (BooleanClause)clauses.elementAt(i);
c.query.prepare(reader);
}
}
final float sumOfSquaredWeights(Searcher searcher)
throws IOException {
float sum = 0.0f;
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = (BooleanClause)clauses.elementAt(i);
if (!c.prohibited)
sum += c.query.sumOfSquaredWeights(searcher); // sum sub-query weights
}
return sum;
}
final void normalize(float norm) {
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = (BooleanClause)clauses.elementAt(i);
if (!c.prohibited)
c.query.normalize(norm);
}
}
final Scorer scorer(IndexReader reader)
throws IOException {
if (clauses.size() == 1) { // optimize 1-term queries
BooleanClause c = (BooleanClause)clauses.elementAt(0);
if (!c.prohibited) // just return term scorer
return c.query.scorer(reader);
}
BooleanScorer result = new BooleanScorer();
int theMask = 1, thisMask;
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = (BooleanClause)clauses.elementAt(i);
if (c.required || c.prohibited) {
thisMask = theMask;
theMask = theMask << 1;
} else
thisMask = 0;
Scorer subScorer = c.query.scorer(reader);
if (subScorer != null)
result.add(subScorer, c.required, c.prohibited);
else if (c.required)
return null;
}
if (theMask == 0)
throw new IndexOutOfBoundsException
("More than 32 required/prohibited clauses in query.");
return result;
}
/** Prints a user-readable version of this query. */
public String toString(String field) {
StringBuffer buffer = new StringBuffer();
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = (BooleanClause)clauses.elementAt(i);
if (c.prohibited)
buffer.append("-");
else if (c.required)
buffer.append("+");
Query subQuery = c.query;
if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
BooleanQuery bq = (BooleanQuery)subQuery;
buffer.append("(");
buffer.append(c.query.toString(field));
buffer.append(")");
} else
buffer.append(c.query.toString(field));
if (i != clauses.size()-1)
buffer.append(" ");
}
return buffer.toString();
}
}

View File

@ -0,0 +1,204 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.index.*;
final class BooleanScorer extends Scorer {
private int currentDoc;
private SubScorer scorers = null;
private BucketTable bucketTable = new BucketTable(this);
private int maxCoord = 1;
private float[] coordFactors = null;
private int requiredMask = 0;
private int prohibitedMask = 0;
private int nextMask = 1;
static final class SubScorer {
public Scorer scorer;
public boolean required = false;
public boolean prohibited = false;
public HitCollector collector;
public SubScorer next;
public SubScorer(Scorer scorer, boolean required, boolean prohibited,
HitCollector collector, SubScorer next) {
this.scorer = scorer;
this.required = required;
this.prohibited = prohibited;
this.collector = collector;
this.next = next;
}
}
final void add(Scorer scorer, boolean required, boolean prohibited) {
int mask = 0;
if (required || prohibited) {
if (nextMask == 0)
throw new IndexOutOfBoundsException
("More than 32 required/prohibited clauses in query.");
mask = nextMask;
nextMask = nextMask << 1;
} else
mask = 0;
if (!prohibited)
maxCoord++;
if (prohibited)
prohibitedMask |= mask; // update prohibited mask
else if (required)
requiredMask |= mask; // update required mask
scorers = new SubScorer(scorer, required, prohibited,
bucketTable.newCollector(mask), scorers);
}
private final void computeCoordFactors() throws IOException {
coordFactors = new float[maxCoord];
for (int i = 0; i < maxCoord; i++)
coordFactors[i] = Similarity.coord(i, maxCoord);
}
final void score(HitCollector results, int maxDoc) throws IOException {
if (coordFactors == null)
computeCoordFactors();
while (currentDoc < maxDoc) {
currentDoc = Math.min(currentDoc+BucketTable.SIZE, maxDoc);
for (SubScorer t = scorers; t != null; t = t.next)
t.scorer.score(t.collector, currentDoc);
bucketTable.collectHits(results);
}
}
static final class Bucket {
int doc = -1; // tells if bucket is valid
float score; // incremental score
int bits; // used for bool constraints
int coord; // count of terms in score
Bucket next; // next valid bucket
}
/** A simple hash table of document scores within a range. */
static final class BucketTable {
public static final int SIZE = 1 << 10;
public static final int MASK = SIZE - 1;
final Bucket[] buckets = new Bucket[SIZE];
Bucket first = null; // head of valid list
private BooleanScorer scorer;
public BucketTable(BooleanScorer scorer) {
this.scorer = scorer;
}
public final void collectHits(HitCollector results) {
final int required = scorer.requiredMask;
final int prohibited = scorer.prohibitedMask;
final float[] coord = scorer.coordFactors;
for (Bucket bucket = first; bucket!=null; bucket = bucket.next) {
if ((bucket.bits & prohibited) == 0 && // check prohibited
(bucket.bits & required) == required){// check required
results.collect(bucket.doc, // add to results
bucket.score * coord[bucket.coord]);
}
}
first = null; // reset for next round
}
public final int size() { return SIZE; }
public HitCollector newCollector(int mask) {
return new Collector(mask, this);
}
}
static final class Collector extends HitCollector {
private BucketTable bucketTable;
private int mask;
public Collector(int mask, BucketTable bucketTable) {
this.mask = mask;
this.bucketTable = bucketTable;
}
public final void collect(final int doc, final float score) {
final BucketTable table = bucketTable;
final int i = doc & BucketTable.MASK;
Bucket bucket = table.buckets[i];
if (bucket == null)
table.buckets[i] = bucket = new Bucket();
if (bucket.doc != doc) { // invalid bucket
bucket.doc = doc; // set doc
bucket.score = score; // initialize score
bucket.bits = mask; // initialize mask
bucket.coord = 1; // initialize coord
bucket.next = table.first; // push onto valid list
table.first = bucket;
} else { // valid bucket
bucket.score += score; // increment score
bucket.bits |= mask; // add bits in mask
bucket.coord++; // increment coord
}
}
}
}

View File

@ -0,0 +1,161 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.BitSet;
import java.util.Date;
import java.io.IOException;
import org.apache.lucene.document.DateField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
/** A Filter that restricts search results to a range of time.
<p>For this to work, documents must have been indexed with a {@link
DateField}. */
public final class DateFilter extends Filter {
String field;
String start = DateField.MIN_DATE_STRING();
String end = DateField.MAX_DATE_STRING();
private DateFilter(String f) {
field = f;
}
/** Constructs a filter for field <code>f</code> matching dates between
<code>from</code> and <code>to</code>. */
public DateFilter(String f, Date from, Date to) {
field = f;
start = DateField.dateToString(from);
end = DateField.dateToString(to);
}
/** Constructs a filter for field <code>f</code> matching times between
<code>from</code> and <code>to</code>. */
public DateFilter(String f, long from, long to) {
field = f;
start = DateField.timeToString(from);
end = DateField.timeToString(to);
}
/** Constructs a filter for field <code>f</code> matching dates before
<code>date</code>. */
public static DateFilter Before(String field, Date date) {
DateFilter result = new DateFilter(field);
result.end = DateField.dateToString(date);
return result;
}
/** Constructs a filter for field <code>f</code> matching times before
<code>time</code>. */
public static DateFilter Before(String field, long time) {
DateFilter result = new DateFilter(field);
result.end = DateField.timeToString(time);
return result;
}
/** Constructs a filter for field <code>f</code> matching dates before
<code>date</code>. */
public static DateFilter After(String field, Date date) {
DateFilter result = new DateFilter(field);
result.start = DateField.dateToString(date);
return result;
}
/** Constructs a filter for field <code>f</code> matching times before
<code>time</code>. */
public static DateFilter After(String field, long time) {
DateFilter result = new DateFilter(field);
result.start = DateField.timeToString(time);
return result;
}
/** Returns a BitSet with true for documents which should be permitted in
search results, and false for those that should not. */
final public BitSet bits(IndexReader reader) throws IOException {
BitSet bits = new BitSet(reader.maxDoc());
TermEnum enum = reader.terms(new Term(field, start));
try {
Term stop = new Term(field, end);
while (enum.term().compareTo(stop) <= 0) {
TermDocs termDocs = reader.termDocs(enum.term());
try {
while (termDocs.next())
bits.set(termDocs.doc());
} finally {
termDocs.close();
}
if (!enum.next()) {
break;
}
}
} finally {
enum.close();
}
return bits;
}
public final String toString() {
StringBuffer buffer = new StringBuffer();
buffer.append(field);
buffer.append(":");
buffer.append(DateField.stringToDate(start).toString());
buffer.append("-");
buffer.append(DateField.stringToDate(end).toString());
return buffer.toString();
}
}

View File

@ -0,0 +1,91 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.util.*;
import org.apache.lucene.index.*;
final class ExactPhraseScorer extends PhraseScorer {
ExactPhraseScorer(TermPositions[] tps, byte[] n, float w)
throws IOException {
super(tps, n, w);
}
protected final float phraseFreq() throws IOException {
// sort list with pq
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
pp.firstPosition();
pq.put(pp); // build pq from list
}
pqToList(); // rebuild list from pq
int freq = 0;
do { // find position w/ all terms
while (first.position < last.position) { // scan forward in first
do {
if (!first.nextPosition())
return (float)freq;
} while (first.position < last.position);
firstToLast();
}
freq++; // all equal: a match
} while (last.nextPosition());
return (float)freq;
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.BitSet;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
/** Abstract base class providing a mechanism to restrict searches to a subset
of an index. */
abstract public class Filter {
/** Returns a BitSet with true for documents which should be permitted in
search results, and false for those that should not. */
abstract public BitSet bits(IndexReader reader) throws IOException;
}

View File

@ -0,0 +1,130 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/** Abstract class for enumerating a subset of all terms.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public abstract class FilteredTermEnum extends TermEnum {
private Term currentTerm = null;
private TermEnum actualEnum = null;
public FilteredTermEnum(IndexReader reader, Term term) throws IOException {}
/** Equality compare on the term */
protected abstract boolean termCompare(Term term);
/** Equality measure on the term */
protected abstract float difference();
/** Indiciates the end of the enumeration has been reached */
protected abstract boolean endEnum();
protected void setEnum(TermEnum actualEnum) throws IOException {
this.actualEnum = actualEnum;
// Find the first term that matches
Term term = actualEnum.term();
if (termCompare(term))
currentTerm = term;
else next();
}
/**
* Returns the docFreq of the current Term in the enumeration.
* Initially invalid, valid after next() called for the first time.
*/
public int docFreq() {
if (actualEnum == null) return -1;
return actualEnum.docFreq();
}
/** Increments the enumeration to the next element. True if one exists. */
public boolean next() throws IOException {
if (actualEnum == null) return false; // the actual enumerator is not initialized!
currentTerm = null;
while (currentTerm == null) {
if (endEnum()) return false;
if (actualEnum.next()) {
Term term = actualEnum.term();
if (termCompare(term)) {
currentTerm = term;
return true;
}
}
else return false;
}
currentTerm = null;
return false;
}
/** Returns the current Term in the enumeration.
* Initially invalid, valid after next() called for the first time. */
public Term term() {
return currentTerm;
}
/** Closes the enumeration to further activity, freeing resources. */
public void close() throws IOException {
actualEnum.close();
currentTerm = null;
actualEnum = null;
}
}

View File

@ -0,0 +1,79 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import java.io.IOException;
/** Implements the fuzzy search query */
final public class FuzzyQuery extends MultiTermQuery {
private Term fuzzyTerm;
public FuzzyQuery(Term term) {
super(term);
fuzzyTerm = term;
}
final void prepare(IndexReader reader) {
try {
setEnum(new FuzzyTermEnum(reader, fuzzyTerm));
} catch (IOException e) {}
}
public String toString(String field) {
return super.toString(field) + '~';
}
}

View File

@ -0,0 +1,175 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
final public class FuzzyTermEnum extends FilteredTermEnum {
double distance;
boolean fieldMatch = false;
boolean endEnum = false;
Term searchTerm = null;
String field = "";
String text = "";
int textlen;
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
super(reader, term);
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
textlen = text.length();
setEnum(reader.terms(new Term(searchTerm.field(), "")));
}
/**
The termCompare method in FuzzyTermEnum uses Levenshtein distance to
calculate the distance between the given term and the comparing term.
*/
final protected boolean termCompare(Term term) {
if (field == term.field()) {
String target = term.text();
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
return (distance > FUZZY_THRESHOLD);
}
endEnum = true;
return false;
}
final protected float difference() {
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
}
final public boolean endEnum() {
return endEnum;
}
/******************************
* Compute Levenshtein distance
******************************/
public static final double FUZZY_THRESHOLD = 0.5;
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
/**
Finds and returns the smallest of three integers
*/
private final static int min(int a, int b, int c) {
int t = (a < b) ? a : b;
return (t < c) ? t : c;
}
/**
* This static array saves us from the time required to create a new array
* everytime editDistance is called.
*/
private int e[][] = new int[0][0];
/**
Levenshtein distance also known as edit distance is a measure of similiarity
between two strings where the distance is measured as the number of character
deletions, insertions or substitutions required to transform one string to
the other string.
<p>This method takes in four parameters; two strings and their respective
lengths to compute the Levenshtein distance between the two strings.
The result is returned as an integer.
*/
private final int editDistance(String s, String t, int n, int m) {
if (e.length <= n || e[0].length <= m) {
e = new int[Math.max(e.length, n+1)][Math.max(e.length, m+1)];
}
int d[][] = e; // matrix
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
if (n == 0) return m;
if (m == 0) return n;
// init matrix d
for (i = 0; i <= n; i++) d[i][0] = i;
for (j = 0; j <= m; j++) d[0][j] = j;
// start computing edit distance
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
for (j = 1; j <= m; j++) {
if (s_i != t.charAt(j-1))
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
}
}
// we got the result!
return d[n][m];
}
public void close() throws IOException {
super.close();
searchTerm = null;
field = null;
text = null;
}
}

View File

@ -0,0 +1,76 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/** Lower-level search API.
* @see IndexSearcher#search(Query,HitCollector)
*/
public abstract class HitCollector {
/** Called once for every non-zero scoring document, with the document number
* and its score.
*
* <P>If, for example, an application wished to collect all of the hits for a
* query in a BitSet, then it might:<pre>
* Searcher = new IndexSearcher(indexReader);
* final BitSet bits = new BitSet(indexReader.maxDoc());
* searcher.search(query, new HitCollector() {
* public void collect(int doc, float score) {
* bits.set(doc);
* }
* });
* </pre>
*/
public abstract void collect(int doc, float score);
}

View File

@ -0,0 +1,72 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.util.PriorityQueue;
final class HitQueue extends PriorityQueue {
HitQueue(int size) {
initialize(size);
}
protected final boolean lessThan(Object a, Object b) {
ScoreDoc hitA = (ScoreDoc)a;
ScoreDoc hitB = (ScoreDoc)b;
if (hitA.score == hitB.score)
return hitA.doc > hitB.doc;
else
return hitA.score < hitB.score;
}
}

View File

@ -0,0 +1,188 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import java.util.BitSet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
/** A ranked list of documents, used to hold search results. */
public final class Hits {
private Query query;
private Searcher searcher;
private Filter filter = null;
private int length; // the total number of hits
private Vector hitDocs = new Vector(); // cache of hits retrieved
private HitDoc first; // head of LRU cache
private HitDoc last; // tail of LRU cache
private int numDocs = 0; // number cached
private int maxDocs = 200; // max to cache
Hits(Searcher s, Query q, Filter f) throws IOException {
query = q;
searcher = s;
filter = f;
getMoreDocs(50); // retrieve 100 initially
}
// Tries to add new documents to hitDocs.
// Ensures that the hit numbered <code>min</code> has been retrieved.
private final void getMoreDocs(int min) throws IOException {
if (hitDocs.size() > min)
min = hitDocs.size();
int n = min * 2; // double # retrieved
TopDocs topDocs = searcher.search(query, filter, n);
length = topDocs.totalHits;
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
float scoreNorm = 1.0f;
if (length > 0 && scoreDocs[0].score > 1.0f)
scoreNorm = 1.0f / scoreDocs[0].score;
int end = scoreDocs.length < length ? scoreDocs.length : length;
for (int i = hitDocs.size(); i < end; i++)
hitDocs.addElement(new HitDoc(scoreDocs[i].score*scoreNorm,
scoreDocs[i].doc));
}
/** Returns the total number of hits available in this set. */
public final int length() {
return length;
}
/** Returns the nth document in this set.
<p>Documents are cached, so that repeated requests for the same element may
return the same Document object. */
public final Document doc(int n) throws IOException {
HitDoc hitDoc = hitDoc(n);
// Update LRU cache of documents
remove(hitDoc); // remove from list, if there
addToFront(hitDoc); // add to front of list
if (numDocs > maxDocs) { // if cache is full
HitDoc oldLast = last;
remove(last); // flush last
oldLast.doc = null; // let doc get gc'd
}
if (hitDoc.doc == null)
hitDoc.doc = searcher.doc(hitDoc.id); // cache miss: read document
return hitDoc.doc;
}
/** Returns the score for the nth document in this set. */
public final float score(int n) throws IOException {
return hitDoc(n).score;
}
private final HitDoc hitDoc(int n) throws IOException {
if (n >= length)
throw new IndexOutOfBoundsException("Not a valid hit number: " + n);
if (n >= hitDocs.size())
getMoreDocs(n);
return (HitDoc)hitDocs.elementAt(n);
}
private final void addToFront(HitDoc hitDoc) { // insert at front of cache
if (first == null)
last = hitDoc;
else
first.prev = hitDoc;
hitDoc.next = first;
first = hitDoc;
hitDoc.prev = null;
numDocs++;
}
private final void remove(HitDoc hitDoc) { // remove from cache
if (hitDoc.doc == null) // it's not in the list
return; // abort
if (hitDoc.next == null)
last = hitDoc.prev;
else
hitDoc.next.prev = hitDoc.prev;
if (hitDoc.prev == null)
first = hitDoc.next;
else
hitDoc.prev.next = hitDoc.next;
numDocs--;
}
}
final class HitDoc {
float score;
int id;
Document doc = null;
HitDoc next; // in doubly-linked cache
HitDoc prev; // in doubly-linked cache
HitDoc(float s, int i) {
score = s;
id = i;
}
}

View File

@ -0,0 +1,178 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.store.Directory;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.PriorityQueue;
/** Implements search over a single IndexReader. */
public final class IndexSearcher extends Searcher {
IndexReader reader;
/** Creates a searcher searching the index in the named directory. */
public IndexSearcher(String path) throws IOException {
this(IndexReader.open(path));
}
/** Creates a searcher searching the index in the provided directory. */
public IndexSearcher(Directory directory) throws IOException {
this(IndexReader.open(directory));
}
/** Creates a searcher searching the provided index. */
public IndexSearcher(IndexReader r) {
reader = r;
}
/** Frees resources associated with this Searcher. */
public final void close() throws IOException {
reader.close();
}
final int docFreq(Term term) throws IOException {
return reader.docFreq(term);
}
final Document doc(int i) throws IOException {
return reader.document(i);
}
final int maxDoc() throws IOException {
return reader.maxDoc();
}
final TopDocs search(Query query, Filter filter, final int nDocs)
throws IOException {
Scorer scorer = Query.scorer(query, this, reader);
if (scorer == null)
return new TopDocs(0, new ScoreDoc[0]);
final BitSet bits = filter != null ? filter.bits(reader) : null;
final HitQueue hq = new HitQueue(nDocs);
final int[] totalHits = new int[1];
scorer.score(new HitCollector() {
private float minScore = 0.0f;
public final void collect(int doc, float score) {
if (score > 0.0f && // ignore zeroed buckets
(bits==null || bits.get(doc))) { // skip docs not in bits
totalHits[0]++;
if (score >= minScore) {
hq.put(new ScoreDoc(doc, score)); // update hit queue
if (hq.size() > nDocs) { // if hit queue overfull
hq.pop(); // remove lowest in hit queue
minScore = ((ScoreDoc)hq.top()).score; // reset minScore
}
}
}
}
}, reader.maxDoc());
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
scoreDocs[i] = (ScoreDoc)hq.pop();
return new TopDocs(totalHits[0], scoreDocs);
}
/** Lower-level search API.
*
* <p>{@link HitCollector#collect(int,float)} is called for every non-zero
* scoring document.
*
* <p>Applications should only use this if they need <it>all</it> of the
* matching documents. The high-level search API ({@link
* Searcher#search(Query)}) is usually more efficient, as it skips
* non-high-scoring hits. */
public final void search(Query query, HitCollector results)
throws IOException {
search(query, null, results);
}
/** Lower-level search API.
*
* <p>{@link HitCollector#collect(int,float)} is called for every non-zero
* scoring document.
*
* <p>Applications should only use this if they need <it>all</it> of the
* matching documents. The high-level search API ({@link
* Searcher#search(Query)}) is usually more efficient, as it skips
* non-high-scoring hits. */
public final void search(Query query, Filter filter,
final HitCollector results) throws IOException {
HitCollector collector = results;
if (filter != null) {
final BitSet bits = filter.bits(reader);
collector = new HitCollector() {
public final void collect(int doc, float score) {
if (bits.get(doc)) { // skip docs not in bits
results.collect(doc, score);
}
}
};
}
Scorer scorer = Query.scorer(query, this, reader);
if (scorer == null)
return;
scorer.score(collector, reader.maxDoc());
}
}

View File

@ -0,0 +1,2 @@
# sub-directory makefile for lucene
include ../rules.mk

View File

@ -0,0 +1,152 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.PriorityQueue;
/** Implements search over a set of Searcher's. */
public final class MultiSearcher extends Searcher {
private Searcher[] searchers;
private int[] starts;
private int maxDoc = 0;
/** Creates a searcher which searches <i>searchers</i>. */
public MultiSearcher(Searcher[] searchers) throws IOException {
this.searchers = searchers;
starts = new int[searchers.length + 1]; // build starts array
for (int i = 0; i < searchers.length; i++) {
starts[i] = maxDoc;
maxDoc += searchers[i].maxDoc(); // compute maxDocs
}
starts[searchers.length] = maxDoc;
}
/** Frees resources associated with this Searcher. */
public final void close() throws IOException {
for (int i = 0; i < searchers.length; i++)
searchers[i].close();
}
final int docFreq(Term term) throws IOException {
int docFreq = 0;
for (int i = 0; i < searchers.length; i++)
docFreq += searchers[i].docFreq(term);
return docFreq;
}
final Document doc(int n) throws IOException {
int i = searcherIndex(n); // find searcher index
return searchers[i].doc(n - starts[i]); // dispatch to searcher
}
// replace w/ call to Arrays.binarySearch in Java 1.2
private final int searcherIndex(int n) { // find searcher for doc n:
int lo = 0; // search starts array
int hi = searchers.length - 1; // for first element less
// than n, return its index
while (hi >= lo) {
int mid = (lo + hi) >> 1;
int midValue = starts[mid];
if (n < midValue)
hi = mid - 1;
else if (n > midValue)
lo = mid + 1;
else
return mid;
}
return hi;
}
final int maxDoc() throws IOException {
return maxDoc;
}
final TopDocs search(Query query, Filter filter, int nDocs)
throws IOException {
HitQueue hq = new HitQueue(nDocs);
float minScore = 0.0f;
int totalHits = 0;
for (int i = 0; i < searchers.length; i++) { // search each searcher
TopDocs docs = searchers[i].search(query, filter, nDocs);
totalHits += docs.totalHits; // update totalHits
ScoreDoc[] scoreDocs = docs.scoreDocs;
for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq
ScoreDoc scoreDoc = scoreDocs[j];
if (scoreDoc.score >= minScore) {
scoreDoc.doc += starts[i]; // convert doc
hq.put(scoreDoc); // update hit queue
if (hq.size() > nDocs) { // if hit queue overfull
hq.pop(); // remove lowest in hit queue
minScore = ((ScoreDoc)hq.top()).score; // reset minScore
}
} else
break; // no more scores > minScore
}
}
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
scoreDocs[i] = (ScoreDoc)hq.pop();
return new TopDocs(totalHits, scoreDocs);
}
}

View File

@ -0,0 +1,161 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/** A Query that matches documents containing a subset of terms provided by a
FilteredTermEnum enumeration. MultiTermQuery is not designed to be used by
itself. The reason being that it is not intialized with a FilteredTermEnum
enumeration. A FilteredTermEnum enumeration needs to be provided. For example,
WildcardQuery and FuzzyQuery extends MultiTermQuery to provide WildcardTermEnum
and FuzzyTermEnum respectively. */
public class MultiTermQuery extends Query {
private Term term;
private FilteredTermEnum enum;
private IndexReader reader;
private float boost = 1.0f;
private BooleanQuery query;
/** Enable or disable lucene style toString(field) format */
private static boolean LUCENE_STYLE_TOSTRING = false;
/** Constructs a query for terms matching <code>term</code>. */
public MultiTermQuery(Term term) {
this.term = term;
this.query = query;
}
/** Set the TermEnum to be used */
protected void setEnum(FilteredTermEnum enum) {
this.enum = enum;
}
/** Sets the boost for this term to <code>b</code>. Documents containing
* this term will (in addition to the normal weightings) have their score
* multiplied by <code>boost</code>. */
final public void setBoost(float boost) {
this.boost = boost;
}
/** Returns the boost for this term. */
final public float getBoost() {
return boost;
}
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
return getQuery().sumOfSquaredWeights(searcher);
}
final void normalize(float norm) {
try {
getQuery().normalize(norm);
} catch (IOException e) {
throw new RuntimeException(e.toString());
}
}
final Scorer scorer(IndexReader reader) throws IOException {
return getQuery().scorer(reader);
}
final private BooleanQuery getQuery() throws IOException {
if (query == null) {
BooleanQuery q = new BooleanQuery();
try {
do {
Term t = enum.term();
if (t != null) {
TermQuery tq = new TermQuery(t); // found a match
tq.setBoost(boost * enum.difference()); // set the boost
q.add(tq, false, false); // add to q
}
} while (enum.next());
} finally {
enum.close();
}
query = q;
}
return query;
}
/** Prints a user-readable version of this query. */
public String toString(String field) {
if (!LUCENE_STYLE_TOSTRING) {
Query q = null;
try {
q = getQuery();
} catch (Exception e) {}
if (q != null) {
return "(" + q.toString(field) + ")";
}
}
StringBuffer buffer = new StringBuffer();
if (!term.field().equals(field)) {
buffer.append(term.field());
buffer.append(":");
}
buffer.append(term.text());
if (boost != 1.0f) {
buffer.append("^");
buffer.append(Float.toString(boost));
}
return buffer.toString();
}
}

View File

@ -0,0 +1,96 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.index.*;
final class PhrasePositions {
int doc; // current doc
int position; // position in doc
int count; // remaining pos in this doc
int offset; // position in phrase
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
PhrasePositions(TermPositions t, int o) throws IOException {
tp = t;
offset = o;
next();
}
final void next() throws IOException { // increments to next doc
if (!tp.next()) {
tp.close(); // close stream
doc = Integer.MAX_VALUE; // sentinel value
return;
}
doc = tp.doc();
position = 0;
}
final void firstPosition() throws IOException {
count = tp.freq(); // read first pos
nextPosition();
}
final boolean nextPosition() throws IOException {
if (count-- > 0) { // read subsequent pos's
position = tp.nextPosition() - offset;
return true;
} else
return false;
}
}

View File

@ -0,0 +1,183 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.IndexReader;
/** A Query that matches documents containing a particular sequence of terms.
This may be combined with other terms with a {@link BooleanQuery}.
*/
final public class PhraseQuery extends Query {
private String field;
private Vector terms = new Vector();
private float idf = 0.0f;
private float weight = 0.0f;
private float boost = 1.0f;
private int slop = 0;
/** Constructs an empty phrase query. */
public PhraseQuery() {
}
/** Sets the boost for this term to <code>b</code>. Documents containing
this term will (in addition to the normal weightings) have their score
multiplied by <code>b</code>. */
public final void setBoost(float b) { boost = b; }
/** Gets the boost for this term. Documents containing
this term will (in addition to the normal weightings) have their score
multiplied by <code>b</code>. The boost is 1.0 by default. */
public final float getBoost() { return boost; }
/** Sets the number of other words permitted between words in query phrase.
If zero, then this is an exact phrase search. For larger values this works
like a <code>WITHIN</code> or <code>NEAR</code> operator.
<p>The slop is in fact an edit-distance, where the units correspond to
moves of terms in the query phrase out of position. For example, to switch
the order of two words requires two moves (the first move places the words
atop one another), so to permit re-orderings of phrases, the slop must be
at least two.
<p>More exact matches are scored higher than sloppier matches, thus search
results are sorted by exactness.
<p>The slop is zero by default, requiring exact matches.*/
public final void setSlop(int s) { slop = s; }
/** Returns the slop. See setSlop(). */
public final int getSlop() { return slop; }
/** Adds a term to the end of the query phrase. */
public final void add(Term term) {
if (terms.size() == 0)
field = term.field();
else if (term.field() != field)
throw new IllegalArgumentException
("All phrase terms must be in the same field: " + term);
terms.addElement(term);
}
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
for (int i = 0; i < terms.size(); i++) // sum term IDFs
idf += Similarity.idf((Term)terms.elementAt(i), searcher);
weight = idf * boost;
return weight * weight; // square term weights
}
final void normalize(float norm) {
weight *= norm; // normalize for query
weight *= idf; // factor from document
}
final Scorer scorer(IndexReader reader) throws IOException {
if (terms.size() == 0) // optimize zero-term case
return null;
if (terms.size() == 1) { // optimize one-term case
Term term = (Term)terms.elementAt(0);
TermDocs docs = reader.termDocs(term);
if (docs == null)
return null;
return new TermScorer(docs, reader.norms(term.field()), weight);
}
TermPositions[] tps = new TermPositions[terms.size()];
for (int i = 0; i < terms.size(); i++) {
TermPositions p = reader.termPositions((Term)terms.elementAt(i));
if (p == null)
return null;
tps[i] = p;
}
if (slop == 0) // optimize exact case
return new ExactPhraseScorer(tps, reader.norms(field), weight);
else
return
new SloppyPhraseScorer(tps, slop, reader.norms(field), weight);
}
/** Prints a user-readable version of this query. */
public final String toString(String f) {
StringBuffer buffer = new StringBuffer();
if (!field.equals(f)) {
buffer.append(field);
buffer.append(":");
}
buffer.append("\"");
for (int i = 0; i < terms.size(); i++) {
buffer.append(((Term)terms.elementAt(i)).text());
if (i != terms.size()-1)
buffer.append(" ");
}
buffer.append("\"");
if (boost != 1.0f) {
buffer.append("^");
buffer.append(Float.toString(boost));
}
return buffer.toString();
}
}

View File

@ -0,0 +1,72 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.util.PriorityQueue;
final class PhraseQueue extends PriorityQueue {
PhraseQueue(int size) {
initialize(size);
}
protected final boolean lessThan(Object o1, Object o2) {
PhrasePositions pp1 = (PhrasePositions)o1;
PhrasePositions pp2 = (PhrasePositions)o2;
if (pp1.doc == pp2.doc)
return pp1.position < pp2.position;
else
return pp1.doc < pp2.doc;
}
}

View File

@ -0,0 +1,124 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Vector;
import org.apache.lucene.util.*;
import org.apache.lucene.index.*;
abstract class PhraseScorer extends Scorer {
protected byte[] norms;
protected float weight;
protected PhraseQueue pq;
protected PhrasePositions first, last;
PhraseScorer(TermPositions[] tps, byte[] n, float w) throws IOException {
norms = n;
weight = w;
// use PQ to build a sorted list of PhrasePositions
pq = new PhraseQueue(tps.length);
for (int i = 0; i < tps.length; i++)
pq.put(new PhrasePositions(tps[i], i));
pqToList();
}
final void score(HitCollector results, int end) throws IOException {
while (last.doc < end) { // find doc w/ all the terms
while (first.doc < last.doc) { // scan forward in first
do {
first.next();
} while (first.doc < last.doc);
firstToLast();
if (last.doc >= end)
return;
}
// found doc with all terms
float freq = phraseFreq(); // check for phrase
if (freq > 0.0) {
float score = Similarity.tf(freq)*weight; // compute score
score *= Similarity.norm(norms[first.doc]); // normalize
results.collect(first.doc, score); // add to results
}
last.next(); // resume scanning
}
}
abstract protected float phraseFreq() throws IOException;
protected final void pqToList() {
last = first = null;
while (pq.top() != null) {
PhrasePositions pp = (PhrasePositions)pq.pop();
if (last != null) { // add next to end of list
last.next = pp;
} else
first = pp;
last = pp;
pp.next = null;
}
}
protected final void firstToLast() {
last.next = first; // move first to end of list
last = first;
first = first.next;
last.next = null;
}
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
/** A Query that matches documents containing terms with a specified prefix. */
final public class PrefixQuery extends Query {
private Term prefix;
private IndexReader reader;
private float boost = 1.0f;
private BooleanQuery query;
/** Constructs a query for terms starting with <code>prefix</code>. */
public PrefixQuery(Term prefix) {
this.prefix = prefix;
this.reader = reader;
}
/** Sets the boost for this term to <code>b</code>. Documents containing
this term will (in addition to the normal weightings) have their score
multiplied by <code>boost</code>. */
public void setBoost(float boost) {
this.boost = boost;
}
/** Returns the boost for this term. */
public float getBoost() {
return boost;
}
final void prepare(IndexReader reader) {
this.query = null;
this.reader = reader;
}
final float sumOfSquaredWeights(Searcher searcher)
throws IOException {
return getQuery().sumOfSquaredWeights(searcher);
}
void normalize(float norm) {
try {
getQuery().normalize(norm);
} catch (IOException e) {
throw new RuntimeException(e.toString());
}
}
Scorer scorer(IndexReader reader) throws IOException {
return getQuery().scorer(reader);
}
private BooleanQuery getQuery() throws IOException {
if (query == null) {
BooleanQuery q = new BooleanQuery();
TermEnum enum = reader.terms(prefix);
try {
String prefixText = prefix.text();
String prefixField = prefix.field();
do {
Term term = enum.term();
if (term != null &&
term.text().startsWith(prefixText) &&
term.field() == prefixField) {
TermQuery tq = new TermQuery(term); // found a match
tq.setBoost(boost); // set the boost
q.add(tq, false, false); // add to q
//System.out.println("added " + term);
} else {
break;
}
} while (enum.next());
} finally {
enum.close();
}
query = q;
}
return query;
}
/** Prints a user-readable version of this query. */
public String toString(String field) {
StringBuffer buffer = new StringBuffer();
if (!prefix.field().equals(field)) {
buffer.append(prefix.field());
buffer.append(":");
}
buffer.append(prefix.text());
buffer.append('*');
if (boost != 1.0f) {
buffer.append("^");
buffer.append(Float.toString(boost));
}
return buffer.toString();
}
}

View File

@ -0,0 +1,101 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.util.Hashtable;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
/** The abstract base class for queries.
<p>Instantiable subclasses are:
<ul>
<li> {@link TermQuery}
<li> {@link PhraseQuery}
<li> {@link BooleanQuery}
</ul>
<p>A parser for queries is contained in:
<ul>
<li><a href="doc/lucene.queryParser.QueryParser.html">QueryParser</a>
</ul>
*/
abstract public class Query {
// query weighting
abstract float sumOfSquaredWeights(Searcher searcher) throws IOException;
abstract void normalize(float norm);
// query evaluation
abstract Scorer scorer(IndexReader reader) throws IOException;
void prepare(IndexReader reader) {}
static Scorer scorer(Query query, Searcher searcher, IndexReader reader)
throws IOException {
query.prepare(reader);
float sum = query.sumOfSquaredWeights(searcher);
float norm = 1.0f / (float)Math.sqrt(sum);
query.normalize(norm);
return query.scorer(reader);
}
/** Prints a query to a string, with <code>field</code> as the default field
for terms.
<p>The representation used is one that is readable by
<a href="doc/lucene.queryParser.QueryParser.html">QueryParser</a>
(although, if the query was created by the parser, the printed
representation may not be exactly what was parsed). */
abstract public String toString(String field);
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
final class ScoreDoc {
float score;
int doc;
ScoreDoc(int d, float s) {
doc = d;
score = s;
}
}

Some files were not shown because too many files have changed in this diff Show More