diff --git a/BUILD.txt b/BUILD.txt index ea1ab5063f6..df2ec4c1d9f 100644 --- a/BUILD.txt +++ b/BUILD.txt @@ -3,15 +3,15 @@ Lucene Build Instructions $Id$ Basic steps: - 0) Install JDK 1.2 (or greater), Ant 1.4 (or greater), and the Ant + 0) Install JDK 1.2 (or greater), Ant 1.5 (or greater), and the Ant optional.jar 1) Download Lucene from Apache and unpack it 2) Connect to the top-level of your Lucene installation - 3) Install JavaCC + 3) Install JavaCC (optional) 4) Run ant Step 0) Set up your development environment (JDK 1.2 or greater, -Ant 1.4 or greater) +Ant 1.5 or greater) We'll assume that you know how to get and set up the JDK - if you don't, then we suggest starting at http://java.sun.com and learning @@ -22,26 +22,22 @@ with the development version of Lucene, we recommend you stick with the most current version of Java (at the time of this writing, JDK 1.4). Also, note that if you're working with the Lucene source, you'll need to use Ant (see below) and Ant requires at least JDK 1.1 -(and in the future will likely move to requiring JDK 1.2, according to +(and in the future will move to requiring JDK 1.2, according to the Ant install docs). Like most of the Jakarta projects, Lucene uses Apache Ant for build -control. Specifically, you MUST use Ant version 1.4 or greater. +control. Specifically, you MUST use Ant version 1.5 or greater. Ant is "kind of like make without make's wrinkles". Ant is implemented in java and uses XML-based configuration files. You can get it at: - http://jakarta.apache.org/ant - -Specifically, you can get the binary distributions at: - - http://jakarta.apache.org/builds/jakarta-ant/release/ + http://ant.apache.org You'll need to download both the Ant binary distribution and the "optional" jar file. Install these according to the instructions at: - http://jakarta.apache.org/ant/manual + http://ant.apache.org/manual Step 1) Download Lucene from Apache @@ -79,21 +75,16 @@ NOTE: the ~ character represents your user account home directory. Step 3) Install JavaCC -Building the Lucene distribution from the source requires the JavaCC -parser generator. This software has a separate license agreement that -must be agreed to before you can use it. The web page for JavaCC is here: +Building the Lucene distribution from the source does not require the JavaCC +parser generator, but if you wish to regenerate any of the pre-generated +parser pieces, you will need to install JavaCC. - http://www.experimentalstuff.com/Technologies/JavaCC/ + http://javacc.dev.java.net Follow the download links and download the zip file to a temporary -location on your file system. Unzip the file and run the large class file -in the directory. On windows, use this command from the temp directory: +location on your file system. - java -cp . JavaCC2_1 - -This will launch a Java GUI installer. There is also a command line -installer available, and the installation class will give you those -directions. After JavaCC is installed, edit your build properties +After JavaCC is installed, edit your build.properties (as in step 2), and add the line javacc.home=/javacc/bin @@ -107,14 +98,16 @@ location of your ant installation, typing "ant" at the shell prompt and command prompt should run ant. Ant will by default look for the "build.xml" file in your current directory, and compile Lucene. +To rebuild any of the JavaCC-based parsers, run "ant javacc". + For further information on Lucene, go to: http://jakarta.apache.org/lucene/ Please join the Lucene-User mailing list by visiting this site: http://jakarta.apache.org/site/mail.html - + Please post suggestions, questions, corrections or additions to this -document to the Lucene-User mailing list. +document to the lucene-user mailing list. This file was originally written by Steven J. Owens . This file was modified by Jon S. Stevens . diff --git a/build.xml b/build.xml index 241a0c67796..49643336db2 100644 --- a/build.xml +++ b/build.xml @@ -9,6 +9,8 @@ + + @@ -52,8 +54,8 @@ - - + + ################################################################## JavaCC not found. JavaCC Home: ${javacc.home} - JavaCC Zip: ${javacc.zip} + JavaCC Zip: ${javacc.jar} - Please download and install JavaCC 2.0 from: + Please download and install JavaCC from: - <http://www.experimentalstuff.com/Technologies/JavaCC/> + <http://javacc.dev.java.net> Then, create a build.properties file either in your home directory, or within the Lucene directory and set the javacc.home property to the path where JavaCC.zip is located. For example, - if you installed JavaCC in /usr/local/java/javacc2.0, then set the + if you installed JavaCC in /usr/local/java/javacc3.2, then set the javacc.home property to: javacc.home=/usr/local/java/javacc2.0/bin @@ -89,9 +91,10 @@ If you get an error like the one below, then you have not installed things correctly. Please check all your paths and try again. - java.lang.NoClassDefFoundError: COM/sun/labs/javacc/Main + java.lang.NoClassDefFoundError: org.javacc.parser.Main ################################################################## - + + @@ -99,25 +102,10 @@ - - - - - - - - + @@ -135,7 +123,7 @@ - + Manifest-Version: 1.0 @@ -158,7 +146,7 @@ Implementation-Vendor: Lucene /> - + - + @@ -202,22 +190,8 @@ Implementation-Vendor: Lucene - + - - - - - - @@ -228,7 +202,7 @@ Implementation-Vendor: Lucene - + @@ -239,11 +213,6 @@ Implementation-Vendor: Lucene - - + --> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/default.properties b/default.properties index fd4607e7393..e85de7880e9 100644 --- a/default.properties +++ b/default.properties @@ -58,8 +58,8 @@ junit.reports = ${build.dir}/unit-reports # Home directory of JavaCC javacc.home = . -javacc.zip.dir = ${javacc.home}/lib -javacc.zip = ${javacc.zip.dir}/JavaCC.zip +javacc.zip.dir = ${javacc.home}/bin/lib +javacc.jar = ${javacc.zip.dir}/javacc.jar # Home directory of jakarta-site2 jakarta.site2.home = ../jakarta-site2 diff --git a/src/demo/org/apache/lucene/demo/html/HTMLParser.java b/src/demo/org/apache/lucene/demo/html/HTMLParser.java new file mode 100644 index 00000000000..e05fa0aab57 --- /dev/null +++ b/src/demo/org/apache/lucene/demo/html/HTMLParser.java @@ -0,0 +1,688 @@ +/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */ +package org.apache.lucene.demo.html; + +import java.io.*; +import java.util.Properties; + +public class HTMLParser implements HTMLParserConstants { + public static int SUMMARY_LENGTH = 200; + + StringBuffer title = new StringBuffer(SUMMARY_LENGTH); + StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); + Properties metaTags=new Properties(); + String currentMetaTag=""; + int length = 0; + boolean titleComplete = false; + boolean inTitle = false; + boolean inMetaTag = false; + boolean inStyle = false; + boolean inScript = false; + boolean afterTag = false; + boolean afterSpace = false; + String eol = System.getProperty("line.separator"); + PipedReader pipeIn = null; + PipedWriter pipeOut; + + public HTMLParser(File file) throws FileNotFoundException { + this(new FileInputStream(file)); + } + + public String getTitle() throws IOException, InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (titleComplete || (length > SUMMARY_LENGTH)) + break; + wait(10); + } + } + return title.toString().trim(); + } + + public Properties getMetaTags() throws IOException, +InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (titleComplete || (length > SUMMARY_LENGTH)) + break; + wait(10); + } + } + return metaTags; + } + + + public String getSummary() throws IOException, InterruptedException { + if (pipeIn == null) + getReader(); // spawn parsing thread + while (true) { + synchronized(this) { + if (summary.length() >= SUMMARY_LENGTH) + break; + wait(10); + } + } + if (summary.length() > SUMMARY_LENGTH) + summary.setLength(SUMMARY_LENGTH); + + String sum = summary.toString().trim(); + String tit = getTitle(); + if (sum.startsWith(tit)) + return sum.substring(tit.length()); + else + return sum; + } + + public Reader getReader() throws IOException { + if (pipeIn == null) { + pipeIn = new PipedReader(); + pipeOut = new PipedWriter(pipeIn); + + Thread thread = new ParserThread(this); + thread.start(); // start parsing + } + + return pipeIn; + } + + void addToSummary(String text) { + if (summary.length() < SUMMARY_LENGTH) { + summary.append(text); + if (summary.length() >= SUMMARY_LENGTH) { + synchronized(this) { + notifyAll(); + } + } + } + } + + void addText(String text) throws IOException { + if (inScript) + return; + if (inStyle) + return; + if (inMetaTag) + { + metaTags.setProperty(currentMetaTag, text); + return; + } + if (inTitle) + title.append(text); + else { + addToSummary(text); + if (!titleComplete && !title.equals("")) { // finished title + synchronized(this) { + titleComplete = true; // tell waiting threads + notifyAll(); + } + } + } + + length += text.length(); + pipeOut.write(text); + + afterSpace = false; + } + + void addSpace() throws IOException { + if (inScript) + return; + if (!afterSpace) { + if (inTitle) + title.append(" "); + else + addToSummary(" "); + + String space = afterTag ? eol : " "; + length += space.length(); + pipeOut.write(space); + afterSpace = true; + } + } + + final public void HTMLDocument() throws ParseException, IOException { + Token t; + label_1: + while (true) { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case TagName: + case DeclName: + case Comment1: + case Comment2: + case Word: + case Entity: + case Space: + case Punct: + ; + break; + default: + jj_la1[0] = jj_gen; + break label_1; + } + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case TagName: + Tag(); + afterTag = true; + break; + case DeclName: + t = Decl(); + afterTag = true; + break; + case Comment1: + case Comment2: + CommentTag(); + afterTag = true; + break; + case Word: + t = jj_consume_token(Word); + addText(t.image); afterTag = false; + break; + case Entity: + t = jj_consume_token(Entity); + addText(Entities.decode(t.image)); afterTag = false; + break; + case Punct: + t = jj_consume_token(Punct); + addText(t.image); afterTag = false; + break; + case Space: + jj_consume_token(Space); + addSpace(); afterTag = false; + break; + default: + jj_la1[1] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + jj_consume_token(0); + } + + final public void Tag() throws ParseException, IOException { + Token t1, t2; + boolean inImg = false; + t1 = jj_consume_token(TagName); + inTitle = t1.image.equalsIgnoreCase(" + inMetaTag = t1.image.equalsIgnoreCase(" + inStyle = t1.image.equalsIgnoreCase(" + inImg = t1.image.equalsIgnoreCase(" + if (inScript) { // keep track if in