From 672827d4a423e41cafccbb03a5b3132682192725 Mon Sep 17 00:00:00 2001
From: Otis Gospodnetic
Date: Fri, 22 Apr 2005 04:30:42 +0000
Subject: [PATCH] - Not referenced from anywhere, and not really needed

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@164169 13f79535-47bb-0310-9956-ffa450edef68
---
 docs/luceneplan.html | 648 -------------------------------------------
 xdocs/luceneplan.xml | 380 --------------------------
 2 files changed, 1028 deletions(-)
 delete mode 100644 docs/luceneplan.html
 delete mode 100644 xdocs/luceneplan.xml

diff --git a/docs/luceneplan.html b/docs/luceneplan.html
deleted file mode 100644
index 5fb0b0b614d..00000000000
--- a/docs/luceneplan.html
+++ /dev/null
@@ -1,648 +0,0 @@

Apache Lucene - Plan for enhancements to Lucene

Purpose

The purpose of this document is to outline plans for making Apache Lucene work as a more general drop-in component. It assumes that this is an objective for the Lucene user and development community.

The best reference is htDig. Though it is not quite as sophisticated as Lucene, it has a number of features that make it desirable. It is, however, a traditional C-compiled app, which makes it somewhat unpleasant to install on some platforms (like Solaris!).

This plan is being submitted to the Lucene developer community for an initial reaction, advice, feedback and consent. Following this it will be submitted to the Lucene user community for support. Although I (Andy Oliver) am capable of providing these enhancements by myself, I'd of course prefer to work on them in concert with others.

While I'm laying out a fairly large feature set, these can of course be implemented incrementally (and are probably best done that way).

Goal and Objectives

The goal is to provide features for Lucene that allow it to be used as a drop-in search engine. It should provide many of the features of projects like htDig while surpassing them with unique Lucene features and capabilities, such as easy installation on any Java-supporting platform, support for document fields and field searches, and, of course, a pragmatic software license.

To reach this goal we'll implement code to support the following objectives, which augment but do not replace the current Lucene feature set.

  • Document Location Independence - mapping real contexts to runtime contexts. Essentially, if the document is at /var/www/htdocs/mydoc.html, I probably want it indexed as http://www.bigevilmegacorp.com/mydoc.html (see the sketch after this list).

  • Standard methods of creating central indices - file system indexing is probably less useful in many environments than *remote* indexing (for instance over HTTP). I would suggest that most folks would prefer that general functionality be supported by Lucene instead of having to write code for every indexing project. Obviously, if what they are doing is *special* they'll have to code, but general document indexing across web servers would not qualify.

  • Document interpretation abstraction - currently one must handle document object construction via custom code. A standard interface for plugging in format handlers should be supported.

  • MIME and file-extension to document interpretation mapping.

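A minimal sketch of that first objective, assuming a hypothetical helper class (nothing here is existing Lucene code):

  // Maps a document's real (filesystem) context to the runtime context
  // it should be indexed under; names are illustrative only.
  public final class ContextMapper {
      private final String rootContext;  // e.g. "/var/www/htdocs"
      private final String replaceWith;  // e.g. "http://www.bigevilmegacorp.com"

      public ContextMapper(String rootContext, String replaceWith) {
          this.rootContext = rootContext;
          this.replaceWith = replaceWith;
      }

      public String toRuntimeContext(String realPath) {
          if (!realPath.startsWith(rootContext)) {
              return realPath; // outside the root context: leave it untouched
          }
          return replaceWith + realPath.substring(rootContext.length());
      }
  }

With the values above, "/var/www/htdocs/mydoc.html" maps to "http://www.bigevilmegacorp.com/mydoc.html".
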
Crawlers

Crawlers are the executable code for a data source. They crawl a file system, FTP site, web site, etc. to create the index. These standard crawlers may not make ALL of Lucene's functionality available, though they should be able to make most of it available through configuration.

Abstract Crawler

The AbstractCrawler is basically the parent of all Crawler classes. It provides implementations of the following functions/properties (a skeleton sketch follows the list):

  • index path - where to write the index.

  • cui - create or update the index.

  • root context - the start of the pathname that should be replaced by the replace with property, or dropped entirely. Example: /opt/tomcat/webapps

  • replace with - when specified, replaces the root context. Example: http://jakarta.apache.org

  • replacement type - the type of replace with path: relative, URL or path.

  • location - the location to start indexing at.

  • doctypes - only index documents with these doctypes. If not specified, all registered mime-types are used. Example: "xml,doc,html"

  • recursive - off unless specified.

  • level - optional level of directories or links to traverse. By default assumed to be infinite. Recursive must be turned on or this is ignored. Range: 0 - Long.MAX_VALUE.

  • SleeptimeBetweenCalls - can be used to avoid flooding a machine with too many requests.

  • RequestTimeout - kill the crawler request after the specified period of inactivity.

  • IncludeFilter - include only items matching the filter (can occur multiple times).

  • ExcludeFilter - exclude items matching the filter (can occur multiple times).

  • ExpandOnly - use but do not index items that match this pattern (regex?) (can occur multiple times).

  • NoExpand - index but do not follow the links in items that match this pattern (regex?) (can occur multiple times).

  • MaxItems - stop indexing after x documents have been indexed.

  • MaxMegs - stop indexing after x megabytes have been indexed (should this be in specific crawlers?).

  • properties - in addition to the settings (probably from the command line), read this properties file and get them from it. Command-line options override the properties file in the case of duplicates. There should also be an environment variable or VM parameter to set this.

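A skeleton of what this could look like, assuming configuration arrives as java.util.Properties; every name here is hypothetical rather than existing Lucene API:

  import java.util.Properties;

  // Hypothetical AbstractCrawler skeleton mirroring the properties above.
  public abstract class AbstractCrawler {
      protected String indexPath;               // index path: where to write the index
      protected boolean create = true;          // cui: create (true) or update (false)
      protected String rootContext;             // e.g. /opt/tomcat/webapps
      protected String replaceWith;             // e.g. http://jakarta.apache.org
      protected String location;                // where to start indexing
      protected boolean recursive = false;      // off unless specified
      protected long level = Long.MAX_VALUE;    // ignored unless recursive is on
      protected long sleeptimeBetweenCalls;     // millis to wait between requests
      protected long requestTimeout;            // millis of inactivity before giving up
      protected long maxItems = Long.MAX_VALUE; // stop after this many documents

      // Command-line options override the properties file on duplicates.
      public void configure(Properties fromFile, Properties fromCommandLine) {
          Properties merged = new Properties(fromFile); // file values as defaults
          merged.putAll(fromCommandLine);               // command line wins
          indexPath = merged.getProperty("index.path", indexPath);
          location  = merged.getProperty("location", location);
          recursive = "true".equals(merged.getProperty("recursive", "false"));
          // ... remaining properties would be read the same way
      }

      // Each concrete crawler (file system, HTTP, ...) supplies the traversal.
      public abstract void crawl() throws Exception;
  }
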
FileSystemCrawler

This should extend the AbstractCrawler and support any additional options required for a file system index.

HTTP Crawler

Supports the AbstractCrawler options as well as the following (a subclass sketch follows this list):

  • span hosts - whether to span hosts or not; by default this should be no.

  • restrict domains - (ignored if span hosts is not enabled) whether all spanned hosts must be in the same domain (default is off).

  • try directories - whether to attempt directory listings or not (so if you recurse and go to /nextcontext/index.html, this option says to also try /nextcontext to get the directory listing).

  • map extensions - one of always/default/never/fallback: always use extension mapping; use it by default (falling back to the mime type); never use it; or use it only as a fallback when no mime type is available (the default).

  • ignore robots - ignore robots.txt, on or off (default: off).

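Continuing the hypothetical AbstractCrawler skeleton sketched earlier, the HTTP-specific options might hang off a subclass like this (again, not existing Lucene code):

  // Hypothetical HTTPCrawler extending the AbstractCrawler sketch above.
  public class HTTPCrawler extends AbstractCrawler {
      private boolean spanHosts = false;         // proposed default: no
      private boolean restrictDomains = false;   // only consulted when spanHosts is on
      private boolean tryDirectories = false;    // also fetch /nextcontext for a listing
      private String mapExtensions = "fallback"; // always | default | never | fallback
      private boolean ignoreRobots = false;      // proposed default: honor robots.txt

      public void crawl() throws Exception {
          // Fetch location, honor robots.txt unless ignoreRobots is set,
          // follow links while recursive/level/filters allow, and hand each
          // response to the DocumentFactory selected through the MIMEMap below.
      }
  }
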
MIMEMap

A configurable registry of document types: each entry carries a description, an identifier, a mime-type and file extensions. This should map both MIME -> factory and extension -> factory.

This might be configured at compile time or by a properties file, etc. For example:

  Description      Identifier  Extensions  MimeType                   DocumentFactory
  "Word Document"  "doc"       "doc"       "vnd.application/ms-word"  POIWordDocumentFactory
  "HTML Document"  "html"      "html,htm"  (none)                     HTMLDocumentFactory

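A sketch of such a registry, assuming the DocumentFactory interface proposed in the next section (all names hypothetical; description and identifier omitted for brevity):

  import java.util.HashMap;
  import java.util.Map;

  // Hypothetical MIMEMap: maps both mime-type -> factory and
  // extension -> factory, as described above.
  public class MIMEMap {
      private final Map byMimeType  = new HashMap(); // mime-type -> DocumentFactory
      private final Map byExtension = new HashMap(); // extension -> DocumentFactory

      public void register(String mimeType, String[] extensions, DocumentFactory factory) {
          if (mimeType != null) {
              byMimeType.put(mimeType, factory);
          }
          for (int i = 0; i < extensions.length; i++) {
              byExtension.put(extensions[i], factory);
          }
      }

      // Mime lookup first, falling back to the file extension.
      public DocumentFactory lookup(String mimeType, String extension) {
          DocumentFactory factory = (DocumentFactory) byMimeType.get(mimeType);
          return factory != null ? factory : (DocumentFactory) byExtension.get(extension);
      }
  }
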
DocumentFactory

An interface for classes which create document objects for particular file types. Examples: HTMLDocumentFactory, DOCDocumentFactory, XLSDocumentFactory, XMLDocumentFactory.

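One possible shape for that interface; only org.apache.lucene.document.Document is real Lucene API, the rest is an assumption:

  import java.io.IOException;
  import java.io.InputStream;
  import org.apache.lucene.document.Document;

  // Hypothetical DocumentFactory contract: turn one source file's raw
  // bytes into a Lucene Document.
  public interface DocumentFactory {
      Document createDocument(String runtimeContext, InputStream content)
              throws IOException;
  }
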
FieldMapping classes

A class that maps standard fields from the DocumentFactories into *fields* in the Document objects they create. I suggest that a regular expression system or XPath might be the most universal way to do this. For instance, if I had an XML factory that represented XML elements as fields, I could map content from particular elements to particular fields or suppress them entirely. We could even make this configurable.

For example:

  htmldoc.properties:

    suppress=*
    author=content:g/author\:\ ........................................./
    author.suppress=false
    title=content:g/title\:\ ........................................./
    title.suppress=false

In this example we map HTML documents such that all fields are suppressed except author and title. We map author and title to anything in the content matching "author: " (and some fixed number of characters). Okay, my regular expressions suck, but hopefully you get the idea; a small sketch of applying one such rule follows.

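A sketch of applying one such mapping rule with java.util.regex; the class name, rule format and capture-group convention are all assumptions:

  import java.util.regex.Matcher;
  import java.util.regex.Pattern;

  // Hypothetical FieldMapper: extracts one mapped field's value from
  // a document's content using a configured regular expression.
  public class FieldMapper {
      private final String fieldName; // e.g. "author"
      private final Pattern pattern;  // must contain one capture group

      public FieldMapper(String fieldName, String regex) {
          this.fieldName = fieldName;
          this.pattern = Pattern.compile(regex);
      }

      public String getFieldName() {
          return fieldName;
      }

      // Returns the mapped value, or null when the content has no match.
      public String extract(String content) {
          Matcher m = pattern.matcher(content);
          return m.find() ? m.group(1) : null;
      }
  }

So new FieldMapper("author", "author:\\s*(.{1,40})").extract(pageText) would populate the author field, while suppress=* keeps everything else out of the index.
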
Final Thoughts

We might also consider eliminating the DocumentFactory entirely by making an AbstractDocument from which the current Document class would inherit. I experimented with this locally; it was a relatively minor code change, and there was of course no difference in performance. The DocumentFactory classes would instead be replaced by instances of various subclasses of AbstractDocument.

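Roughly, that alternative could look like the following; Lucene's Document has no such parent today, so this is purely illustrative:

  import java.io.IOException;
  import java.io.InputStream;

  // Hypothetical AbstractDocument: the existing Document class would
  // inherit from this, and format handlers would subclass it instead
  // of implementing DocumentFactory.
  public abstract class AbstractDocument {
      // Format-specific subclasses parse raw content and add their fields.
      public abstract void load(InputStream content) throws IOException;
  }

  // e.g. an HTMLDocument subclass would parse HTML in load() and add
  // title/author/content fields to itself.
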
My inspiration for this is HTDig (http://www.htdig.org/). While this goes slightly beyond what HTDig provides by adding field mapping (HTDig is just interested in strings/numbers wherever they are found), it provides at least what I would need to use this as a drop-in at most places I contract at (with the obvious exception of a default set of content handlers, which would of course develop naturally over time).

I can certainly contribute to this effort if the development community is open to it. I'd suggest we do it iteratively, in stages, and not aim for all of this at once (for instance, leave out the field mapping at first).

Anyhow, please give me some feedback or counter-suggestions, and let me know if I'm way off base or out of line, etc. -Andy

diff --git a/xdocs/luceneplan.xml b/xdocs/luceneplan.xml
deleted file mode 100644
index 5ca0ec7af3e..00000000000
--- a/xdocs/luceneplan.xml
+++ /dev/null