LUCENE-10166: removed module-level README.txt and modified a few links, removed a few obsolete instructions from 20 years ago. (#379)

2021-10-19 09:45:49 +02:00 · 2021-10-19 09:45:49 +02:00 · e290f91bb2
parent 6f67e8287f
commit e290f91bb2
11 changed files with 18 additions and 124 deletions
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -52,7 +52,7 @@ The snowball stopword lists in
  analysis/common/src/resources/org/apache/lucene/analysis/snowball
 were developed by Martin Porter and Richard Boulton.
 The full snowball package is available from
-  http://snowball.tartarus.org/
+  https://snowballstem.org/

 The KStem stemmer in
  analysis/common/src/org/apache/lucene/analysis/en
--- a/lucene/analysis/README.txt
+++ b/lucene/analysis/README.txt
@@ -1,70 +0,0 @@
-Analysis README file
-
-INTRODUCTION
-
-The Analysis Module provides analysis capabilities to Lucene and Solr
-applications.
-
-The Lucene web site is at:
-  http://lucene.apache.org/
-
-Please join the Lucene-User mailing list by sending a message to:
-  java-user-subscribe@lucene.apache.org
-
-FILES
-
-lucene-analysis-common-XX.jar
-  The primary analysis module library, containing general-purpose analysis
-  components and support for various languages.
-
-lucene-analysis-icu-XX.jar
-  An add-on analysis library that provides improved Unicode support via
-  International Components for Unicode (ICU). Note: this module depends on
-  the ICU4j jar file (version >= 4.6.0)
-
-lucene-analysis-kuromoji-XX.jar
-  An analyzer with morphological analysis for Japanese.
-
-lucene-analysis-morfologik-XX.jar
-  An analyzer using the Morfologik stemming library.
-
-lucene-analysis-nori-XX.jar
-  An analyzer with morphological analysis for Korean.
-
-lucene-analysis-opennlp-XX.jar
-  An analyzer using the OpenNLP natural-language processing library.
-
-lucene-analysis-phonetic-XX.jar
-  An add-on analysis library that provides phonetic encoders via Apache
-  Commons-Codec. Note: this module depends on the commons-codec jar 
-  file
-  
-lucene-analysis-smartcn-XX.jar
-  An add-on analysis library that provides word segmentation for Simplified
-  Chinese.
-
-lucene-analysis-stempel-XX.jar
-  An add-on analysis library that contains a universal algorithmic stemmer,
-  including tables for the Polish language.
-
-common/src/java
-icu/src/java
-kuromoji/src/java
-morfologik/src/java
-nori/src/java
-opennlp/src/java
-phonetic/src/java
-smartcn/src/java
-stempel/src/java
-  The source code for the libraries.
-
-common/src/test
-icu/src/test
-kuromoji/src/test
-morfologik/src/test
-nori/src/test
-opennlp/src/test
-phonetic/src/test
-smartcn/src/test
-stempel/src/test
-  Unit tests for the libraries.
--- a/lucene/analysis/common/README.txt
+++ b/lucene/analysis/common/README.txt
@@ -1,22 +0,0 @@
-Lucene Analyzers README file
-
-This project provides pre-compiled version of the Snowball stemmers,
-now located at https://github.com/snowballstem/snowball/tree/53739a805cfa6c77ff8496dc711dc1c106d987c1 (GitHub),
-together with classes integrating them with the Lucene search engine.
-
-The snowball tree needs patches applied to properly generate efficient code for lucene.
-You can regenerate everything with 'gradlew snowball'
-Refer to gradle/generation/snowball* files in the build for upgrading snowball.
-
-IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!
-
-An index created using the Snowball module in Lucene 2.3.2 and below
-might not be compatible with the Snowball module in Lucene 2.4 or greater.
-
-For more information about this issue see:
-https://issues.apache.org/jira/browse/LUCENE-1142
-
-
-For more information on Snowball, see:
-  http://snowball.tartarus.org/
-
--- a/lucene/analysis/common/checksums.properties
+++ b/lucene/analysis/common/checksums.properties
@@ -1,2 +0,0 @@
-
-checksum.jflexClassicTokenizerImpl=8c4eac5fd02be551e666783df5531afda23cbc96
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java
@@ -24,8 +24,8 @@ import org.apache.lucene.analysis.util.StemmerUtil;

 /**
 * Normalizes German characters according to the heuristics of the <a
- * href="http://snowball.tartarus.org/algorithms/german2/stemmer.html">German2 snowball
- * algorithm</a>. It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
+ * href="https://snowballstem.org/algorithms/german2/stemmer.html">German2 snowball algorithm</a>.
+ * It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
 *
 * <ul>
 *   <li>'ß' is replaced by 'ss'
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java
@@ -23,7 +23,7 @@ package org.apache.lucene.analysis.en;
       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
       no. 3, pp 130-137,

-   See also http://www.tartarus.org/~martin/PorterStemmer/index.html
+   See also https://snowballstem.org/algorithms/porter/stemmer.html

   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/package-info.java
@@ -17,30 +17,21 @@

 /**
 * {@link org.apache.lucene.analysis.TokenFilter} and {@link org.apache.lucene.analysis.Analyzer}
- * implementations that use Snowball stemmers.
+ * implementations that use a modified version of <a href="https://snowballstem.org/">Snowball
+ * stemmers</a>. See <a href="https://snowballstem.org/">Snowball project page</a> for more
+ * information about the original algorithms used.
 *
- * <p>This project provides pre-compiled version of the Snowball stemmers based on revision 500 of
- * the Tartarus Snowball repository, together with classes integrating them with the Lucene search
- * engine.
+ * <p>Lucene snowball classes require a few patches to the original Snowball source tree to generate
+ * more efficient code.
 *
- * <p>A few changes has been made to the static Snowball code and compiled stemmers:
- *
- * <ul>
- *   <li>Class SnowballProgram is made abstract and contains new abstract method stem() to avoid
- *       reflection in Lucene filter class SnowballFilter.
- *   <li>All use of StringBuffers has been refactored to StringBuilder for speed.
- *   <li>Snowball BSD license header has been added to the Java classes to avoid having RAT adding
- *       ASL headers.
- * </ul>
- *
- * <p>See the Snowball <a href ="http://snowball.tartarus.org/">home page</a> for more information
- * about the algorithms.
+ * <p>Refer to {@code gradle/generation/snowball*} and {@code help/regeneration.txt} files in Lucene
+ * source code for instructions on how code regeneration from Snowball sources works, what
+ * modifications are applied and what is required to regenerate snowball analyzers from scratch.
 *
 * <p><b>IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!</b>
 *
- * <p>An index created using the Snowball module in Lucene 2.3.2 and below might not be compatible
- * with the Snowball module in Lucene 2.4 or greater.
- *
- * <p>For more information about this issue see: https://issues.apache.org/jira/browse/LUCENE-1142
+ * <p>An index created using the Snowball module in one Lucene version may not be compatible with
+ * the Snowball module in another Lucene version. The token stream will vary depending on the
+ * changes in snowball stemmer definitions.
 */
 package org.apache.lucene.analysis.snowball;
--- a/lucene/analysis/stempel/src/java/overview.html
+++ b/lucene/analysis/stempel/src/java/overview.html
@@ -98,7 +98,7 @@ heuristic rules<br>
 </ul>
 There are many existing and well-known implementations of stemmers for
 English (Porter, Lovins, Krovetz) and other European languages
-(<a href="http://snowball.tartarus.org">Snowball</a>). There are also
+(<a href="https://snowballstem.org/">Snowball</a>). There are also
 good quality commercial lemmatizers for Polish. However, there is only
 one
 freely available Polish stemmer, implemented by
--- a/lucene/misc/README.txt
+++ b/lucene/misc/README.txt
@@ -1,3 +0,0 @@
-miscellaneous is a home of different Lucene-related classes
-that all belong to org.apache.lucene.misc package, as they are not
-substantial enough to warrant their own package.
--- a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
@@ -129,7 +129,7 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
   *  (x &lt;= min) &#63; base : sqrt(x+(base**2)-min)
   * </code> ...but with a special case check for 0.
   *
-   * <p>This degrates to <code>sqrt(x)</code> when min and base are both 0
+   * <p>This degrades to <code>sqrt(x)</code> when min and base are both 0
   *
   * @see #setBaselineTfFactors
   * @see <a href="doc-files/ss.baselineTf.svg">An SVG visualization of this function</a>
--- a/lucene/misc/src/java/org/apache/lucene/misc/package-info.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/package-info.java
@@ -15,5 +15,5 @@
 * limitations under the License.
 */

-/** Miscellaneous index tools. */
+/** Miscellaneous Lucene utilities that don't really fit anywhere else. */
 package org.apache.lucene.misc;
				`@ -1,2 +0,0 @@`

				`checksum.jflexClassicTokenizerImpl=8c4eac5fd02be551e666783df5531afda23cbc96`