LUCENE-2167: Implement StandardTokenizer with the UAX#29 Standard

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002032 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2010-09-28 06:16:16 +00:00
parent c562b10b2e
commit 3c26a9167c
65 changed files with 13107 additions and 749 deletions

View File

@ -17,18 +17,7 @@ package org.apache.lucene.benchmark.quality;
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.benchmark.quality.QualityBenchmark;
import org.apache.lucene.benchmark.quality.trec.TrecJudge;
import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader;
import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;
@ -36,6 +25,12 @@ import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
/**
* Test that quality run does its job.
* <p>
@ -177,6 +172,7 @@ public class TestQualityRun extends BenchmarkTestCase {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"analyzer=org.apache.lucene.analysis.standard.ClassicAnalyzer",
"docs.file=" + getWorkDirResourcePath("reuters.578.lines.txt.bz2"),
"content.source.log.step=2500",
"doc.term.vector=false",

View File

@ -9,6 +9,12 @@ API Changes
* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)
* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
as well as tokenizing URLs and email addresses according to the relevant
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
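A minimal sketch of the difference (hypothetical field name and sample text; exact token boundaries are determined by the two grammars):

  import java.io.StringReader;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.ClassicAnalyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.util.Version;

  public class Uax29Demo {
    public static void main(String[] args) throws Exception {
      String text = "see http://lucene.apache.org/java/docs";
      // New grammar: the URL is kept as a single <URL> token.
      dump(new StandardAnalyzer(Version.LUCENE_31).tokenStream("f", new StringReader(text)));
      // Old grammar: ClassicAnalyzer splits the URL into several tokens.
      dump(new ClassicAnalyzer(Version.LUCENE_31).tokenStream("f", new StringReader(text)));
    }

    static void dump(TokenStream ts) throws Exception {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term);
      }
      ts.end();
      ts.close();
    }
  }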
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.

View File

@ -52,3 +52,8 @@ See http://project.carrot2.org/license.html.
The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.

View File

@ -38,7 +38,7 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-wiki-tokenizer"/>
<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@ -49,27 +49,61 @@
nobak="on"/>
</target>
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
<target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex"
<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex"
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>
<target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>
<target name="clean-jflex">
<delete>
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
</delete>
</target>
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
<target name="gen-tlds" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="."
fork="true"
failonerror="true">
<classpath>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
<arg value="${tld.zones}"/>
<arg value="${tld.output}"/>
</java>
</target>
<target name="compile-tools">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
<classpath refid="classpath"/>
</compile>
</target>
</project>
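For reference, the gen-tlds target boils down to a call like the following (a sketch; it assumes GenerateJflexTLDMacros exposes the usual public static void main(String[]) entry point, taking the zone-file URL and the output path exactly as the <arg> elements above pass them):

  // Regenerate the ASCIITLD macro outside of ant; the two arguments
  // mirror the ${tld.zones} and ${tld.output} properties defined above.
  org.apache.lucene.analysis.standard.GenerateJflexTLDMacros.main(new String[] {
      "http://www.internic.net/zones/root.zone",
      "src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"
  });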

View File

@ -132,7 +132,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -218,7 +218,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StandardFilter(result);
result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(excltable != null && !excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);

View File

@ -247,7 +247,7 @@ public final class CzechAnalyzer extends ReusableAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stoptable);
if (matchVersion.onOrAfter(Version.LUCENE_31)) {

View File

@ -120,7 +120,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -237,7 +237,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);

View File

@ -135,7 +135,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new StandardFilter(result);
result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new GreekStemFilter(result);

View File

@ -104,6 +104,9 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
// prior to 3.1 we get the classic behavior: StandardFilter strips the possessive for us.
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new EnglishPossessiveFilter(result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -0,0 +1,52 @@
package org.apache.lucene.analysis.en;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* TokenFilter that removes possessives (trailing 's) from words.
*/
public final class EnglishPossessiveFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public EnglishPossessiveFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
if (bufferLength >= 2 &&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S'))
termAtt.setLength(bufferLength - 2); // Strip last 2 characters off
return true;
}
}
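A minimal usage sketch (sample text is illustrative; the UAX#29 grammar keeps "John's" together as one token, so the filter sees the trailing 's):

  import java.io.StringReader;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.util.Version;

  public class PossessiveDemo {
    public static void main(String[] args) throws Exception {
      TokenStream stream = new EnglishPossessiveFilter(
          new StandardTokenizer(Version.LUCENE_31, new StringReader("John's car")));
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term); // prints "John", then "car"
      }
      stream.end();
      stream.close();
    }
  }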

View File

@ -120,7 +120,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -120,7 +120,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -240,7 +240,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);

View File

@ -120,7 +120,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -119,7 +119,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) {

View File

@ -120,7 +120,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -246,7 +246,7 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
Reader aReader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())

View File

@ -120,7 +120,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -120,7 +120,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -124,7 +124,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@ -175,7 +175,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.snowball;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.apache.lucene.analysis.util.CharArraySet;
@ -80,7 +81,11 @@ public final class SnowballAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new StandardFilter(matchVersion, result);
// remove the possessive 's for english stemmers
if (matchVersion.onOrAfter(Version.LUCENE_31) &&
(name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
result = new EnglishPossessiveFilter(result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
result = new TurkishLowerCaseFilter(result);
@ -108,7 +113,7 @@ public final class SnowballAnalyzer extends Analyzer {
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StandardFilter(matchVersion, streams.source);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
streams.result = new TurkishLowerCaseFilter(streams.result);
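So with LUCENE_31 and an English-family stemmer the possessive is now stripped before stemming; a fragment-level sketch (sample text illustrative, the obvious imports assumed):

  SnowballAnalyzer analyzer = new SnowballAnalyzer(Version.LUCENE_31, "English");
  TokenStream ts = analyzer.tokenStream("f", new StringReader("the dog's bones"));
  // The 's is removed ahead of the stemmer, so "dog's" reduces cleanly
  // to "dog" instead of leaving an apostrophe artifact behind.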

View File

@ -0,0 +1,318 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Tuesday, September 14, 2010 11:34:20 AM UTC
// generated on Wednesday, September 15, 2010 7:00:44 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
[aA][cC]
| [aA][dD]
| [aA][eE]
| [aA][eE][rR][oO]
| [aA][fF]
| [aA][gG]
| [aA][iI]
| [aA][lL]
| [aA][mM]
| [aA][nN]
| [aA][oO]
| [aA][qQ]
| [aA][rR]
| [aA][rR][pP][aA]
| [aA][sS]
| [aA][sS][iI][aA]
| [aA][tT]
| [aA][uU]
| [aA][wW]
| [aA][xX]
| [aA][zZ]
| [bB][aA]
| [bB][bB]
| [bB][dD]
| [bB][eE]
| [bB][fF]
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
| [bB][nN]
| [bB][oO]
| [bB][rR]
| [bB][sS]
| [bB][tT]
| [bB][vV]
| [bB][wW]
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
| [cC][fF]
| [cC][gG]
| [cC][hH]
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
| [dD][oO]
| [dD][zZ]
| [eE][cC]
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][rR]
| [eE][sS]
| [eE][tT]
| [eE][uU]
| [fF][iI]
| [fF][jJ]
| [fF][kK]
| [fF][mM]
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][bB]
| [gG][dD]
| [gG][eE]
| [gG][fF]
| [gG][gG]
| [gG][hH]
| [gG][iI]
| [gG][lL]
| [gG][mM]
| [gG][nN]
| [gG][oO][vV]
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][rR]
| [hH][tT]
| [hH][uU]
| [iI][dD]
| [iI][eE]
| [iI][lL]
| [iI][mM]
| [iI][nN]
| [iI][nN][fF][oO]
| [iI][nN][tT]
| [iI][oO]
| [iI][qQ]
| [iI][rR]
| [iI][sS]
| [iI][tT]
| [jJ][eE]
| [jJ][mM]
| [jJ][oO]
| [jJ][oO][bB][sS]
| [jJ][pP]
| [kK][eE]
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][mM]
| [kK][nN]
| [kK][pP]
| [kK][rR]
| [kK][wW]
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][kK]
| [lL][rR]
| [lL][sS]
| [lL][tT]
| [lL][uU]
| [lL][vV]
| [lL][yY]
| [mM][aA]
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
| [mM][kK]
| [mM][lL]
| [mM][mM]
| [mM][nN]
| [mM][oO]
| [mM][oO][bB][iI]
| [mM][pP]
| [mM][qQ]
| [mM][rR]
| [mM][sS]
| [mM][tT]
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][vV]
| [mM][wW]
| [mM][xX]
| [mM][yY]
| [mM][zZ]
| [nN][aA]
| [nN][aA][mM][eE]
| [nN][cC]
| [nN][eE]
| [nN][eE][tT]
| [nN][fF]
| [nN][gG]
| [nN][iI]
| [nN][lL]
| [nN][oO]
| [nN][pP]
| [nN][rR]
| [nN][uU]
| [nN][zZ]
| [oO][mM]
| [oO][rR][gG]
| [pP][aA]
| [pP][eE]
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][kK]
| [pP][lL]
| [pP][mM]
| [pP][nN]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
| [pP][tT]
| [pP][wW]
| [pP][yY]
| [qQ][aA]
| [rR][eE]
| [rR][oO]
| [rR][sS]
| [rR][uU]
| [rR][wW]
| [sS][aA]
| [sS][bB]
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
| [sS][mM]
| [sS][nN]
| [sS][oO]
| [sS][rR]
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
| [tT][dD]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
| [tT][tT]
| [tT][vV]
| [tT][wW]
| [tT][zZ]
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
| [zZ][mM]
| [zZ][wW]
) "."? // Accept trailing root (empty) domain

View File

@ -0,0 +1,140 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
/**
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}, using a list of
* English stop words.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ClassicAnalyzer:
* <ul>
* <li> As of 3.1, StopFilter correctly handles Unicode 4.0
* supplementary characters in stopwords
* <li> As of 2.9, StopFilter preserves position
* increments
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*
* ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
* As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
* as specified by UAX#29.
*/
public final class ClassicAnalyzer extends StopwordAnalyzerBase {
/** Default maximum allowed token length */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
/**
* Specifies whether deprecated acronyms should be replaced with HOST type.
* See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
*/
private final boolean replaceInvalidAcronym;
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopWords stop words */
public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
super(matchVersion, stopWords);
replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
}
/** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS_SET}).
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
*/
public ClassicAnalyzer(Version matchVersion) {
this(matchVersion, STOP_WORDS_SET);
}
/** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(File)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/**
* Set maximum allowed token length. If a token is seen
* that exceeds this length then it is discarded. This
* setting only takes effect the next time tokenStream or
* reusableTokenStream is called.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
}
/**
* @see #setMaxTokenLength
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
src.setMaxTokenLength(maxTokenLength);
src.setReplaceInvalidAcronym(replaceInvalidAcronym);
TokenStream tok = new ClassicFilter(src);
tok = new LowerCaseFilter(matchVersion, tok);
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
protected boolean reset(final Reader reader) throws IOException {
src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
return super.reset(reader);
}
};
}
}
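A quick usage sketch (field name and text illustrative, the obvious imports assumed):

  ClassicAnalyzer analyzer = new ClassicAnalyzer(Version.LUCENE_31);
  TokenStream ts = analyzer.tokenStream("f", new StringReader("The Quick-Brown Fox!"));
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // "quick", "brown", "fox"
  }
  ts.end();
  ts.close();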

View File

@ -0,0 +1,73 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** Normalizes tokens extracted with {@link ClassicTokenizer}. */
public class ClassicFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
public ClassicFilter(TokenStream in) {
super(in);
}
private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
// this filter uses the type attribute
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
@Override
public final boolean incrementToken() throws java.io.IOException {
if (!input.incrementToken()) {
return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
final String type = typeAtt.type();
if (type == APOSTROPHE_TYPE && // remove 's
bufferLength >= 2 &&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
// Strip last 2 characters off
termAtt.setLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
int upto = 0;
for(int i=0;i<bufferLength;i++) {
char c = buffer[i];
if (c != '.')
buffer[upto++] = c;
}
termAtt.setLength(upto);
}
return true;
}
}
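Both normalizations in a fragment-level sketch (text illustrative, the obvious imports assumed):

  TokenStream ts = new ClassicFilter(
      new ClassicTokenizer(Version.LUCENE_31, new StringReader("I.B.M. Bob's")));
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // "IBM" (acronym dots removed), "Bob" ('s stripped)
  }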

View File

@ -0,0 +1,234 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/** A grammar-based tokenizer constructed with JFlex
*
* <p> This should be a good tokenizer for most European-language documents:
*
* <ul>
* <li>Splits words at punctuation characters, removing punctuation. However, a
* dot that's not followed by whitespace is considered part of a token.
* <li>Splits words at hyphens, unless there's a number in the token, in which case
* the whole token is interpreted as a product number and is not split.
* <li>Recognizes email addresses and internet hostnames as one token.
* </ul>
*
* <p>Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ClassicTokenizer:
* <ul>
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*
* ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
* As of 3.1, {@link StandardTokenizer} implements Unicode text segmentation,
* as specified by UAX#29.
*/
public final class ClassicTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private StandardTokenizerInterface scanner;
public static final int ALPHANUM = 0;
public static final int APOSTROPHE = 1;
public static final int ACRONYM = 2;
public static final int COMPANY = 3;
public static final int EMAIL = 4;
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
@Deprecated
public static final int ACRONYM_DEP = 8;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>"
};
private boolean replaceInvalidAcronym;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/** Set the max allowed token length. Any token longer
* than this is skipped. */
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
}
/** @see #setMaxTokenLength */
public int getMaxTokenLength() {
return maxTokenLength;
}
/**
* Creates a new instance of the {@link ClassicTokenizer}. Attaches
* the <code>input</code> to the newly created JFlex scanner.
*
* @param input The input reader
*
* See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public ClassicTokenizer(Version matchVersion, Reader input) {
super();
init(input, matchVersion);
}
/**
* Creates a new ClassicTokenizer with a given {@link AttributeSource}.
*/
public ClassicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
super(source);
init(input, matchVersion);
}
/**
* Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
*/
public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
super(factory);
init(input, matchVersion);
}
private final void init(Reader input, Version matchVersion) {
this.scanner = new ClassicTokenizerImpl(input);
if (matchVersion.onOrAfter(Version.LUCENE_24)) {
replaceInvalidAcronym = true;
} else {
replaceInvalidAcronym = false;
}
this.input = input;
}
// this tokenizer generates three attributes:
// term offset, positionIncrement and type
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
int posIncr = 1;
while(true) {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerInterface.YYEOF) {
return false;
}
if (scanner.yylength() <= maxTokenLength) {
posIncrAtt.setPositionIncrement(posIncr);
scanner.getText(termAtt);
final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == ClassicTokenizer.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]);
termAtt.setLength(termAtt.length() - 1); // remove extra '.'
} else {
typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM]);
}
} else {
typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]);
}
return true;
} else
// When we skip a too-long term, we still increment the
// position increment
posIncr++;
}
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
scanner.yyreset(reader);
}
/**
* Prior to https://issues.apache.org/jira/browse/LUCENE-1068, ClassicTokenizer mischaracterized tokens like www.abc.com as acronyms
* when they should have been labeled as hosts instead.
* @return true if ClassicTokenizer now returns these tokens as Hosts, otherwise false
*
* @deprecated Remove in 3.X and make true the only valid value
*/
@Deprecated
public boolean isReplaceInvalidAcronym() {
return replaceInvalidAcronym;
}
/**
*
* @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
* @deprecated Remove in 3.X and make true the only valid value
*
* See https://issues.apache.org/jira/browse/LUCENE-1068
*/
@Deprecated
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
}
}
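The too-long-token handling is easiest to see in a fragment-level sketch (length and text illustrative, the obvious imports assumed):

  ClassicTokenizer tok =
      new ClassicTokenizer(Version.LUCENE_31, new StringReader("a tokenization test"));
  tok.setMaxTokenLength(5);
  CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = tok.addAttribute(PositionIncrementAttribute.class);
  tok.reset();
  while (tok.incrementToken()) {
    System.out.println(term + " +" + posIncr.getPositionIncrement());
  }
  // "tokenization" (12 chars) is skipped, but "test" carries a position
  // increment of 2, so phrase positions still line up: a +1, test +2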

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:50 */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/15/10 3:01 AM */
package org.apache.lucene.analysis.standard;
@ -21,7 +21,7 @@ package org.apache.lucene.analysis.standard;
/*
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
*/
@ -33,10 +33,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 17.05.10 14:50 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/newtrunk/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex</tt>
* on 9/15/10 3:01 AM from the specification file
* <tt>c:/Users/us/IdeaProjects/lucene/test-dev-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class StandardTokenizerImplOrig implements StandardTokenizerInterface {
class ClassicTokenizerImpl implements StandardTokenizerInterface {
/** This character denotes the end of file */
public static final int YYEOF = -1;
@ -383,7 +383,7 @@ public final void getText(CharTermAttribute t) {
*
* @param in the java.io.Reader to read input from.
*/
StandardTokenizerImplOrig(java.io.Reader in) {
ClassicTokenizerImpl(java.io.Reader in) {
this.zzReader = in;
}
@ -393,7 +393,7 @@ public final void getText(CharTermAttribute t) {
*
* @param in the java.io.Inputstream to read input from.
*/
StandardTokenizerImplOrig(java.io.InputStream in) {
ClassicTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.analysis.standard;
/*
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
*/
@ -29,7 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%%
%class StandardTokenizerImplOrig
%class ClassicTokenizerImpl
%implements StandardTokenizerInterface
%unicode 3.0
%integer

View File

@ -39,10 +39,12 @@ import java.util.Set;
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 3.1, StopFilter correctly handles Unicode 4.0
* supplementary characters in stopwords
* <li> As of 2.9, StopFilter preserves position
* increments
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
* and StopFilter correctly handles Unicode 4.0 supplementary characters
* in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
* are the pre-3.1 implementations of StandardTokenizer and
* StandardAnalyzer.
* <li> As of 2.9, StopFilter preserves position increments
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
@ -122,7 +124,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
src.setMaxTokenLength(maxTokenLength);
src.setReplaceInvalidAcronym(replaceInvalidAcronym);
TokenStream tok = new StandardFilter(src);
TokenStream tok = new StandardFilter(matchVersion, src);
tok = new LowerCaseFilter(matchVersion, tok);
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {

View File

@ -17,33 +17,45 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
public final class StandardFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
/**
* Normalizes tokens extracted with {@link StandardTokenizer}.
*/
public class StandardFilter extends TokenFilter {
private final Version matchVersion;
public StandardFilter(TokenStream in) {
super(in);
this(Version.LUCENE_30, in);
}
private static final String APOSTROPHE_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM];
public StandardFilter(Version matchVersion, TokenStream in) {
super(in);
this.matchVersion = matchVersion;
}
private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
// this filter uses the type attribute
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
@Override
public final boolean incrementToken() throws java.io.IOException {
public final boolean incrementToken() throws IOException {
if (matchVersion.onOrAfter(Version.LUCENE_31))
return input.incrementToken(); // TODO: add some niceties for the new grammar
else
return incrementTokenClassic();
}
public final boolean incrementTokenClassic() throws IOException {
if (!input.incrementToken()) {
return false;
}
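In short, matchVersion now gates the behavior; a fragment-level sketch (text illustrative, the obvious imports assumed):

  // Pre-3.1 pairing: classic grammar plus classic normalizations,
  // yielding "Bob" and "IBM" from the text below.
  TokenStream classic = new StandardFilter(Version.LUCENE_30,
      new StandardTokenizer(Version.LUCENE_30, new StringReader("Bob's I.B.M.")));

  // 3.1+ pairing: the UAX#29 grammar runs, and for now the filter
  // passes its tokens through unchanged.
  TokenStream current = new StandardFilter(Version.LUCENE_31,
      new StandardTokenizer(Version.LUCENE_31, new StringReader("Bob's I.B.M.")));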

View File

@ -17,39 +17,42 @@
package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/** A grammar-based tokenizer constructed with JFlex
*
* <p> This should be a good tokenizer for most European-language documents:
*
* <ul>
* <li>Splits words at punctuation characters, removing punctuation. However, a
* dot that's not followed by whitespace is considered part of a token.
* <li>Splits words at hyphens, unless there's a number in the token, in which case
* the whole token is interpreted as a product number and is not split.
* <li>Recognizes email addresses and internet hostnames as one token.
* </ul>
*
import java.io.IOException;
import java.io.Reader;
/** A grammar-based tokenizer constructed with JFlex.
* <p>
* As of Lucene version 3.1, this class implements the Word Break rules from the
* Unicode Text Segmentation algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* <p/>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in contrib/icu
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
* <p>Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* compatibility when creating StandardTokenizer:
* <ul>
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
* If you use a previous version number, you get the exact behavior of
* {@link ClassicTokenizer} for backwards compatibility.
* </ul>
*/
@ -58,12 +61,22 @@ public final class StandardTokenizer extends Tokenizer {
private StandardTokenizerInterface scanner;
public static final int ALPHANUM = 0;
/** @deprecated */
@Deprecated
public static final int APOSTROPHE = 1;
/** @deprecated */
@Deprecated
public static final int ACRONYM = 2;
/** @deprecated */
@Deprecated
public static final int COMPANY = 3;
public static final int EMAIL = 4;
/** @deprecated */
@Deprecated
public static final int HOST = 5;
public static final int NUM = 6;
/** @deprecated */
@Deprecated
public static final int CJ = 7;
/**
@ -73,6 +86,11 @@ public final class StandardTokenizer extends Tokenizer {
@Deprecated
public static final int ACRONYM_DEP = 8;
public static final int URL = 9;
public static final int SOUTHEAST_ASIAN = 10;
public static final int IDEOGRAPHIC = 11;
public static final int HIRAGANA = 12;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
@ -83,7 +101,11 @@ public final class StandardTokenizer extends Tokenizer {
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>"
"<ACRONYM_DEP>",
"<URL>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>"
};
private boolean replaceInvalidAcronym;
@ -132,7 +154,7 @@ public final class StandardTokenizer extends Tokenizer {
private final void init(Reader input, Version matchVersion) {
this.scanner = matchVersion.onOrAfter(Version.LUCENE_31) ?
new StandardTokenizerImpl31(input) : new StandardTokenizerImplOrig(input);
new StandardTokenizerImpl(input) : new ClassicTokenizerImpl(input);
if (matchVersion.onOrAfter(Version.LUCENE_24)) {
replaceInvalidAcronym = true;
} else {

View File

@ -0,0 +1,260 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in contrib/icu
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
*/
%%
%unicode 5.2
%integer
%final
%public
%class StandardTokenizerImpl
%implements StandardTokenizerInterface
%function getNextToken
%char
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final int URL_TYPE = StandardTokenizer.URL;
/** E-mail addresses */
public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* {@see Unicode Line Breaking Algorithm http://www.unicode.org/reports/tr14/#SA}
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
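// Illustrative (assumed) example: "1,234.56" is kept as a single <NUM> token,
// since ',' (MidNum) and '.' (MidNumLet) do not break a run of digits.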
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
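// Illustrative (assumed) examples: "can't" (apostrophe is MidNumLet) and
// "R2D2" (letters and digits joined per WB9/WB10) are each one <ALPHANUM> token.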
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
\p{LB:Complex_Context}+ { return SOUTH_EAST_ASIAN_TYPE; }
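// Illustrative (assumed) example: a Thai run such as "ภาษาไทย" is emitted as
// one <SOUTHEAST_ASIAN> token rather than being broken between characters.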
// UAX#29 WB14. Any ÷ Any
//
\p{Script:Han} { return IDEOGRAPHIC_TYPE; }
\p{Script:Hiragana} { return HIRAGANA_TYPE; }
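// Illustrative (assumed) examples: each character of "中国" becomes its own
// <IDEOGRAPHIC> token, and each character of "かな" its own <HIRAGANA> token.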
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@@ -1,134 +0,0 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
*/
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%%
%class StandardTokenizerImpl31
%implements StandardTokenizerInterface
%unicode 4.0
%integer
%function getNextToken
%pack
%char
%{
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM = StandardTokenizer.ACRONYM;
public static final int COMPANY = StandardTokenizer.COMPANY;
public static final int EMAIL = StandardTokenizer.EMAIL;
public static final int HOST = StandardTokenizer.HOST;
public static final int NUM = StandardTokenizer.NUM;
public static final int CJ = StandardTokenizer.CJ;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
THAI = [\u0E00-\u0E59]
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {HAS_DIGIT} {P} {ALPHANUM}
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
ALPHA = ({LETTER})+
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
WHITESPACE = \r\n | [ \r\n\t\f]
%%
{ALPHANUM} { return ALPHANUM; }
{APOSTROPHE} { return APOSTROPHE; }
{ACRONYM} { return ACRONYM; }
{COMPANY} { return COMPANY; }
{EMAIL} { return EMAIL; }
{HOST} { return HOST; }
{NUM} { return NUM; }
{CJ} { return CJ; }
{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
. | {WHITESPACE} { /* ignore */ }

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:50 */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/15/10 3:01 AM */
package org.apache.lucene.analysis.standard;
@@ -19,33 +19,51 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
/*
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 17.05.10 14:50 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/newtrunk/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex</tt>
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in contrib/icu
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
*/
class StandardTokenizerImpl31 implements StandardTokenizerInterface {
public final class UAX29Tokenizer extends Tokenizer {
/** This character denotes the end of file */
public static final int YYEOF = -1;
private static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
public static final int YYINITIAL = 0;
private static final int YYINITIAL = 0;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -61,68 +79,113 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
"\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
"\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
"\5\0\27\12\1\0\37\12\1\0\u013f\12\31\0\162\12\4\0\14\12"+
"\16\0\5\12\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12"+
"\1\0\1\12\1\0\24\12\1\0\54\12\1\0\46\12\1\0\5\12"+
"\4\0\202\12\10\0\105\12\1\0\46\12\2\0\2\12\6\0\20\12"+
"\41\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"+
"\56\0\32\12\5\0\13\12\25\0\12\2\4\0\2\12\1\0\143\12"+
"\1\0\1\12\17\0\2\12\7\0\2\12\12\2\3\12\2\0\1\12"+
"\20\0\1\12\1\0\36\12\35\0\3\12\60\0\46\12\13\0\1\12"+
"\u0152\0\66\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2"+
"\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12"+
"\3\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12\4\0\12\2"+
"\2\12\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0"+
"\2\12\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0"+
"\12\2\2\0\3\12\20\0\11\12\1\0\3\12\1\0\26\12\1\0"+
"\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0"+
"\2\12\4\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0"+
"\7\12\1\0\2\12\1\0\5\12\3\0\1\12\36\0\2\12\1\0"+
"\3\12\4\0\12\2\1\0\1\12\21\0\1\12\1\0\6\12\3\0"+
"\3\12\1\0\4\12\3\0\2\12\1\0\1\12\1\0\2\12\3\0"+
"\2\12\3\0\3\12\3\0\10\12\1\0\3\12\55\0\11\2\25\0"+
"\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\46\0"+
"\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12\1\0"+
"\12\12\1\0\5\12\3\0\1\12\40\0\1\12\1\0\2\12\4\0"+
"\12\2\25\0\10\12\1\0\3\12\1\0\27\12\1\0\20\12\46\0"+
"\2\12\4\0\12\2\25\0\22\12\3\0\30\12\1\0\11\12\1\0"+
"\1\12\2\0\7\12\71\0\1\1\60\12\1\1\2\12\14\1\7\12"+
"\11\1\12\2\47\0\2\12\1\0\1\12\2\0\2\12\1\0\1\12"+
"\2\0\1\12\6\0\4\12\1\0\7\12\1\0\3\12\1\0\1\12"+
"\1\0\1\12\2\0\2\12\1\0\4\12\1\0\2\12\11\0\1\12"+
"\2\0\5\12\1\0\1\12\11\0\12\2\2\0\2\12\42\0\1\12"+
"\37\0\12\2\26\0\10\12\1\0\42\12\35\0\4\12\164\0\42\12"+
"\1\0\5\12\1\0\2\12\25\0\12\2\6\0\6\12\112\0\46\12"+
"\12\0\51\12\7\0\132\12\5\0\104\12\5\0\122\12\6\0\7\12"+
"\1\0\77\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\1\12"+
"\1\0\4\12\2\0\47\12\1\0\1\12\1\0\4\12\2\0\37\12"+
"\1\0\1\12\1\0\4\12\2\0\7\12\1\0\1\12\1\0\4\12"+
"\2\0\7\12\1\0\7\12\1\0\27\12\1\0\37\12\1\0\1\12"+
"\1\0\4\12\2\0\7\12\1\0\47\12\1\0\23\12\16\0\11\2"+
"\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0\32\12\5\0\113\12"+
"\25\0\15\12\1\0\4\12\16\0\22\12\16\0\22\12\16\0\15\12"+
"\1\0\3\12\17\0\64\12\43\0\1\12\4\0\1\12\3\0\12\2"+
"\46\0\12\2\6\0\130\12\10\0\51\12\127\0\35\12\51\0\12\2"+
"\36\12\2\0\5\12\u038b\0\154\12\224\0\234\12\4\0\132\12\6\0"+
"\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"+
"\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"+
"\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"+
"\6\12\4\0\15\12\5\0\3\12\1\0\7\12\164\0\1\12\15\0"+
"\1\12\202\0\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0"+
"\5\12\6\0\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0"+
"\3\12\1\0\7\12\3\0\3\12\5\0\5\12\u0ebb\0\2\12\52\0"+
"\5\12\5\0\2\12\3\0\1\13\126\13\6\13\3\13\1\13\132\13"+
"\1\13\4\13\5\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
"\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
"\u0773\0\u2ba4\12\u215c\0\u012e\13\2\13\73\13\225\13\7\12\14\0\5\12"+
"\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"+
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
"\2\0\66\12\50\0\14\12\164\0\5\12\1\0\207\12\23\0\12\2"+
"\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
"\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
"\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
"\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
"\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
"\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
"\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
"\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
"\1\0\7\2\234\1\13\0\46\1\2\0\1\1\7\0\47\1\1\0"+
"\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
"\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
"\2\0\13\2\6\0\52\1\24\2\1\0\12\3\1\0\1\3\1\6"+
"\1\0\2\1\1\2\143\1\1\0\1\1\17\2\2\1\2\2\1\0"+
"\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1\1\2"+
"\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1\11\2"+
"\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1\11\2"+
"\1\1\3\2\1\1\5\2\322\0\4\2\66\1\2\0\1\2\1\1"+
"\21\2\1\0\1\1\5\2\2\0\12\1\2\2\2\0\12\3\1\0"+
"\2\1\6\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0"+
"\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1"+
"\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1"+
"\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0\6\1"+
"\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1"+
"\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2"+
"\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2\3\1"+
"\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0"+
"\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0"+
"\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\3"+
"\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
"\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
"\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0"+
"\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1"+
"\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1"+
"\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2"+
"\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0\10\1"+
"\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1"+
"\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0"+
"\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1"+
"\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2"+
"\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1"+
"\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1\1\0"+
"\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2\1\0\4\2"+
"\11\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0\6\1\2\0"+
"\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
"\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0"+
"\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11\10\12\1\0"+
"\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0\1\11\2\0"+
"\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0\1\11\1\0"+
"\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12\1\0\2\12"+
"\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0\12\3\2\0"+
"\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0\1\2\1\0"+
"\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2"+
"\1\0\2\2\4\1\4\0\10\2\1\0\44\2\11\0\1\2\71\0"+
"\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12\1\11"+
"\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12\12\3"+
"\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1\1\0"+
"\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0"+
"\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0"+
"\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1\4\0"+
"\1\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0"+
"\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\2"+
"\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0\3\1"+
"\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11\1\12"+
"\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0\51\1"+
"\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0\14\2"+
"\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12\7\11"+
"\2\12\6\0\13\3\3\0\2\11\40\0\27\1\5\2\4\0\65\11"+
"\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3\6\0\16\11"+
"\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0\11\2\14\0"+
"\3\2\36\1\12\2\3\0\2\1\12\3\106\0\44\1\24\2\10\0"+
"\12\3\3\0\3\1\12\3\44\1\122\0\3\2\1\0\25\2\4\1"+
"\1\2\4\1\1\2\15\0\300\1\47\2\26\0\3\2\u0116\1\2\0"+
"\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1\1\0"+
"\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1\1\0"+
"\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1\4\0"+
"\15\1\5\0\3\1\1\0\7\1\17\0\4\2\10\0\2\7\12\0"+
"\1\7\2\0\1\5\2\0\5\2\20\0\2\10\3\0\1\6\17\0"+
"\1\10\13\0\5\2\5\0\6\2\1\0\1\1\15\0\1\1\20\0"+
"\5\1\73\0\41\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
"\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0"+
"\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1\6\0"+
"\4\1\3\2\16\0\46\1\12\0\66\1\11\0\1\1\20\0\27\1"+
"\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\57\0\1\1"+
"\120\0\32\13\1\0\131\13\14\0\326\13\57\0\1\1\1\0\1\13"+
"\31\0\11\13\6\2\1\0\5\4\2\0\3\13\1\1\1\1\4\0"+
"\126\14\2\0\2\2\2\4\3\14\133\4\1\0\4\4\5\0\51\1"+
"\3\0\136\1\21\0\30\1\70\0\20\4\320\0\57\4\1\0\130\4"+
"\250\0\u19b6\13\112\0\u51cc\13\64\0\u048d\1\103\0\56\1\2\0\u010d\1"+
"\3\0\20\1\12\3\2\1\24\0\40\1\2\0\15\1\4\2\11\0"+
"\2\2\1\0\31\1\10\0\120\1\2\2\45\0\11\1\2\0\147\1"+
"\2\0\2\1\156\0\7\1\1\2\3\1\1\2\4\1\1\2\27\1"+
"\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0"+
"\22\2\6\1\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1"+
"\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3"+
"\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3"+
"\6\0\33\11\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12"+
"\5\11\2\12\1\11\1\12\1\11\30\0\5\11\340\0\43\1\10\2"+
"\1\0\2\2\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1"+
"\u2104\0\u012e\13\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1"+
"\5\0\1\1\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1"+
"\1\0\2\1\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1"+
"\2\0\66\1\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6"+
"\13\0\7\2\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0"+
"\1\6\1\5\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7"+
"\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1"+
"\4\0\1\10\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1"+
"\2\0\6\1\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
/**
* Translates characters to character classes
@@ -135,13 +198,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
"\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
"\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
"\1\4";
"\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
"\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
private static int [] zzUnpackAction() {
int [] result = new int[51];
int [] result = new int[16];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@@ -166,16 +227,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
"\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
"\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
"\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
"\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
"\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
"\0\u0268\0\u0276\0\u0284";
"\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
"\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
private static int [] zzUnpackRowMap() {
int [] result = new int[51];
int [] result = new int[16];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@@ -198,49 +254,21 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
"\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
"\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
"\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
"\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
"\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
"\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
"\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
"\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
"\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
"\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
"\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
"\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
"\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
"\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
"\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
"\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
"\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
"\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
"\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
"\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
"\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
"\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
"\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
"\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
"\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
"\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
"\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
"\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
"\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
"\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
"\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
"\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
"\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
"\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
"\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
"\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
"\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
"\1\11\2\52\1\0\1\24\3\0";
"\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
"\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
"\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
"\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
"\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
"\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
"\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
"\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
"\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
"\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
"\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
"\2\0";
private static int [] zzUnpackTrans() {
int [] result = new int[658];
int [] result = new int[169];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@@ -278,11 +306,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
"\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
"\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
"\1\1\2\0";
private static int [] zzUnpackAttribute() {
int [] result = new int[51];
int [] result = new int[16];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@@ -350,35 +378,124 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
private boolean zzEOFDone;
/* user code: */
/** Alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>";
/** Numbers */
public static final String NUMERIC_TYPE = "<NUM>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* @see <a href="http://www.unicode.org/reports/tr14/#SA">Unicode Line Breaking Algorithm</a>
*/
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
private int posIncr;
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM = StandardTokenizer.ACRONYM;
public static final int COMPANY = StandardTokenizer.COMPANY;
public static final int EMAIL = StandardTokenizer.EMAIL;
public static final int HOST = StandardTokenizer.HOST;
public static final int NUM = StandardTokenizer.NUM;
public static final int CJ = StandardTokenizer.CJ;
/**
* @deprecated this solves a bug where HOSTs that end with '.' are identified
* as ACRONYMs.
*/
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
/**
* @param source The AttributeSource to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeSource source, Reader input) {
super(source, input);
zzReader = input;
}
/**
* @param factory The AttributeFactory to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
zzReader = input;
}
/**
* Set the max allowed token length. Any token longer than this is skipped.
* @param length the new max allowed token length
*/
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
}
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
/**
* Returns the max allowed token length. Any token longer than this is
* skipped.
* @return the max allowed token length
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
public final int yychar()
{
return yychar;
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(yychar + yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
yyreset(reader);
}
@Override
public final boolean incrementToken() throws IOException {
// This method is required because of two JFlex limitations:
// 1. No way to insert code at the beginning of the generated scanning
// get-next-token method; and
// 2. No way to declare @Override on the generated scanning method.
clearAttributes();
posIncr = 1;
return getNextToken();
}
/**
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
* the current match, sets the TypeAttribute to the passed-in tokenType, and
* sets the PositionIncrementAttribute to one, unless the immediately previous
* token(s) was/were skipped because maxTokenLength was exceeded, in which
* case the PositionIncrementAttribute is set to one plus the number of
* skipped overly long tokens.
* <p/>
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
* and false is returned.
*
* @param tokenType The type of the matching token
* @return true if there is a token available (not too long); false otherwise
*/
private boolean populateAttributes(String tokenType) {
boolean isTokenAvailable = false;
if (yylength() > maxTokenLength) {
// When we skip a too-long token, we treat it like a stopword, introducing
// a position increment gap
++posIncr;
} else {
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
posIncrAtt.setPositionIncrement(posIncr);
offsetAtt.setOffset(correctOffset(yychar),
correctOffset(yychar + yylength()));
typeAtt.setType(tokenType);
isTokenAvailable = true;
}
return isTokenAvailable;
}
/**
@@ -387,7 +504,8 @@ public final void getText(CharTermAttribute t) {
*
* @param in the java.io.Reader to read input from.
*/
StandardTokenizerImpl31(java.io.Reader in) {
public UAX29Tokenizer(java.io.Reader in) {
super(in);
this.zzReader = in;
}
@@ -397,7 +515,7 @@ public final void getText(CharTermAttribute t) {
*
* @param in the java.io.Inputstream to read input from.
*/
StandardTokenizerImpl31(java.io.InputStream in) {
public UAX29Tokenizer(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
@@ -411,7 +529,7 @@ public final void getText(CharTermAttribute t) {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 1234) {
while (i < 2138) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -477,7 +595,7 @@ public final void getText(CharTermAttribute t) {
/**
* Closes the input stream.
*/
public final void yyclose() throws java.io.IOException {
private final void yyclose() throws java.io.IOException {
zzAtEOF = true; /* indicate end of file */
zzEndRead = zzStartRead; /* invalidate buffer */
@@ -498,7 +616,7 @@ public final void getText(CharTermAttribute t) {
*
* @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
private final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
zzAtEOF = false;
@@ -515,7 +633,7 @@ public final void getText(CharTermAttribute t) {
/**
* Returns the current lexical state.
*/
public final int yystate() {
private final int yystate() {
return zzLexicalState;
}
@@ -525,7 +643,7 @@ public final void getText(CharTermAttribute t) {
*
* @param newState the new lexical state
*/
public final void yybegin(int newState) {
private final void yybegin(int newState) {
zzLexicalState = newState;
}
@@ -533,7 +651,7 @@ public final void getText(CharTermAttribute t) {
/**
* Returns the text matched by the current regular expression.
*/
public final String yytext() {
private final String yytext() {
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
}
@@ -549,7 +667,7 @@ public final void getText(CharTermAttribute t) {
*
* @return the character at position pos
*/
public final char yycharat(int pos) {
private final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
@@ -557,7 +675,7 @@ public final void getText(CharTermAttribute t) {
/**
* Returns the length of the matched text region.
*/
public final int yylength() {
private final int yylength() {
return zzMarkedPos-zzStartRead;
}
@@ -597,7 +715,7 @@ public final void getText(CharTermAttribute t) {
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
public void yypushback(int number) {
private void yypushback(int number) {
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
@@ -612,7 +730,7 @@ public final void getText(CharTermAttribute t) {
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
public int getNextToken() throws java.io.IOException {
private boolean getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
@@ -685,49 +803,35 @@
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 5:
{ return NUM;
{ if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
}
case 7: break;
case 1:
{ /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 8: break;
case 3:
{ if (populateAttributes(NUMERIC_TYPE)) return true;
}
case 9: break;
case 6:
{ if (populateAttributes(HIRAGANA_TYPE)) return true;
}
case 10: break;
case 4:
{ if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
}
case 11: break;
case 9:
{ return ACRONYM;
case 2:
{ if (populateAttributes(WORD_TYPE)) return true;
}
case 12: break;
case 7:
{ return COMPANY;
}
case 13: break;
case 10:
{ return EMAIL;
}
case 14: break;
case 1:
{ /* ignore */
}
case 15: break;
case 6:
{ return APOSTROPHE;
}
case 16: break;
case 3:
{ return CJ;
}
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 2:
{ return ALPHANUM;
}
case 19: break;
case 4:
{ return HOST;
}
case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
{
return false;
}
}
else {
zzScanError(ZZ_NO_MATCH);

View File

@@ -0,0 +1,273 @@
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in contrib/icu
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
*/
%%
%unicode 5.2
%final
%public
%apiprivate
%class UAX29Tokenizer
%extends Tokenizer
%type boolean
%function getNextToken
%char
%init{
super(in);
%init}
// WB4. X (Extend | Format)* --> X
//
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
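// Illustrative (assumed) example of WB4: in "cafe\u0301" ('e' followed by
// U+0301 COMBINING ACUTE ACCENT), the Extend character is absorbed into the
// preceding {ALetterEx}, so the accented word still matches as one unit.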
%{
/** Alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>";
/** Numbers */
public static final String NUMERIC_TYPE = "<NUM>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* @see <a href="http://www.unicode.org/reports/tr14/#SA">Unicode Line Breaking Algorithm</a>
*/
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
private int posIncr;
/**
* @param source The AttributeSource to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeSource source, Reader input) {
super(source, input);
zzReader = input;
}
/**
* @param factory The AttributeFactory to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
zzReader = input;
}
/**
* Set the max allowed token length. Any token longer than this is skipped.
* @param length the new max allowed token length
*/
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
}
/**
* Returns the max allowed token length. Any token longer than this is
* skipped.
* @return the max allowed token length
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(yychar + yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
yyreset(reader);
}
@Override
public final boolean incrementToken() throws IOException {
// This method is required because of two JFlex limitations:
// 1. No way to insert code at the beginning of the generated scanning
// get-next-token method; and
// 2. No way to declare @Override on the generated scanning method.
clearAttributes();
posIncr = 1;
return getNextToken();
}
/**
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
* the current match, sets the TypeAttribute to the passed-in tokenType, and
* sets the PositionIncrementAttribute to one, unless the immediately previous
* token(s) was/were skipped because maxTokenLength was exceeded, in which
* case the PositionIncrementAttribute is set to one plus the number of
* skipped overly long tokens.
* <p/>
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
* and false is returned.
*
* @param tokenType The type of the matching token
* @return true if there is a token available (not too long); false otherwise
*/
private boolean populateAttributes(String tokenType) {
boolean isTokenAvailable = false;
if (yylength() > maxTokenLength) {
// When we skip a too-long token, we treat it like a stopword, introducing
// a position increment gap
++posIncr;
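// Illustrative (assumed) example: with maxTokenLength = 5, the input
// "abc verylongtoken def" yields "abc" (posIncr 1) and "def" (posIncr 2).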
} else {
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
posIncrAtt.setPositionIncrement(posIncr);
offsetAtt.setOffset(correctOffset(yychar),
correctOffset(yychar + yylength()));
typeAtt.setType(tokenType);
isTokenAvailable = true;
}
return isTokenAvailable;
}
%}
%%
// WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return false; }
// WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
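// Illustrative (assumed) example: "3,000.5" remains one <NUM> token;
// MidNum ',' and MidNumLet '.' join digit runs per WB11/WB12.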
// WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ if (populateAttributes(WORD_TYPE)) return true; }
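// Illustrative (assumed) examples: "O'Neil" stays one <ALPHANUM> token
// (WB6/WB7), as does the mixed letter/digit run "MiG29" (WB9/WB10).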
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
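// Illustrative (assumed) example: a run of Lao or Khmer characters is kept
// together as a single <SOUTHEAST_ASIAN> token.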
// WB14. Any ÷ Any
//
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
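// Illustrative (assumed) examples: "漢字" yields two <IDEOGRAPHIC> tokens,
// and "ひらがな" yields four <HIRAGANA> tokens, one per character.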
// WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@@ -17,9 +17,43 @@
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
A fast grammar-based tokenizer constructed with JFlex.
<p>The <code>org.apache.lucene.analysis.standard</code> package contains three
fast grammar-based tokenizers constructed with JFlex:</p>
<ul>
<li><code><a href="StandardTokenizer.html">StandardTokenizer</a></code>:
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
Segmentation algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
URLs and email addresses are also tokenized according to the relevant RFCs.
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
<code>StandardTokenizer</code>,
<code><a href="StandardFilter">StandardFilter</a></code>,
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
When the <code>Version</code> specified in the constructor is lower than
3.1, the <code><a href="ClassicTokenizer.html">ClassicTokenizer</a></code>
implementation is invoked.</li>
<li><code><a href="ClassicTokenizer.html">ClassicTokenizer</a></code>:
this class was formerly (prior to Lucene 3.1) named
<code>StandardTokenizer</code>. (Its tokenization rules are not
based on the Unicode Text Segmentation algorithm.)
<code><a href="ClassicAnalyzer">ClassicAnalyzer</a></code> includes
<code>ClassicTokenizer</code>,
<code><a href="StandardFilter">StandardFilter</a></code>,
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
</li>
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:
implements the Word Break rules from the Unicode Text Segmentation
algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
Unlike <code>StandardTokenizer</code>, URLs and email addresses are
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules, as shown in the sketch
below this list.
</li>
</ul>
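<p>As a minimal sketch of the difference (assuming the Lucene 3.1-era
constructors; the e-mail address is an arbitrary example input):</p>
<pre>
  // StandardTokenizer keeps "jim.bob@example.com" as one &lt;EMAIL&gt; token:
  Tokenizer std = new StandardTokenizer(Version.LUCENE_31,
                                        new StringReader("jim.bob@example.com"));
  // UAX29Tokenizer instead emits the two &lt;ALPHANUM&gt; tokens "jim.bob" and
  // "example.com", per the UAX#29 word break rules ('.' is MidNumLet, '@' breaks):
  Tokenizer uax29 = new UAX29Tokenizer(new StringReader("jim.bob@example.com"));
</pre>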
</body>
</html>

View File

@@ -120,7 +120,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@@ -58,7 +58,7 @@ public final class ThaiAnalyzer extends ReusableAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new LowerCaseFilter(matchVersion, result);
result = new ThaiWordFilter(matchVersion, result);

View File

@@ -123,7 +123,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
TokenStream result = new StandardFilter(matchVersion, source);
result = new TurkishLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@@ -0,0 +1,267 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<meta name="robots" content="index,nofollow">
<title>Resources - Lucene-java Wiki</title>
<script type="text/javascript" src="/moin_static184/common/js/common.js"></script>
<script type="text/javascript">
<!--
var search_hint = "Search";
//-->
</script>
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin_static184/modernized/css/common.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="screen" href="/moin_static184/modernized/css/screen.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="print" href="/moin_static184/modernized/css/print.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="projection" href="/moin_static184/modernized/css/projection.css">
<!-- css only for MS IE6/IE7 browsers -->
<!--[if lt IE 8]>
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin_static184/modernized/css/msie.css">
<![endif]-->
<link rel="Start" href="/lucene-java/FrontPageEN">
<link rel="Alternate" title="Wiki Markup" href="/lucene-java/Resources?action=raw">
<link rel="Alternate" media="print" title="Print View" href="/lucene-java/Resources?action=print">
<link rel="Appendix" title="IntroductionToApacheLucene.jp.jpg" href="/lucene-java/Resources?action=AttachFile&amp;do=view&amp;target=IntroductionToApacheLucene.jp.jpg">
<link rel="Appendix" title="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" href="/lucene-java/Resources?action=AttachFile&amp;do=view&amp;target=SuchmaschinenEntwickelnMitApacheLucene.de.jpg">
<link rel="Appendix" title="building.search.applications.png" href="/lucene-java/Resources?action=AttachFile&amp;do=view&amp;target=building.search.applications.png">
<link rel="Appendix" title="lia3d.jpg" href="/lucene-java/Resources?action=AttachFile&amp;do=view&amp;target=lia3d.jpg">
<link rel="Search" href="/lucene-java/FindPage">
<link rel="Index" href="/lucene-java/TitleIndex">
<link rel="Glossary" href="/lucene-java/WordIndex">
<link rel="Help" href="/lucene-java/HelpOnFormatting">
</head>
<body lang="en" dir="ltr">
<div id="header">
<form id="searchform" method="get" action="/lucene-java/Resources">
<div>
<input type="hidden" name="action" value="fullsearch">
<input type="hidden" name="context" value="180">
<label for="searchinput">Search:</label>
<input id="searchinput" type="text" name="value" value="" size="20"
onfocus="searchFocus(this)" onblur="searchBlur(this)"
onkeyup="searchChange(this)" onchange="searchChange(this)" alt="Search">
<input id="titlesearch" name="titlesearch" type="submit"
value="Titles" alt="Search Titles">
<input id="fullsearch" name="fullsearch" type="submit"
value="Text" alt="Search Full Text">
</div>
</form>
<script type="text/javascript">
<!--// Initialize search form
var f = document.getElementById('searchform');
f.getElementsByTagName('label')[0].style.display = 'none';
var e = document.getElementById('searchinput');
searchChange(e);
searchBlur(e);
//-->
</script>
<div id="logo"><a href="/lucene-java/FrontPageEN">Lucene-java Wiki</a></div>
<div id="username"><a href="/lucene-java/Resources?action=login" id="login" rel="nofollow">Login</a></div>
<h1 id="locationline">
<span id="pagelocation"><a class="backlink" href="/lucene-java/Resources?action=fullsearch&amp;context=180&amp;value=linkto%3A%22Resources%22" rel="nofollow" title="Click to do a full-text search for this title">Resources</a></span>
</h1>
<ul id="navibar">
<li class="wikilink"><a href="/lucene-java/FrontPageEN">FrontPageEN</a></li><li class="wikilink"><a href="/lucene-java/RecentChanges">RecentChanges</a></li><li class="wikilink"><a href="/lucene-java/FindPage">FindPage</a></li><li class="wikilink"><a href="/lucene-java/HelpContents">HelpContents</a></li><li class="current"><a href="/lucene-java/Resources">Resources</a></li>
</ul>
<div id="pageline"><hr style="display:none;"></div>
<ul class="editbar"><li><span class="disabled">Immutable Page</span></li><li class="toggleCommentsButton" style="display:none;"><a href="#" class="nbcomment" onClick="toggleComments();return false;">Comments</a></li><li><a class="nbinfo" href="/lucene-java/Resources?action=info" rel="nofollow">Info</a></li><li>
<form class="actionsmenu" method="GET" action="/lucene-java/Resources">
<div>
<label>More Actions:</label>
<select name="action"
onchange="if ((this.selectedIndex != 0) &&
(this.options[this.selectedIndex].disabled == false)) {
this.form.submit();
}
this.selectedIndex = 0;">
<option value="raw">Raw Text</option>
<option value="print">Print View</option>
<option value="RenderAsDocbook">Render as Docbook</option>
<option value="refresh">Delete Cache</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="SpellCheck">Check Spelling</option>
<option value="LikePages">Like Pages</option>
<option value="LocalSiteMap">Local Site Map</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="RenamePage" disabled class="disabled">Rename Page</option>
<option value="CopyPage">Copy Page</option>
<option value="DeletePage" disabled class="disabled">Delete Page</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="MyPages">My Pages</option>
<option value="show" disabled class="disabled">Subscribe User</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="show" disabled class="disabled">Remove Spam</option>
<option value="show" disabled class="disabled">Revert to this revision</option>
<option value="show" disabled class="disabled">Package Pages</option>
<option value="SyncPages">Sync Pages</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="Load">Load</option>
<option value="Save">Save</option>
</select>
<input type="submit" value="Do">
</div>
<script type="text/javascript">
<!--// Init menu
actionsMenuInit('More Actions:');
//-->
</script>
</form>
</li></ul>
</div>
<div id="page" lang="en" dir="ltr">
<div dir="ltr" id="content" lang="en"><span class="anchor" id="top"></span>
<span class="anchor" id="line-2"></span><p class="line867"><div class="table-of-contents"><p class="table-of-contents-heading">Contents<ol><li>
<a href="#Introductions">Introductions</a></li><li>
<a href="#Blogs">Blogs</a></li><li>
<a href="#Books">Books</a></li><li>
<a href="#Articles">Articles</a></li><li>
<a href="#Interviews">Interviews</a></li><li>
<a href="#Papers">Papers</a></li><li>
<a href="#Presentations">Presentations</a></li><li>
<a href="#Training">Training</a></li><li>
<a href="#Corpora">Corpora</a></li><li>
<a href="#Other">Other</a></li></ol></div> <span class="anchor" id="line-3"></span><span class="anchor" id="line-4"></span><p class="line867">
<h1 id="Introductions">Introductions</h1>
<span class="anchor" id="line-5"></span><span class="anchor" id="line-6"></span><ul><li><p class="line862">The API documentation contains <a class="http" href="http://lucene.apache.org/java/3_0_1/api/all/overview-summary.html#overview_description">a short and simple code example</a> that shows the basic way to index and search <span class="anchor" id="line-7"></span></li><li><p class="line862">The <a class="http" href="http://lucene.apache.org/java/3_0_1/gettingstarted.html">Getting Started Guide</a> that describes the demos that come with Lucene <span class="anchor" id="line-8"></span><span class="anchor" id="line-9"></span><span class="anchor" id="line-10"></span></li></ul><p class="line867">
<h1 id="Blogs">Blogs</h1>
<span class="anchor" id="line-11"></span><span class="anchor" id="line-12"></span><ul><li><p class="line891"><a class="http" href="http://lucene.grantingersoll.com">Grant's Grunts: Lucene edition</a> - Grant Ingersoll's thoughts on the Lucene ecosystem. <span class="anchor" id="line-13"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/blog/">Lucid Imagination's Blog</a> - Many of the Lucene and Solr committers blog here about how to use Lucene and Solr <span class="anchor" id="line-14"></span></li><li><p class="line891"><a class="http" href="http://blog.sematext.com/">Sematext Blog</a> - Search and Analytics covering Lucene, Solr, Nutch, Hadoop, HBase, and more <span class="anchor" id="line-15"></span><span class="anchor" id="line-16"></span><span class="anchor" id="line-17"></span></li></ul><p class="line867">
<h1 id="Books">Books</h1>
<span class="anchor" id="line-18"></span><span class="anchor" id="line-19"></span><ul><li><p class="line891"><img alt="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" class="external_image" src="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" title="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" /> "<a class="http" href="http://www.manning.com/hatcher3/">Lucene in Action, Second Edition"</a> by Erik Hatcher, Otis Gospodneti&#263;, and Michael McCandless <span class="anchor" id="line-20"></span></li><li><p class="line891"><img alt="building.search.applications.png" class="attachment" src="/lucene-java/Resources?action=AttachFile&amp;do=get&amp;target=building.search.applications.png" title="building.search.applications.png" /> "<a class="http" href="http://www.amazon.com/Building-Search-Applications-Lucene-Lingpipe/dp/0615204252/">Building Search Applications: Lucene, LingPipe, and Gate</a>" by Manu Konchady; Mustru Publishing; June 2008; ISBN 978-0615204253 <span class="anchor" id="line-21"></span></li><li><p class="line891"><img alt="IntroductionToApacheLucene.jp.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&amp;do=get&amp;target=IntroductionToApacheLucene.jp.jpg" title="IntroductionToApacheLucene.jp.jpg" /> "<a class="http" href="http://www.amazon.co.jp/exec/obidos/ASIN/4774127809/503-9461699-1775907">Apache Lucene 入門 ~Java・オープンソース・全文検索システムの構築</a>" 関口 宏司 ; 技術評論社 ; 2006/05/17 ; ISBN: 4774127809 (<span class="u">Introduction to Apache Lucene: Construction of Java Open Source Full Text Retrieval Systems</span> by Koshi Sekiguti ; Gijutsu-Hyohron Co., Ltd.) <span class="anchor" id="line-22"></span></li><li><p class="line891"><img alt="lia3d.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&amp;do=get&amp;target=lia3d.jpg" title="lia3d.jpg" /> "<a class="http" href="http://www.lucenebook.com">Lucene In Action</a>" by Erik Hatcher, Otis Gospodneti&#263;; Manning Publications; December 2004; ISBN 1932394281 (also available from <a class="http" href="http://www.amazon.com/exec/obidos/ASIN/1932394281">Amazon.com</a>) <span class="anchor" id="line-23"></span></li><li><p class="line891"><img alt="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&amp;do=get&amp;target=SuchmaschinenEntwickelnMitApacheLucene.de.jpg" title="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" /> Manfred Hardt, Dr. Fabian Theis: "<a class="http" href="http://www.amazon.de/Suchmaschinen-entwickeln-mit-Apache-Lucene/dp/3935042450">Suchmaschinen entwickeln mit Apache Lucene</a>"; Software &amp; Support Verlag, Frankfurt/Main, Germany; September 2004; ISBN 3935042450 (<span class="u">Developing Search Engines with Apache Lucene</span>) <span class="anchor" id="line-24"></span><span class="anchor" id="line-25"></span></li></ul><p class="line867">
<h1 id="Articles">Articles</h1>
<span class="anchor" id="line-26"></span><span class="anchor" id="line-27"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Getting-Started-with-Lucene/">Getting Started with Lucene</a> (by Grant Ingersoll) <br>
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-28"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Optimizing-Findability-in-Lucene-and-Solr/">Optimizing Findability in Lucene and Solr</a> (by Grant Ingersoll)<br>
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-29"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Debugging-Relevance-Issues-in-Search/">Debugging Relevance Issues in Search</a> (by Grant Ingersoll)<br>
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-30"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Scaling-Lucene-and-Solr/">Scaling Lucene and Solr</a> (by Mark Miller)<br>
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-31"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Introduction-to-Apache-Lucene-and-Solr/">Introduction to Apache Lucene and Solr</a> (by Marc Krellenstein)<br>
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-32"></span></li><li><p class="line891"><a class="http" href="http://cephas.net/blog/2008/03/30/how-morelikethis-works-in-lucene/">How MoreLikeThis Works in Lucene</a> (by Aaron Johnson)<br>
(<em>Last updated: March 2008 - blog entry</em>) <span class="anchor" id="line-33"></span></li><li><p class="line891"><a class="http" href="http://schmidt.devlib.org/software/lucene-wikipedia.html">Lucene Wikipedia indexer</a> (by Marco Schmidt)<br>
(<em>Last updated: November 2007 - tutorial</em>) <span class="anchor" id="line-34"></span></li><li><p class="line891"><a class="http" href="http://marceloochoa.blogspot.com/2007/09/running-lucene-inside-your-oracle-jvm.html">Running Lucene inside your Oracle JVM</a> (by Marcelo Ochoa)<br>
(<em>Last updated: September 2007 - blog entry</em>) <span class="anchor" id="line-35"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2007/05/24/using-the-lucene-query-parser-without-lucene.html">Using the Lucene Query Parser Without Lucene</a> (by Marcin Maciukiewicz and Daniel Owsiański)<br>
(<em>Published: May 2007 - article</em>) <span class="anchor" id="line-36"></span></li><li><p class="line891"><a class="http" href="http://www.javaworld.com/javaworld/jw-09-2006/jw-0925-lucene.html">Integrate advanced search functionalities into your apps</a> (by John Ferguson Smart)<br>
(<em>Published: September 2006 - article</em>) <span class="anchor" id="line-37"></span></li><li><p class="line891"><a class="http" href="http://www-128.ibm.com/developerworks/java/library/wa-lucene2/index.html?ca=drs-">Beef up Web search applications with Lucene</a> (by Deng Peng Zhou)<br>
(<em>Published: August 2006 - article</em>) <span class="anchor" id="line-38"></span></li><li><p class="line891"><a class="http" href="http://www.freesearch.pe.kr/tag/Lucene">Lecture &amp; Etc : Lucene index file format for Korean</a> (by Jeon Hee-Won)<br>
(<em>Published: July 2006 - article</em>) <span class="anchor" id="line-39"></span></li><li>Cai Ziegler: "Suche nach Suche -- Apaches Lucene: eigene Suche und Indizierung"; iX 6/2006, Seite 120; Heise Zeitschriften Verlag, Hannover, Germany <span class="anchor" id="line-40"></span></li><li><p class="line891"><a class="http" href="http://www-128.ibm.com/developerworks/java/library/wa-lucene/index.html">Delve inside the Lucene indexing mechanism</a> (by Deng Peng Zhou)<br>
(<em>Published: June 2006 - article</em>) <span class="anchor" id="line-41"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html">Using Lucene to Search Java Source Code</a> (by Renuka Sindhgatta)<br>
(<em>Published: January 2006 - article</em>) <span class="anchor" id="line-42"></span></li><li><p class="line891"><a class="http" href="http://www.jroller.com/page/wakaleo/?anchor=lucene_a_tutorial_introduction_to">Lucene : a tutorial introduction to full-text indexing in Java</a> (by John Ferguson Smart)<br>
(<em>Published: October 2005 - article</em>) <span class="anchor" id="line-43"></span></li><li>Daniel Naber: "Herr der Suche -- Eigene Anwendungen mit Volltextsuche erweitern"; c't 7/2005, Seite 196; Heise Zeitschriften Verlag, Hannover, Germany <span class="anchor" id="line-44"></span></li><li><p class="line891"><a class="http" href="http://blog.dev.sf.net/index.php?/archives/10-Behind-the-Scenes-of-the-SourceForge.net-Search-System.html">Behind the Scenes of the SourceForge.net Search System</a> (by Chris Conrad)<br>
(<em>Last updated: June 2005 - blog entry</em>) <span class="anchor" id="line-45"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2005/08/09/didyoumean.html">Did You Mean: Lucene?</a> (by Tom White)<br>
(<em>Published: August 2005 - article</em>) <span class="anchor" id="line-46"></span></li><li><p class="line891"><a class="http" href="http://www.developer.com/java/other/article.php/3490471">Meet Lucene</a> (by Otis Gospodneti&#263;, Eric Hatcher)<br>
(<em>Published: March 2005 - article</em>) <span class="anchor" id="line-47"></span></li><li><p class="line891"><a class="http" href="http://www.theserverside.com/tt/articles/article.tss?l=ILoveLucene">I Love Lucene</a> (by Dion Almaer)<br>
(<em>Published: January 2005 - article</em>) <span class="anchor" id="line-48"></span></li><li><p class="line891"><a class="http" href="http://javaboutique.internet.com/tutorials/HTMLParser/article.html">Unweaving a Tangled Web With HTMLParser and Lucene</a> (by Keld H. Hansen)<br>
(<em>Last updated: October 2004 - tutorial</em>) <span class="anchor" id="line-49"></span></li><li><p class="line891"><a class="http" href="http://bilgidata.com/localhost/bilgidata/yazi.jsp@dosya=a_lucene.xml.html">Lucene Introduction in Turkish</a> Java Bazl&#305; Arama Motoru - Lusin (by Burak Bayraml&#305;)<br>
(<em>Last updated: August 2004 - tutorial</em>) <span class="anchor" id="line-50"></span></li><li><p class="line891"><a class="http" href="http://www.chedong.com/tech/lucene.html">Lucene Introduction in Chinese</a> Lucene&#65306;&#22522;&#20110;Java&#30340;&#20840;&#25991;&#26816;&#32034;&#24341;&#25806;&#31616;&#20171; (by Che Dong; &#20316;&#32773;&#65306; &#36710;&#19996;)<br>
(<em>Last updated: May 2004 - tutorial</em>) <span class="anchor" id="line-51"></span></li><li><p class="line891"><a class="http" href="http://javatechniques.com/public/java/docs/basics/lucene-memory-search.html">Lucene In-Memory Text Search</a> (by Philip Isenhour)<br>
(<em>Last updated: May 2004 - tutorial</em>) <span class="anchor" id="line-52"></span></li><li><p class="line891"><a class="http" href="http://www.javaranch.com/newsletter/200404/Lucene.html">The Lucene Search Engine: Adding Search to Your Applications</a> (by Thomas Paul)<br>
(<em>Published: April 2004 - article</em>) <span class="anchor" id="line-53"></span></li><li><p class="line891"><a class="http" href="http://www.darksleep.com/lucene/">Lucene Tutorial</a> (by Steven J. Owens)<br>
(<em>Last updated: March 2004 - tutorial</em>) <span class="anchor" id="line-54"></span></li><li><p class="line891"><a class="http" href="http://www-igm.univ-mlv.fr/~dr/XPOSE2003/lucene/articleLucene.html">Lucene Introduction in French</a> Exposés Système sur le thème de l'opensource : Analyse de la structure de Lucene. (by Sun Seng TAN)<br>
(<em>Last updated: February 2004 - tutorial</em>) <span class="anchor" id="line-55"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">QueryParser Rules</a> (by Erik Hatcher)<br>
(<em>Published November 2003 - article</em>) <span class="anchor" id="line-56"></span></li><li><p class="line891"><a class="http" href="http://builder.com.com/5100-6389-5054799.html">Give your Web site its own search engine using Lucene</a> (by Jeffrey Linwood)<br>
(<em>Published July 2003 - article</em>) <span class="anchor" id="line-57"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Intro</a> (by Erik Hatcher)<br>
(<em>Published: July 2003 - article</em>) <span class="anchor" id="line-58"></span></li><li><p class="line891"><a class="http" href="http://www-106.ibm.com/developerworks/library/j-lucene/">Parsing, indexing, and searching XML with Digester and Lucene</a> (by Otis Gospodneti&#263;)<br>
(<em>Published June 2003 - article</em>) <span class="anchor" id="line-59"></span></li><li><p class="line891"><a class="http" href="http://www.xml.com/pub/a/ws/2003/05/13/email.html">Using Python, Jython, and Lucene to Search Outlook Email</a> (by Jon Udell)<br>
(<em>Published: May 2003 - article</em>) <span class="anchor" id="line-60"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2003/03/05/lucene.html">Advanced Text Indexing with Lucene</a> (by Otis Gospodneti&#263;)<br>
(<em>Published: March 2003 - article</em>) <span class="anchor" id="line-61"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html">Introduction to Text Indexing with Apache Jakarta Lucene</a> (by Otis Gospodneti&#263;)<br>
(<em>Published: January 2003 - article</em>) <span class="anchor" id="line-62"></span></li><li><p class="line862">Manfred Hardt: "Suchmaschinen entwickeln mit Java und Lucene - Wo war denn noch gleich ... ?"; JavaMagazin 9/2002; Software &amp; Support Verlag, Frankfurt/Main, Germany <span class="anchor" id="line-63"></span></li><li><p class="line891"><a class="http" href="http://javangelist.snipsnap.org/space/Lucene-Mini-Tutorial">Lucene Mini-Tutorial</a> (by funzel)<br>
(<em>Last updated: April 2002 - tutorial</em>) <span class="anchor" id="line-64"></span></li><li><p class="line891"><a class="http" href="http://www.javaworld.com/javaworld/jw-09-2000/jw-0915-lucene.html">The Lucene search engine Powerful flexible and free</a> (by Brian Goetz)<br>
(<em>Published September 2000 - article</em>) <span class="anchor" id="line-65"></span><span class="anchor" id="line-66"></span></li></ul><p class="line867">
<h1 id="Interviews">Interviews</h1>
<span class="anchor" id="line-67"></span><span class="anchor" id="line-68"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=109">Interview with Lucene creator Doug Cutting</a> Podcast. Summary: Doug talks about the creation of Lucene, Nutch and Hadoop. (<em>Published January 2009</em>) <span class="anchor" id="line-69"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=108">Interview with Lucene/Solr committer Chris Hostetter</a> Podcast. Summary: Chris talks about Solr, Lucene and their usage at CNET. (<em>Published January 2009</em>) <span class="anchor" id="line-70"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=113">Interview with Lucene/Solr committer Ryan McKinley</a> Podcast. Summary: Ryan discusses Solr, Lucene and geospatial searching with Lucene (<a class="nonexistent" href="/lucene-java/LocalLucene/LocalSolr">LocalLucene/LocalSolr</a>) and his usage of Lucene/Solr throughout his career. (<em>Published January 2009</em>) <span class="anchor" id="line-71"></span><span class="anchor" id="line-72"></span><span class="anchor" id="line-73"></span><span class="anchor" id="line-74"></span></li></ul><p class="line867">
<h1 id="Papers">Papers</h1>
<span class="anchor" id="line-75"></span><span class="anchor" id="line-76"></span><ul><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/publications.html">http://lucene.sourceforge.net/publications.html</a> Doug Cuttings papers from the old Lucene web site <span class="anchor" id="line-77"></span><span class="anchor" id="line-78"></span></li></ul><p class="line867">
<h1 id="Presentations">Presentations</h1>
<span class="anchor" id="line-79"></span><ul><li><p class="line891"><a class="http" href="http://people.apache.org/~buschmi/apachecon/AdvancedIndexingLuceneAtlanta07.ppt">Advanced Indexing Techniques with Apache Lucene - Payloads</a> presented by Michael Busch at <a class="http" href="http://www.us.apachecon.com/us2007/">ApacheCon U.S. 2007</a><br>
(<em>Presented November 2007 - PDF slide show</em>) <span class="anchor" id="line-80"></span></li><li><p class="line891"><a class="http" href="http://people.apache.org/~yonik/presentations/lucene_intro.pdf">Full-Text Search with Lucene</a> presented by Yonik Seeley at <a class="http" href="http://www.eu.apachecon.com">ApacheCon Europe 2007</a>.<br>
(<em>Presented May 2007 - PDF slide show</em>) <span class="anchor" id="line-81"></span></li><li><p class="line891"><a class="http" href="http://www.cnlp.org/presentations/slides/AdvancedLuceneEU.pdf">Advanced Lucene</a> presented by Grant Ingersoll of <a class="http" href="http://www.cnlp.org">CNLP</a> at <a class="http" href="http://www.eu.apachecon.com">ApacheCon Europe 2007</a>. Covers term vectors, query tips and tricks and Lucene performance tuning related to indexing, searching and document retrieval.<br>
(<em>Presented May 2007 - PDF slide show</em>) <span class="anchor" id="line-82"></span></li><li><p class="line891"><a class="http" href="http://blogs.atlassian.com/rebelutionary/downloads/tssjs2007-lucene-generic-data-indexing.pdf">Lucene: Generic Data Indexing</a> presented by Mike Cannon-Brookes, CEO, <a class="http" href="http://www.atlassian.com/">Atlassian Software Systems</a> at <a class="http" href="http://javasymposium.techtarget.com/lasvegas/index.html">TSSJS Las Vegas 2007</a>. Covers how Atlassian use Lucene as a generic indexing framework for indexing and finding arbitrary collections of complex objects.<br>
(<em>Presented March 2007 - PDF slide show</em>) <span class="anchor" id="line-83"></span></li><li><p class="line891"><a class="http" href="http://www.cnlp.org/apachecon2005/AdvancedLucene.ppt">Advanced Lucene</a> presented by Grant Ingersoll of the <a class="http" href="http://www.cnlp.org">Center for Natural Language Processing</a> at <a class="http" href="http://www.apachecon.com">ApacheCon 2005</a>. Covers term vectors, span queries, using Lucene in a basic question answering system, and several Lucene case studies from <a class="http" href="http://www.cnlp.org">http://www.cnlp.org</a>. The accompanying <a class="http" href="http://www.cnlp.org/apachecon2005">CNLP ApacheCon 2005 Information website</a> contains many working examples using term vectors and span queries. <span class="anchor" id="line-84"></span></li><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/talks/pisa/">Lucene lecture at The University of Pisa</a> (by Doug Cutting)<br>
(<em>Presented November 2004 - lecture notes</em>) <span class="anchor" id="line-85"></span></li><li><p class="line891"><a class="http" href="http://conferences.oreillynet.com/presentations/os2003/hatcher_erik_lucene.pdf">Introducing Lucene</a> (by Erik Hatcher)<br>
(<em>Presented at OS2003, July 2003 - PDF slide show</em>) <span class="anchor" id="line-86"></span></li><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/talks/inktomi/">The Lucene Search Engine: Inktomi Seminar</a> (by Doug Cutting)<br>
(<em>Presented June, 2000 - seminar notes</em>) <span class="anchor" id="line-87"></span><span class="anchor" id="line-88"></span></li></ul><p class="line867">
<h1 id="Training">Training</h1>
<span class="anchor" id="line-89"></span><span class="anchor" id="line-90"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/How-We-Can-Help/Training/">http://www.lucidimagination.com/How-We-Can-Help/Training/</a> - Training on Lucene created by Lucene committers and contributors (Grant Ingersoll, Erik Hatcher and the rest of the team at Lucid Imagination). <span class="anchor" id="line-91"></span></li><li><p class="line891"><a class="http" href="http://www.lucenebootcamp.com">Lucene Boot Camp</a> - Training by Lucene committer Grant Ingersoll. Offered exclusively at <a class="http" href="http://www.apachecon.com">ApacheCon</a>. <span class="anchor" id="line-92"></span><span class="anchor" id="line-93"></span></li></ul><p class="line867">
<h1 id="Corpora">Corpora</h1>
<span class="anchor" id="line-94"></span><ul><li><p class="line862">DMOZ RDF dump - <a class="http" href="http://rdf.dmoz.org/">http://rdf.dmoz.org/</a> <span class="anchor" id="line-95"></span></li><li><p class="line862">CMU newsgroups - <a class="http" href="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html">http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html</a> <span class="anchor" id="line-96"></span></li><li><p class="line862">CMU webpages - <a class="http" href="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/">http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/</a> <span class="anchor" id="line-97"></span></li><li><p class="line862">Reuters - <a class="http" href="http://www.daviddlewis.com/resources/testcollections/reuters21578">http://www.daviddlewis.com/resources/testcollections/reuters21578</a> <span class="anchor" id="line-98"></span></li><li><p class="line862">Enron emails - <a class="http" href="http://www-2.cs.cmu.edu/~enron/">http://www-2.cs.cmu.edu/~enron/</a> <span class="anchor" id="line-99"></span></li><li><p class="line862">JRC-ACQUIS Multilingual Parallel Corpus - <a class="http" href="http://wt.jrc.it/lt/Acquis/">http://wt.jrc.it/lt/Acquis/</a> <span class="anchor" id="line-100"></span><span class="anchor" id="line-101"></span></li></ul><p class="line867">
<h1 id="Other">Other</h1>
<span class="anchor" id="line-102"></span><ul><li><p class="line891"><a class="http" href="http://www.java201.com/resources/browse/38-all.html">Lucene Resources</a> - Articles, Books, FAQs, Forums, Presentations, Wiki. <span class="anchor" id="line-103"></span></li><li><p class="line891"><a class="http" href="http://www.nabble.com/Web-Search-f2787.html">Lucene Search Forum</a> - hosted by <a class="http" href="http://www.nabble.com">Nabble</a> archiving all Lucene and Nutch mailing lists into a searchable archive/forum. The search is coded using Lucene. <span class="anchor" id="line-104"></span></li><li><p class="line891"><a class="http" href="http://www.lucenetutorial.com">LuceneTutorial.com</a> - Tips and tricks, sample applications, code samples, best practices. <span class="anchor" id="line-105"></span></li></ul><span class="anchor" id="bottom"></span></div><p id="pageinfo" class="info" lang="en" dir="ltr">Resources (last edited 2010-05-03 22:31:43 by <span title="SteveRowe @ ist-h335-d03.syr.edu[128.230.84.100]"><a class="nonexistent" href="/lucene-java/SteveRowe" title="SteveRowe @ ist-h335-d03.syr.edu[128.230.84.100]">SteveRowe</a></span>)</p>
<div id="pagebottom"></div>
</div>
<div id="footer">
<ul class="editbar"><li><span class="disabled">Immutable Page</span></li><li class="toggleCommentsButton" style="display:none;"><a href="#" class="nbcomment" onClick="toggleComments();return false;">Comments</a></li><li><a class="nbinfo" href="/lucene-java/Resources?action=info" rel="nofollow">Info</a></li><li>
<form class="actionsmenu" method="GET" action="/lucene-java/Resources">
<div>
<label>More Actions:</label>
<select name="action"
onchange="if ((this.selectedIndex != 0) &&
(this.options[this.selectedIndex].disabled == false)) {
this.form.submit();
}
this.selectedIndex = 0;">
<option value="raw">Raw Text</option>
<option value="print">Print View</option>
<option value="RenderAsDocbook">Render as Docbook</option>
<option value="refresh">Delete Cache</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="SpellCheck">Check Spelling</option>
<option value="LikePages">Like Pages</option>
<option value="LocalSiteMap">Local Site Map</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="RenamePage" disabled class="disabled">Rename Page</option>
<option value="CopyPage">Copy Page</option>
<option value="DeletePage" disabled class="disabled">Delete Page</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="MyPages">My Pages</option>
<option value="show" disabled class="disabled">Subscribe User</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="show" disabled class="disabled">Remove Spam</option>
<option value="show" disabled class="disabled">Revert to this revision</option>
<option value="show" disabled class="disabled">Package Pages</option>
<option value="SyncPages">Sync Pages</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="Load">Load</option>
<option value="Save">Save</option>
</select>
<input type="submit" value="Do">
</div>
<script type="text/javascript">
<!--// Init menu
actionsMenuInit('More Actions:');
//-->
</script>
</form>
</li></ul>
<ul id="credits">
<li><a href="http://moinmo.in/" title="This site uses the MoinMoin Wiki software.">MoinMoin Powered</a></li><li><a href="http://moinmo.in/Python" title="MoinMoin is written in Python.">Python Powered</a></li><li><a href="http://moinmo.in/GPL" title="MoinMoin is GPL licensed.">GPL licensed</a></li><li><a href="http://validator.w3.org/check?uri=referer" title="Click here to validate this page.">Valid HTML 4.01</a></li>
</ul>
</div>
</body>
</html>

View File

@@ -0,0 +1,105 @@
http://www.w3.org/TR/html4/strict.dtd
http://lucene.apache.org/java/3_0_1/api/all/overview-summary.html#overview_description
http://lucene.apache.org/java/3_0_1/gettingstarted.html
http://lucene.grantingersoll.com
http://www.lucidimagination.com/blog/
http://blog.sematext.com/
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
http://www.manning.com/hatcher3/
http://www.amazon.com/Building-Search-Applications-Lucene-Lingpipe/dp/0615204252/
http://www.amazon.co.jp/exec/obidos/ASIN/4774127809/503-9461699-1775907
http://www.lucenebook.com
http://www.amazon.com/exec/obidos/ASIN/1932394281
Amazon.com
http://www.amazon.de/Suchmaschinen-entwickeln-mit-Apache-Lucene/dp/3935042450
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Getting-Started-with-Lucene/
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Optimizing-Findability-in-Lucene-and-Solr/
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Debugging-Relevance-Issues-in-Search/
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Scaling-Lucene-and-Solr/
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Introduction-to-Apache-Lucene-and-Solr/
http://cephas.net/blog/2008/03/30/how-morelikethis-works-in-lucene/
http://schmidt.devlib.org/software/lucene-wikipedia.html
http://marceloochoa.blogspot.com/2007/09/running-lucene-inside-your-oracle-jvm.html
http://www.onjava.com/pub/a/onjava/2007/05/24/using-the-lucene-query-parser-without-lucene.html
http://www.javaworld.com/javaworld/jw-09-2006/jw-0925-lucene.html
http://www-128.ibm.com/developerworks/java/library/wa-lucene2/index.html?ca=drs-
http://www.freesearch.pe.kr/tag/Lucene
http://www-128.ibm.com/developerworks/java/library/wa-lucene/index.html
http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html
http://www.jroller.com/page/wakaleo/?anchor=lucene_a_tutorial_introduction_to
http://blog.dev.sf.net/index.php?/archives/10-Behind-the-Scenes-of-the-SourceForge.net-Search-System.html
SourceForge.net
http://today.java.net/pub/a/today/2005/08/09/didyoumean.html
http://www.developer.com/java/other/article.php/3490471
http://www.theserverside.com/tt/articles/article.tss?l=ILoveLucene
http://javaboutique.internet.com/tutorials/HTMLParser/article.html
http://bilgidata.com/localhost/bilgidata/yazi.jsp@dosya=a_lucene.xml.html
http://www.chedong.com/tech/lucene.html
http://javatechniques.com/public/java/docs/basics/lucene-memory-search.html
http://www.javaranch.com/newsletter/200404/Lucene.html
http://www.darksleep.com/lucene/
http://www-igm.univ-mlv.fr/~dr/XPOSE2003/lucene/articleLucene.html
http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html
http://builder.com.com/5100-6389-5054799.html
http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
http://www-106.ibm.com/developerworks/library/j-lucene/
http://www.xml.com/pub/a/ws/2003/05/13/email.html
http://www.onjava.com/pub/a/onjava/2003/03/05/lucene.html
http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html
http://javangelist.snipsnap.org/space/Lucene-Mini-Tutorial
http://www.javaworld.com/javaworld/jw-09-2000/jw-0915-lucene.html
http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=109
http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=108
http://www.lucidimagination.com/index.php?option=com_content&amp;task=view&amp;id=113
http://lucene.sourceforge.net/publications.html
http://lucene.sourceforge.net/publications.html
http://people.apache.org/~buschmi/apachecon/AdvancedIndexingLuceneAtlanta07.ppt
http://www.us.apachecon.com/us2007/
http://people.apache.org/~yonik/presentations/lucene_intro.pdf
http://www.eu.apachecon.com
http://www.cnlp.org/presentations/slides/AdvancedLuceneEU.pdf
http://www.cnlp.org
http://www.eu.apachecon.com
http://blogs.atlassian.com/rebelutionary/downloads/tssjs2007-lucene-generic-data-indexing.pdf
http://www.atlassian.com/
http://javasymposium.techtarget.com/lasvegas/index.html
http://www.cnlp.org/apachecon2005/AdvancedLucene.ppt
http://www.cnlp.org
http://www.apachecon.com
http://www.cnlp.org
http://www.cnlp.org
http://www.cnlp.org/apachecon2005
http://lucene.sourceforge.net/talks/pisa/
http://conferences.oreillynet.com/presentations/os2003/hatcher_erik_lucene.pdf
http://lucene.sourceforge.net/talks/inktomi/
http://www.lucidimagination.com/How-We-Can-Help/Training/
http://www.lucidimagination.com/How-We-Can-Help/Training/
http://www.lucenebootcamp.com
http://www.apachecon.com
http://rdf.dmoz.org/
http://rdf.dmoz.org/
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/
http://www.daviddlewis.com/resources/testcollections/reuters21578
http://www.daviddlewis.com/resources/testcollections/reuters21578
http://www-2.cs.cmu.edu/~enron/
http://www-2.cs.cmu.edu/~enron/
http://wt.jrc.it/lt/Acquis/
http://wt.jrc.it/lt/Acquis/
http://www.java201.com/resources/browse/38-all.html
http://www.nabble.com/Web-Search-f2787.html
http://www.nabble.com
http://www.lucenetutorial.com
LuceneTutorial.com
ist-h335-d03.syr.edu
128.230.84.100
ist-h335-d03.syr.edu
128.230.84.100
http://moinmo.in/
http://moinmo.in/Python
http://moinmo.in/GPL
http://validator.w3.org/check?uri=referer
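
A minimal sketch (not part of this commit) of how the fixture pair above is meant to be used: run the new StandardTokenizer over the wiki page text and keep only tokens typed &lt;URL&gt;, then compare against the expected URL list. The class name, sample text, and Version.LUCENE_40 constant are illustrative assumptions; the test shown later in this commit implements the same idea with a URLFilter.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

// Illustrative sketch: print just the <URL>-typed tokens from some text.
public class URLTokenDump {
  public static void main(String[] args) throws IOException {
    String text = "Docs live at http://lucene.apache.org/java/ on the wiki.";
    StandardTokenizer tokenizer =
        new StandardTokenizer(Version.LUCENE_40, new StringReader(text)); // version constant is an assumption
    tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // don't truncate long URLs
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Token types are interned constants, so == comparison works,
      // mirroring the URLFilter in the test below.
      if (type.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
        System.out.println(term.toString());
      }
    }
    tokenizer.end();
    tokenizer.close();
  }
}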

View File

@@ -0,0 +1,311 @@
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.util.Arrays;
/**
* Copyright 2004 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);
public void testMaxTermLength() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
sa.setMaxTokenLength(5);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
}
public void testMaxTermLength2() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
sa.setMaxTokenLength(5);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
}
public void testMaxTermLength3() throws Exception {
char[] chars = new char[255];
for(int i=0;i<255;i++)
chars[i] = 'a';
String longTerm = new String(chars, 0, 255);
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
}
public void testAlphanumeric() throws Exception {
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
assertAnalyzesTo(a, "2B", new String[]{"2b"});
}
public void testUnderscores() throws Exception {
// underscores are delimiters, but not in email addresses (below)
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
}
public void testDelimiters() throws Exception {
// other delimiters: "-", "/", ","
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
}
public void testApostrophes() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
// possessives are actually removed by StandardFilter, not the tokenizer
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
}
public void testTSADash() throws Exception {
// t and s had been stopwords in Lucene <= 2.0, which made it impossible
// to correctly search for these terms:
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
// 'a' is still a stopword:
assertAnalyzesTo(a, "a-class", new String[]{"class"});
}
public void testCompanyNames() throws Exception {
// company names
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
}
public void testLucene1140() throws Exception {
try {
ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
} catch (NullPointerException e) {
fail("Should not throw an NPE and it did");
}
}
public void testDomainNames() throws Exception {
// Current Lucene should not show the bug
ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
// domain names
assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
// Note the trailing "."; see https://issues.apache.org/jira/browse/LUCENE-1068.
// the following should be recognized as HOST:
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
// 2.3 should show the bug
a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
// 2.4 should not show the bug
a2 = new ClassicAnalyzer(Version.LUCENE_24);
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
}
public void testEMailAddresses() throws Exception {
// email addresses, possibly with underscores, periods, etc
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
}
public void testNumeric() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
}
public void testTextWithNumbers() throws Exception {
// numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
}
public void testVariousText() throws Exception {
// various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
}
public void testAcronyms() throws Exception {
// acronyms have their dots stripped
assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
}
public void testCPlusPlusHash() throws Exception {
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
assertAnalyzesTo(a, "C++", new String[]{"c"});
assertAnalyzesTo(a, "C#", new String[]{"c"});
}
public void testKorean() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
// Compliance with the "old" JavaCC-based analyzer, see:
// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
public void testComplianceFileName() throws Exception {
assertAnalyzesTo(a, "2004.jpg",
new String[]{"2004.jpg"},
new String[]{"<HOST>"});
}
public void testComplianceNumericIncorrect() throws Exception {
assertAnalyzesTo(a, "62.46",
new String[]{"62.46"},
new String[]{"<HOST>"});
}
public void testComplianceNumericLong() throws Exception {
assertAnalyzesTo(a, "978-0-94045043-1",
new String[]{"978-0-94045043-1"},
new String[]{"<NUM>"});
}
public void testComplianceNumericFile() throws Exception {
assertAnalyzesTo(
a,
"78academyawards/rules/rule02.html",
new String[]{"78academyawards/rules/rule02.html"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithUnderscores() throws Exception {
assertAnalyzesTo(
a,
"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithDash() throws Exception {
assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
new String[]{"<NUM>"});
}
public void testComplianceManyTokens() throws Exception {
assertAnalyzesTo(
a,
"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+ "safari-0-sheikh-zayed-grand-mosque.jpg",
new String[]{"money.cnn.com", "magazines", "fortune",
"fortune", "archive/2007/03/19/8402357", "index.htm",
"safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
"<ALPHANUM>", "<HOST>"});
}
public void testJava14BWCompatibility() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
}
/**
* Make sure we skip wicked long terms.
*/
public void testWickedLongTerm() throws IOException {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));
char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
Arrays.fill(chars, 'x');
Document doc = new Document();
final String bigTerm = new String(chars);
// This produces a too-long term:
String contents = "abc xyz x" + bigTerm + " another term";
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
// Make sure we can add another normal document
doc = new Document();
doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir, true);
// Make sure all terms < max size were indexed
assertEquals(2, reader.docFreq(new Term("content", "abc")));
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
assertEquals(1, reader.docFreq(new Term("content", "term")));
assertEquals(1, reader.docFreq(new Term("content", "another")));
// Make sure position is still incremented when
// massive term is skipped:
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
MultiFields.getDeletedDocs(reader),
"content",
new BytesRef("another"));
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());
// Make sure the doc that has the massive term is in
// the index:
assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
reader.close();
// Make sure we can add a document with exactly the
// maximum length term, and search on that term:
doc = new Document();
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
sa.setMaxTokenLength(100000);
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
writer.addDocument(doc);
writer.close();
reader = IndexReader.open(dir, true);
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
reader.close();
dir.close();
}
}
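
For contrast between the two classes exercised above and below: a minimal, self-contained sketch (not part of this commit) that dumps the tokens and token types ClassicAnalyzer and the UAX#29-based StandardAnalyzer produce for the same input. The class name, sample text, and Version.LUCENE_40 constant are illustrative assumptions.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

// Illustrative sketch: compare token/type output of the old and new analyzers.
public class ClassicVsStandardDump {
  static void dump(String label, Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    System.out.print(label + ":");
    while (ts.incrementToken()) {
      System.out.print(" " + term.toString() + "/" + type.type());
    }
    System.out.println();
    ts.end();
    ts.close();
  }

  public static void main(String[] args) throws IOException {
    String text = "first.lastname@example.com www.nutch.org";
    // ClassicAnalyzer keeps the pre-UAX#29 grammar (<EMAIL>, <HOST>, ...).
    dump("classic", new ClassicAnalyzer(Version.LUCENE_40), text); // version is an assumption
    // StandardAnalyzer now applies the UAX#29 word break rules plus
    // URL and email recognition.
    dump("standard", new StandardAnalyzer(Version.LUCENE_40), text);
  }
}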

View File

@@ -1,35 +1,33 @@
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Copyright 2004 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -38,277 +36,365 @@ import org.apache.lucene.util.BytesRef;
*/
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
private Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT);
public void testMaxTermLength() throws Exception {
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
sa.setMaxTokenLength(5);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];
Arrays.fill(whitespace, ' ');
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
public void testMaxTermLength2() throws Exception {
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
sa.setMaxTokenLength(5);
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
}
private Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
public void testMaxTermLength3() throws Exception {
char[] chars = new char[255];
for(int i=0;i<255;i++)
chars[i] = 'a';
String longTerm = new String(chars, 0, 255);
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
}
public void testAlphanumeric() throws Exception {
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
assertAnalyzesTo(a, "2B", new String[]{"2b"});
}
public void testUnderscores() throws Exception {
// underscores are delimiters, but not in email addresses (below)
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
}
public void testDelimiters() throws Exception {
// other delimiters: "-", "/", ","
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
}
public void testApostrophes() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
// possessives are actually removed by StandardFilter, not the tokenizer
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
}
public void testTSADash() throws Exception {
// t and s had been stopwords in Lucene <= 2.0, which made it impossible
// to correctly search for these terms:
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
// 'a' is still a stopword:
assertAnalyzesTo(a, "a-class", new String[]{"class"});
}
public void testCompanyNames() throws Exception {
// company names
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
}
public void testLucene1140() throws Exception {
try {
StandardAnalyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
} catch (NullPointerException e) {
fail("Should not throw an NPE and it did");
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer);
}
};
/** Passes through tokens with type "<URL>" and blocks all other types. */
private class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
public void testDomainNames() throws Exception {
// Current Lucene should not show the bug
StandardAnalyzer a2 = new StandardAnalyzer(TEST_VERSION_CURRENT);
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
// domain names
assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
// Note the trailing "."; see https://issues.apache.org/jira/browse/LUCENE-1068.
// the following should be recognized as HOST:
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
// 2.3 should show the bug
a2 = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
// 2.4 should not show the bug
a2 = new StandardAnalyzer(Version.LUCENE_24);
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
}
public void testAmharic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
}
public void testArabic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
}
public void testAramaic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
}
public void testBengali() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
}
public void testFarsi() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
}
public void testGreek() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
}
public void testEMailAddresses() throws Exception {
// email addresses, possibly with underscores, periods, etc
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
public void testThai() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
new String[] { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" });
}
public void testLao() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
new String[] { "ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ" });
}
public void testTibetan() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
"མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
"", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
}
/*
* For chinese, tokenize as char (these can later form bigrams or whatever)
*/
public void testChinese() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 ",
new String[] { "", "", "", "", "", "", ""});
}
public void testEmpty() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
}
/* test various jira issues this analyzer is related to */
public void testLUCENE1545() throws Exception {
/*
* Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
* The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
* Expected result is only one token "moͤchte".
*/
BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
/* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception {
// alphanumeric tokens
BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
}
public void testNumeric() throws Exception {
public void testDelimitersSA() throws Exception {
// other delimiters: "-", "/", ","
BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
}
public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
}
public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
public void testTextWithNumbers() throws Exception {
public void testTextWithNumbersSA() throws Exception {
// numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
}
public void testVariousText() throws Exception {
public void testVariousTextSA() throws Exception {
// various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
}
public void testAcronyms() throws Exception {
// acronyms have their dots stripped
assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
}
public void testCPlusPlusHash() throws Exception {
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
assertAnalyzesTo(a, "C++", new String[]{"c"});
assertAnalyzesTo(a, "C#", new String[]{"c"});
}
public void testKorean() throws Exception {
public void testKoreanSA() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
public void testOffsets() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20});
}
public void testTypes() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
public void testWikiURLs() throws Exception {
Reader reader = null;
String luceneResourcesWikiPage;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
luceneResourcesWikiPage = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != luceneResourcesWikiPage
&& luceneResourcesWikiPage.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, luceneResourcesWikiPage, urls);
}
public void testEmails() throws Exception {
Reader reader = null;
String randomTextWithEmails;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithEmails = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithEmails
&& randomTextWithEmails.length() > 0);
BufferedReader bufferedReader = null;
String[] emails;
try {
List<String> emailList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
emailList.add(line);
}
}
emails = emailList.toArray(new String[emailList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != emails && emails.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(emailAnalyzer, randomTextWithEmails, emails);
}
// Compliance with the "old" JavaCC-based analyzer, see:
// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
public void testComplianceFileName() throws Exception {
assertAnalyzesTo(a, "2004.jpg",
new String[]{"2004.jpg"},
new String[]{"<HOST>"});
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithURLs = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithURLs
&& randomTextWithURLs.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, randomTextWithURLs, urls);
}
public void testComplianceNumericIncorrect() throws Exception {
assertAnalyzesTo(a, "62.46",
new String[]{"62.46"},
new String[]{"<HOST>"});
}
public void testComplianceNumericLong() throws Exception {
assertAnalyzesTo(a, "978-0-94045043-1",
new String[]{"978-0-94045043-1"},
new String[]{"<NUM>"});
}
public void testComplianceNumericFile() throws Exception {
assertAnalyzesTo(
a,
"78academyawards/rules/rule02.html",
new String[]{"78academyawards/rules/rule02.html"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithUnderscores() throws Exception {
assertAnalyzesTo(
a,
"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithDash() throws Exception {
assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
new String[]{"<NUM>"});
}
public void testComplianceManyTokens() throws Exception {
assertAnalyzesTo(
a,
"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+ "safari-0-sheikh-zayed-grand-mosque.jpg",
new String[]{"money.cnn.com", "magazines", "fortune",
"fortune", "archive/2007/03/19/8402357", "index.htm",
"safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
"<ALPHANUM>", "<HOST>"});
}
public void testJava14BWCompatibility() throws Exception {
StandardAnalyzer sa = new StandardAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
sa = new StandardAnalyzer(Version.LUCENE_31);
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test\u02C6test" });
}
/**
* Make sure we skip wicked long terms.
*/
public void testWickedLongTerm() throws IOException {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
Arrays.fill(chars, 'x');
Document doc = new Document();
final String bigTerm = new String(chars);
// This produces a too-long term:
String contents = "abc xyz x" + bigTerm + " another term";
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
// Make sure we can add another normal document
doc = new Document();
doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir, true);
// Make sure all terms < max size were indexed
assertEquals(2, reader.docFreq(new Term("content", "abc")));
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
assertEquals(1, reader.docFreq(new Term("content", "term")));
assertEquals(1, reader.docFreq(new Term("content", "another")));
// Make sure position is still incremented when
// massive term is skipped:
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
MultiFields.getDeletedDocs(reader),
"content",
new BytesRef("another"));
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());
// Make sure the doc that has the massive term is in
// the index:
assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
reader.close();
// Make sure we can add a document with exactly the
// maximum length term, and search on that term:
doc = new Document();
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
sa.setMaxTokenLength(100000);
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
writer.addDocument(doc);
writer.close();
reader = IndexReader.open(dir, true);
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
reader.close();
dir.close();
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
wordBreakTest.test(a);
}
}

View File

@ -0,0 +1,204 @@
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
public void testHugeDoc() throws IOException {
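// A note on the assumed intent: 4094 leading spaces put "testing 1234"
// right at a 4096-char boundary, so the first token presumably straddles
// an internal scanner buffer refill.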
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];
Arrays.fill(whitespace, ' ');
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
private Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29Tokenizer(reader);
return new TokenStreamComponents(tokenizer);
}
};
public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
}
public void testAmharic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
}
public void testArabic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
}
public void testAramaic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
}
public void testBengali() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
}
public void testFarsi() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
}
public void testGreek() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
}
public void testThai() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
new String[] { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" });
}
public void testLao() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
new String[] { "ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ" });
}
public void testTibetan() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
"མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
"", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
}
/*
* For chinese, tokenize as char (these can later form bigrams or whatever)
*/
public void testChinese() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 ",
new String[] { "", "", "", "", "", "", ""});
}
public void testEmpty() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
}
/* test various jira issues this analyzer is related to */
public void testLUCENE1545() throws Exception {
/*
* Standard analyzer does not correctly tokenize the combining character U+0364 COMBINING LATIN SMALL LETTER E.
* The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
* The expected result is the single token "moͤchte".
*/
BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
/* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception {
// alphanumeric tokens
BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
}
public void testDelimitersSA() throws Exception {
// other delimiters: "-", "/", ","
BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
}
public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
}
public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
public void testTextWithNumbersSA() throws Exception {
// numbers
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
}
public void testVariousTextSA() throws Exception {
// various
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
}
public void testKoreanSA() throws Exception {
// Korean words
BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
public void testOffsets() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20});
}
public void testTypes() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
wordBreakTest.test(a);
}
}

View File

@ -0,0 +1,265 @@
dJ8ngFi@avz13m.CC
JCAVLRJg@3aqiq2yui.gm
kU-l6DS@[082.015.228.189]
37layCJS@j5NVP7NWAY.VG
"%U@?\B"@Fl2d.md
aH3QW@tw8uo2.eu
Bvd#@tupjv.sn
SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt
DvdUJk@61zwkit7dkd3rcq4v.BD
~+Kdz@3mousnl.SE
C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY
}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM
lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae
V85E9Hx7@vpf0bs.bz
MGBg2@7F3MJTCCPROS8YETM0B4-C9P7WXKGFB0.RU
rsBWOCJ@lYX0SILY4L53Z3VJPSF6.pwrawr.vdpoq.nz
dIyLrU@9A40T2ZIG7H8R.t63.tv
6dAsZKz@d33XR.IR
EnqCC@2bk6da6y08.LI
AQ9yV@Mfqq32nexufgxzl4o7q5jv3kd.lb
lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H
b6/zomNkV@8jwm-he.IN
5FLuakz.hXVkuqDt@iBFP83V6MNI3N0FRWJ9302DS-0KHRV6O.1bf59kj64uj5b6e2zfn.cm
RhIwkU@58vmet9yfddpg.3adkmhrv1px.AO
nEBk6w2Q@Bb5ib.2pay.so
AlW5CMAn@qos-53u.j91qq96d4en129szf7099kxv5lo6yo.gm
QPYBDV3.Ah/h8U@x3v444pzi.1cvgokam.PW
5Iwbiq7@p9s-2pixps9jwzyhfroxqivw8sv90r.xn--wgbh1c
AaFU9L@3yj1xqf1.cz9.ac
|iCmQ1@rum6w0a7wt.3QLD.ht71.cx
EhLTUjo@rEK.sJ44H0.GR
bHEbq3Rp@33.lKSSMY.9xaurtfle9xe.iu4810l.fj
eFcup.cPPEW@[1ae]
p907@bk3o.fvtmw2m2.Uutr83x2yt4.2nuin.EU
PpW2L5.QgP2n@9rz7.a5qi.oRH1Z.8ov.UZ
o8UgG5fewm4vr9Ai5wPS@sgh.2F-OLKLZ81DIUET.xpya0vtx.fj
aixQH@z-y.AR
jVTeWQfL."M#~t Q"@1e.oglq.ubk.SZ
6e5QQuy@N7.2cuw3x2wpddf.paycp1pc.AI
IqG6Fl@[220.112.120.54]
lWHH4eWSn@tbxyb7.jhzqxrk.lv
P1zO*RaAr@[111.99.108.22]
d00gy@[4TC]
1yNINoBU@[136.003.010.238]
Ms8ox@[_3Tuehr]
wtWDNo@1sjmcbbli196-765mt7m8o8hywft.7-ga6rsnum8v.np
"x)yO "@7le5o2rcud5ngs.Qmfmq.Jfxv8.Zznv6t6il.MIL
1hXd@f8.1kxqd3yw4j6zmb7l7.US
"8}(\$"@mu2viak0nh4sj5ivgpy1wqie.HK
Th7XoAs5@ggdb.BI
5iDbhah.xdtF1x@[59.55.12.243]
j2ovALlgm2Wcwx@5jphzt.TN
ZlaP~E.4Yk1K0F@lF6VN.M5.Nj.PRO
cFCvIJAw@l93H0R1W6V4RI0AY7RLRQR4KOEVQPEG-PDTF03V4D9A0.xZZK5.lu
8Ju2AW@1n.h7.vu
"\nkP]{"@[Vej\yo\HD]
fKWC?@qgcb.xn--mgbaam7a8h
L4BbaB@hv1.BIZ
WvSmV@qpx15vzmbtxzvi-syndl1.ML
"3|PX~Cbdq"@U3vp-7k.8c4q3sgpwt6sochundzhx.museum
LjH9rJTu@tkm.gy
vQgXEFb@maxmrbk-5a5s6o.6MZZ6IK.awjbtiva7.IL
6TVbIA@r50eh-a.la
AaASl@Bsteea.qHXE3Q5CUJ3DBG.S2hvnld.4WJWL.fk
"CN;\-z 6M"@86.qc7s.23p.ET
zX3=O3o@Yjov.7g660.8M88OJGTDC5.np
QFZlK1A@4W47EIXE.KY
1guLnQb07k@ab.ccemuif2s.lb
Jddxj@[111.079.109.147]
Hj06gcE@[105.233.192.168]
u8?xicQ@[i\21I]
CczYer}W@bezu6wtys9s.lft3z.mobi
OmpYhIL@6GJ7P29EIE-G63RDW7GLFLFC0M1.AERO
2RRPLqO@8lh0i.vm7xmvvo-r5nf0x.CY
TOc!BhbKz@F-myy7.kQWSUI7S3.net
"0\!P?".shQVdSerA@2qmqj8ul.hm
LTLNFsgB@[191.56.104.113]
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
VGLn@z3E2.3an2.MM
TWmfsxn@[112.192.017.029]
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
CjaPC63@['\RDrwk]
Ayydpdoa@tdgypppmen.wf
"gfKP9"@jo3-r0.mz
aTMgDW4@t5gax.XN--0ZWM56D
mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
XtAhFnq@[218.214.251.103]
x0S8uos@[109.82.126.233]
ALB4KFavj16pODdd@i206d6s.MM
grxIt96.46nCf@nokjogh2l4.nCMWXG.yt
Fgbh7@2rxkk0bvkk-v3evd-sh56gvhxlh.hhjcsg36j8qt98okjbdj9z574xdpix59zf6h80r.Gyb4rrxu.ve
uo0AX41@Fhlegm1z57j-qvf5.p8jo6zvm.sc
sjn4cz@9ktlwkqte.bv
b04v0Ct@[243.230.224.190]
F!FUbQHU@uvz7cu1l.ciz4h2.93U4V.gb
6CHec@nONUKT.nl
zbmZiXw@yb.bxxp.3fm457.va
"/GdiZ7f"@[221.229.46.3]
NJde8Li@f7a.g51VICBH.cy
6IeAft@e-3fp.Nkh7nm8.v8i47xvrv27r.pf
TC*Qopzb@xIOB3.6egz4.m-24t5wmxtmco4iy8g91o66mjgha1vjlepyffott.E5ta.p9.CF
"_3Sc_"@[193.165.124.143]
W0dwHf@[25.174.65.80]
qPkkP0@4k0vs.oaak2z.3JMTI.PK
XzZh7@[\\Jm D%U]
66SGHzw@Oqnr82oml7jct0b8crwbstdhcgc3khxj7dj-t898mzro0p3-rvp-dythh.TN
ot4tPF@[AY\j]
e4seIFbl@cib.cg
B2w025e@r2H7BW16B24DG1S5DED.bg
atweEde@blk-3y.mgvoh6l9my.F6.FI
uDoPcRGW@rEBD5LUT.ly
2KQhx@Bba.u--9b5bc0.NF
tKWc2VjVRYD@[254.190.162.128]
wc3W16^@D3v2uxqqeclz.w1fd529m.DM
Njg@6S8MA.HK
"L\^4z]92"@0qp--walx.MIL
X08sWFD@62GNK.tN4.f1YXX.ug
eK6Bz1Bu@[rX;J&036]
"~`o\: "@hO4UKF.oZBWV56B.cmn.DJ
lcgUakx@[pjGd&i2]
BqdBTnv3c@wf35nwaza.ME
"a#Um{:\'\bX:"@in7tjo.uw8wil.gp
ApIbER8'@[&Y]
JTsM0c!s9CzEH@Sd.mh
hy2AOUc@uqxzl7v0hl2nchokqit9lyscxaa0jaqya1wek5gkd.NC
pY7bAVD4r@[,>T*R T]
!0axBT@03-gdh1xmk3x9.GH
vbtyQBZI@20al5g.ro6ds4.Bsg15f5.NU
2^ZhSK-FFYOh@Z2iku.rg.Z0ca1.gs
G1RLpOn."yfJpg["@mXEV8.mu
yrBKNkq@a2a1.Aifn.Ta2.dj
Wok5G@b5aqobvi5.ni
nXz9i.=EL9Yj@93r8do3ntizibg1-5-a0ziw9ugyn4bo9oaw3ygrxq-eczzv1da6gj58whvmo2.rs
Dp63hd@B1kbahyq.PL
y01rn27SFq@o0HNP8.C5.i4rvj8j338zgter7er5rkwyo5g.atnc0iuj2ke.8or6ekq0x.IO
0RiEo@08mnvbu.p661ernzjz5p7nbyix5iuj.cig5hgvcc.SO
Dwxab5@1sx5y3-umsy72nl.74lwye5.DJ
IvdZVE4xRk@0vw7ajl.AR
CvQxhXJ@d5a7qnx.ke
n7MxA4~@[4(R]
RFGzu3hD0@wbh4.sm
eOADW}BcNG@2568p3b4v.Xq3eksr.GP
AsAMWriW7.zSDQSAR6@Gg2q4rtgr.GG
cDCVlA0t@[20.116.229.216]
c=yJU+3L5@n2x3xhksf.gvreani.MZ
wfYnaA4@lzojy.4oii6w6sn-p9.kh
kdeOQ5F@vD5Y.wmmv.7rswz.1zelobcp5qxxwzjn.fOEJZ.KM
ppULqb2Z@Hv9o2ui.AO
tOHw@[IPv6:3500:8B6C::CB5E:1.124.160.137]
MWLVsL@7nhliy.O8mjon3rj-kb.t8d6bcpa5i.au
BN0EY@hh9v.p9bwgs.TN
RgiAp@d9ln.bf
PBugBo@97gcz.DJ
Fh#dKzbI@[+_]
wyqU-C9hXE@wPRBUI-WS9HXE19.LV
muC?Js@[IPv6:47FB:5786:4b5e::5675]
yLTT2xV@wdoszw9k1ork-z-t.kq.l3SEO.Lb4jx0.NA
6zqw.yPV4LkL@dA3XKC.eg
S5z9i7i3s@Vzt6.fr
L|Sit6s@9cklii1.tf
yWYqz@mw-9k.FJ
Knhj419mAfftf@R26hxll64.3qtdx6g.AL
aZYHUr6@Shyn76c67.65grky.am
ZYxn6Px@di0cqhtg.hu
"#mLl"@w1sc0g3vm.j1o4o9g.GW
WYJcFp@653xk-89oprk2im.iemhx9.CC
y5AXi@[Oa #]
nZErAGj@6sq3-p.r8KQ.aero
OMq5sBK@udg-5zp1.Dory85.SG
2bymd@Ojla1hvfpw8rrihrx.cy
5OMbw0@r2d8cn75.1VR2BJ0J3A8PY.gc0mljc-h.COOP
al6X^pQkx@pyj--2hp.lbet.TN
NkzPW4f@2-0.aaoqccwrgi4olytac0imp6vvphsuobrr115eygh2xwkvzeuj.tl
"4-b9|/,\e]h]2"@9-iiahsdlzv-v65j.FK
g8Pv2hb9@[166.176.68.63]
"IA~".Tn03w7@[\>J?]
E6aK9TaJ@j0hydmxhkq2q.Svku4saky.MU
rdF2Zl1@9fsic.C17pw9o0.vn
pCKjPa88DG&x5a@4ha07ia2jk.xk7xe8.PM
qgLb5m@nynqp.DE
qC731@["\S]
vIch1nT@[IPv6:4c2f:A840:1788:ad5:C2C6:dfae:1b1f::]
GVSMpg@2YGZ1R19XTW1TIH.Re3vg30u1xq6v7cj1wf-6m14939wvgqbl.93mztd.SG
0jq4v7PMxm@eq6teog.kO6LR3.x2p.53yltrsvgpd3.RO
zdGLZD0P@i2JQNM8.816oja8pkk5zkvyx.KM
Jp#hSH@74zkerax4.31kr.7c9-yuk.mp
Kx^0oZn@oFFA-URZ13B34J.DK
sub52@aoq7.iHF.CH
jfVSq9oAR2D@iGU0.7bp3x.4cr.sz
nalgU@Yfpbdcv8a5.n9kwz6kyi2u.thic-rws.af.TG
=uC5qVT@56g530cltpekrw.pt
QR5&kx@7qhi3bhav5ga0eva.b0sdom.bb
8DZQ7@dtr16r89fdw59q.cf
Q4pNw@6o-9weojl3r7.LS
*mfOc_CN@[G\ 3]
2p`tbG@c767inolrav0hg6a-ucs.y0.tw
Rop{cgBy@Wekdh0xns2um.UK
t*p05lV@017y.MR
7ZxO80@Dovepwr4l.qxfzchrn1.es8ul0vavi6gqy82.K1hc7.INT
C_Iphp@5t4rtc.id
q+m2x@Cfw.1tm52-kr.BO
47NIL@Hl68os0.66l9bsf2q.SC
vi0LyF9O@p74jz6mxby.it
xQ4jU@rQVWLWAD3T8.4-lnu.AZ
zea_0Kr@[97.59.144.249]
5HP1k|s@[068.150.236.123]
5XJZlmYk.3Du5qee@[072.023.197.244]
AvNrIHB0@[+n}oV]
"!N7/I\zhh"@[204.037.067.146]
vlJODxFF@xFO6V.i1.fgad6bjy.NO
qDe0FA@xpp1le82ndircjgyrxyzkrqu3il.oUKHVV6829P-16JILWG62KN.cr
pMF64@wssq6kh9uhxk.cA2YZVBV4JW.xX585A.ru
G3meE@[^!'OO]
"1@0UYJl"@vplkx.d2n.i3tcx3aaxut.lbb3v9.ldq.me
iTH0QND@wg9sizy.lr
9kF?opSTo9rSDWLo&W&6@xrh32ibf.F0zb6kb.BJ
a0FI1m@1olkdpz.W70a3w8qmk3.NA
"0H}r}X(p\M`/x"@rY48LPH.Axy.Ue624.TV
AQL6YBFb@Hxawb15okz.y4.y5c0e.bt
PEaNVR@m8NH9BVX5L096DRM7YTR.er
diI`Q@i5fpkuc.7zg2av.D6tzqq.CK
TCN0-Z@Tezeq9ejv.ekeab8hz14hui.il
05SnFh@jZ85JXZ.1RO99W5FYK3.uyv7g15.MP
B2Z76Rn@9yce0shfsydxetu1v4-y.rBU2M0.6ik8oapv0zho6n653il25gu4rd216uw03.MG
vGZ2K@C2osgjtel5uerwn.riihbabhh41ve84.r3l.vH6S64.vn
Nv2ZgL@[037.054.177.155]
WsdI2W@i1ULFQ1.79qfph2.eg
vJfpTf3@Hh4x2h.25m0idq3.fr
oRqbgftr@l6jg0.TV
NiynsKb@k9BTX4-FV.hc0skm-o.lv
w9uGwf@4hop8.Jb9655is.nr
"NVUW+"@6jbe.KM
QusHU6JMR@0RXKIZNH76C3.Oqwcfr779e.MH
}C5IwKv1S45vlmPaaVHhF@[IPv6:EBF6::]
T7rXlYc@4AI1LM.2o.uk
uuCiDC6c@Maar3.65hlg-wf.t3pt9.FJ
w2mNOvIUh@dx3ep7ew.ru
b#Add@9hpopo.Xg3tbjchdpt.TT
NtrgJjfj."NBwi"@[142.085.096.018]
00lF9UB@2NR2.rs
MPr42ye9@p08lcrzs.4bzxfznsh2bhgsa.CX
awwLoYLn~c2LfTEVT@fwksx.qoj94r11kw19k50k3.gd
gRZ5w9epm@p6adico3auugj5qklec.Sm4bx5.li
zfdZ67Y@1azhq.dl3xxzni2.rrj.lpclc6g4d.sl
vTWwSD4fb@uBSOHD.3g.u3mb.gf
cYFVxcC6E@F9g0b.n1339r.AU
pnuXl@s1alo2.tc
lKy64zp.Cbg8BM@y0S.6uiux8h8.0udipt.ma
|9FDgc@vbrz.3L.av4kmt.rs
skcHAu7@xD715N1.DZ
BfcgHK3@[220.136.9.224]
LCOEag@Gwm.drsa0.GL
qrNZtp3vO@a0gr.8j9cvcgy0p-3.HN
lfW2rei20XWSmpQoPY1Dl@[(N &c]
WFBBEv|@q7R2J.oy48740.pm
6H6rPx@zVJ40.xgyat.cLUX6SVFJWMLF9EZ2PL8QQEU7U1WT0JW3QR8898ALFGKO18CF1DOX89DR.1tfu30mp.CA
ytG@J4auwv4has.PS
"X;+N1A\A "@rc9cln0xyy8wa6axedojj9r0slj0v.Luy9i6ipqrz74lm5-n6f1-2srq5vdo-opef747ubdykv5hc.2lztpe.er
DQTmqL4LVRUvuvoNb8=TT@2up3.PY
NC0OPLz@kcru1s0mu.name
kBoJf{XaGl@[248.166.223.221]
pEjZPm8A@v956Y7GQV.5uu6.Ribgf20u.6e.0do1nki1t.ahy.6iy.sm
pIFWkl2@w9N0Q.MC
p=VTtlpC@w3ttqb.FO

View File

@ -0,0 +1,206 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
print STDERR "Usage: $script_name -v <version>\n";
print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
if ($version);
exit 1;
}
my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
my $scripts_url = "${url_prefix}/Scripts.txt";
my $line_break_url = "${url_prefix}/LineBreak.txt";
my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
my $underscore_version = $version;
$underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
package org.apache.lucene.analysis.core;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
* This class was automatically generated by ${script_name}
* from: ${url_prefix}/auxiliary/WordBreakTest.txt
*
* WordBreakTest.txt indicates the points in the provided character sequences
* at which conforming implementations must and must not break words. This
* class tests for expected token extraction from each of the test sequences
* in WordBreakTest.txt, where the expected tokens are those character
* sequences bounded by word breaks and containing at least one character
* from one of the following character sets:
*
* \\p{Script = Han} (From $scripts_url)
* \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
public class ${class_name} extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception {
__HEADER__
my $codepoints = [];
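# Mark the full-width digits U+FF10..U+FF19 as wanted up front (listed as
# their own character set in the generated class Javadoc above)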
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
open OUT, ">$output_path"
|| die "Error opening '$output_path' for writing: $!";
print STDERR "Writing '$output_path'...";
print OUT $header;
for my $line (@tests) {
next if ($line =~ /^\s*\#/);
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
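# For the sample line above, this loop would emit roughly the following
# (neither U+0001 nor U+0300 is in the wanted set, so no tokens are
# expected):
#
#   // ÷ 0001 × 0300 ÷ # ...
#   assertAnalyzesTo(analyzer, "\u0001\u0300",
#                    new String[] {  });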
my ($sequence) = $line =~ /^(.*?)\s*\#/;
print OUT " // $line\n";
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
my $test_string = $sequence;
$test_string =~ s/\s*÷\s*/\\u/g;
$test_string =~ s/\s*×\s*/\\u/g;
$test_string =~ s/\\u000A/\\n/g;
$test_string =~ s/\\u000D/\\r/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
my @tokens = ();
for my $candidate (split /\s*÷\s*/, $sequence) {
my @chars = ();
my $has_wanted_char = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
push @chars, $1;
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
}
}
if ($has_wanted_char) {
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
}
}
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
print OUT " new String[] { ";
print OUT join(", ", @tokens), " });\n\n";
}
print OUT " }\n}\n";
close OUT;
print STDERR "done.\n";
# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and extracts code
# points assigned any of the given property values, defining the
# corresponding array position in the passed-in target array.
#
# Takes in the following parameters:
#
# - URL of the Unicode data file to download and parse
# - Reference to target array
# - Reference to hash of property values to get code points for
#
sub parse_Unicode_data_file {
my $url = shift;
my $target = shift;
my $wanted_property_values = shift;
my $content = get_URL_content($url);
print STDERR "Parsing '$url'...";
my @lines = split /\r?\n/, $content;
for (@lines) {
s/\s*#.*//; # Strip trailing comments
s/\s+$//; # Strip trailing space
next unless (/\S/); # Skip empty lines
my ($start, $end, $property_value);
if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
# 00AA ; LATIN
$start = $end = hex $1;
$property_value = lc $2; # Property value names are case-insensitive
} elsif (/^([0-9A-F]{4,5})\.\.([0-9A-F]{4,5})\s*;\s*(.+)/i) {
# 0AE6..0AEF ; Gujarati
$start = hex $1;
$end = hex $2;
$property_value = lc $3; # Property value names are case-insensitive
} else {
next;
}
if (defined($wanted_property_values->{$property_value})) {
for my $code_point ($start..$end) {
$target->[$code_point] = 1;
}
}
}
print STDERR "done.\n";
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
sub get_URL_content {
my $url = shift;
print STDERR "Retrieving '$url'...";
my $user_agent = LWP::UserAgent->new;
my $request = HTTP::Request->new(GET => $url);
my $response = $user_agent->request($request);
unless ($response->is_success) {
print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
exit 1;
}
print STDERR "done.\n";
return $response->content;
}

View File

@ -0,0 +1,427 @@
=========
This file was generated in part (i.e. without the email addresses)
by the random text generator at:
<http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-rosalixion-word-2gram&paragraphs=20&length=200&suppress-quotes=on&no-ads=on>
=========
waist and Wintja are relearning how dJ8ngFi@avz13m.CC we spread out, but it
here before, our dimension of story. In Bed and Marys opus in the last thing
actually having difficulties moving, Spiros rises to our hidden on your
<JCAVLRJg@3aqiq2yui.gm> orders, my love: Im seven doors and with gentle
fingers, then disappears? Whats the idea <kU-l6DS@[082.015.228.189]> of
<37layCJS@j5NVP7NWAY.VG> the "%U@?\B"@Fl2d.md pages blowing to appear on Earth
in motion (what rules did we can take a radio changes. A VOICE: Hes a
scoundrel. VOICES: Burn him! Burn him! SPIROS: Want to team of the couple is
the sweetest love aH3QW@tw8uo2.eu of the teaching teaches members to
communicate with time interplaying and linked and you marry it. It will leave
Bvd#@tupjv.sn the logic of it from hereing those people were all
SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt the
artist stray? Does a few rose doom the UFO with my dear Sissy says Sissy,
holding hands up a bit of DvdUJk@61zwkit7dkd3rcq4v.BD fate falls asleep. When
an internet age is ~+Kdz@3mousnl.SE currently working with his bedside table,
and brings in a shimmering timeshifty verse vortex, the dream. Victory is
hallucination, my hand for more. Mmm my head,
C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY in five. (Spiros waves goodbye to tell
you, honeybuns: The poisoning is, but no addresses. A message identical reach
across the script. }0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM I grasp hold their
flapping wings and when theyre seemingly infallible information? Bookshrine of
a sip of defined the Great Horned Goddess of no feeling.) Meaw. FFIANA: So,
darling. Dont be dry white and teases him back
lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae in society not speaking, giggling
V85E9Hx7@vpf0bs.bz in MGBg2@7F3MJTCCPROS8YETM0B4-C9P7WXKGFB0.RU the boring
f***s! (She leaves and Him Lover, Outlanders. Plus Universe where better than
they just the land any letters in the gods. Expected, this at the threesome get
even touching myself. rsBWOCJ@lYX0SILY4L53Z3VJPSF6.pwrawr.vdpoq.nz He picks
dIyLrU@9A40T2ZIG7H8R.t63.tv up at our harem world 6dAsZKz@d33XR.IR so pop up
you will be gathered, then Wintjas hair; smells of the manuscript: Contains a
EnqCC@2bk6da6y08.LI common AQ9yV@Mfqq32nexufgxzl4o7q5jv3kd.lb universal within
this lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H web.
b6/zomNkV@8jwm-he.IN The
5FLuakz.hXVkuqDt@iBFP83V6MNI3N0FRWJ9302DS-0KHRV6O.1bf59kj64uj5b6e2zfn.cm cosmos
is filled with soap bubbles. <RhIwkU@58vmet9yfddpg.3adkmhrv1px.AO> I cant
concentrate with a nearby and he nEBk6w2Q@Bb5ib.2pay.so pours.
<AlW5CMAn@qos-53u.j91qq96d4en129szf7099kxv5lo6yo.gm> Its a wine with the joke
in the only good enough! It hit again the house. He thinks of terrorist, this
water. They were in verbatim rewritable. World by a quick eye shadow beneath
the stairway; we not easily counter weight, is filled with your own perceptions
about it. (Eve, how to talk to you really turns on its physics. The lover on
the sunflower in worship of the? (She smiles.) Greet
<QPYBDV3.Ah/h8U@x3v444pzi.1cvgokam.PW> it makes sense… Not really,
5Iwbiq7@p9s-2pixps9jwzyhfroxqivw8sv90r.xn--wgbh1c from up in the candlelight,
denser <AaFU9L@3yj1xqf1.cz9.ac> medium to say something. Shifting of that
|iCmQ1@rum6w0a7wt.3QLD.ht71.cx the eyes and there came. And now, approaching.
When the thing. What did I woke up the printers! We EhLTUjo@rEK.sJ44H0.GR shall
we are heard like a glimpse of hyperspace. It travels further and kneeled down
bHEbq3Rp@33.lKSSMY.9xaurtfle9xe.iu4810l.fj to you can walk away? FFIANA: I want
to eFcup.cPPEW@[1ae] speak. The Fountain of the background when I extract of
hers, so strange book and a royal destruction of songs of this pearl. Not often
by an incinerator vessel. Spiros, the delivery of alien exists now. Forward.
The rosy guidance of wine. Notices that is partly the pipe
p907@bk3o.fvtmw2m2.Uutr83x2yt4.2nuin.EU of the chance in Old Town. D Strange
music keeps one of the top of myth and smiles.) SPIROS: Nope, cant even
PpW2L5.QgP2n@9rz7.a5qi.oRH1Z.8ov.UZ more! says it doesnt exist! The world in
the cosmos loves us. (Spiros soon
o8UgG5fewm4vr9Ai5wPS@sgh.2F-OLKLZ81DIUET.xpya0vtx.fj here again aixQH@z-y.AR
and again he turns and blinks with you want? says Sissy looks over Wintja and
the fashions of Fit to Spiros continues. Its a situation of the barman says
Spiros. I read the river. SPIROS: Damn I said. 69
<jVTeWQfL."M#~t Q"@1e.oglq.ubk.SZ> he kept locked up into a suitcase along
her body, points a female voice of 6e5QQuy@N7.2cuw3x2wpddf.paycp1pc.AI their
part of flowers, and Marys opus IqG6Fl@[220.112.120.54] in my PROSECUTOR: Hes
<lWHH4eWSn@tbxyb7.jhzqxrk.lv> one is <P1zO*RaAr@[111.99.108.22]> unsafe at a
little <d00gy@[4TC]> secrets, we made to write: And a drink of Eternity,
Speros, <1yNINoBU@[136.003.010.238]> Mr Boore, back to me! Lovers break
Ms8ox@[_3Tuehr] the code so
<8'Hk8a@ksf7qqaa7616xw8dq80h.K6fy89c.3k-8c.g58m48v-18zh8v> recap.29 28 So,
darling. Dont leave each itself, on and devotion to all about time
<wtWDNo@1sjmcbbli196-765mt7m8o8hywft.7-ga6rsnum8v.np> has happened? ANON 4593:
What the tongue Such as she did you back and the whole moment in
<"x)yO "@7le5o2rcud5ngs.Qmfmq.Jfxv8.Zznv6t6il.MIL> your own lens, thank you
1hXd@f8.1kxqd3yw4j6zmb7l7.US arent already. It tastes them have ever come come!
The tomb. Blink to him and flips to it, but the palace. No
"8}(\$"@mu2viak0nh4sj5ivgpy1wqie.HK way$A!-(B Happily: You smell of it
all and yet sure this pool Th7XoAs5@ggdb.BI of the first of his
5iDbhah.xdtF1x@[59.55.12.243] heart j2ovALlgm2Wcwx@5jphzt.TN can take to the
wind, speak to apply perfectly, you say turn toward sexual nature and lays his
ZlaP~E.4Yk1K0F@lF6VN.M5.Nj.PRO pipe. No, landing from
cFCvIJAw@l93H0R1W6V4RI0AY7RLRQR4KOEVQPEG-PDTF03V4D9A0.xZZK5.lu the fruit will
say. Dont talk like the west 8Ju2AW@1n.h7.vu wing of the letter in every
second, <"\nkP]{"@[Vej\yo\HD]> but he slipped in. Yours Spiros and there
when I imagined anything can take returning? <fKWC?@qgcb.xn--mgbaam7a8h> Where?
With? Who? Going toward his body and kisses the notion that has joined odds. A
scattered around <L4BbaB@hv1.BIZ> slowly, moving eyes on and
WvSmV@qpx15vzmbtxzvi-syndl1.ML turns toward her. She sips some way everything
began was finished my wet Earth. Warning
"3|PX~Cbdq"@U3vp-7k.8c4q3sgpwt6sochundzhx.museum for me.-A City Different.
Let your myth LjH9rJTu@tkm.gy settles over it
<8myMO4@hOV209VZ-SHGBIH5FBYLTCQZSBW-U5-1.dv9> means to Our of a book he has
only but <vQgXEFb@maxmrbk-5a5s6o.6MZZ6IK.awjbtiva7.IL> the imagination, master
phreaker, <5ohpA3ww@dcpcotwccy> main railway station. Loses the dreamadoory in
the surprising success.) A note from round is her splendour in them? Mmm my
dear, were 6TVbIA@r50eh-a.la from them keywords. Boy,
AaASl@Bsteea.qHXE3Q5CUJ3DBG.S2hvnld.4WJWL.fk my own imagination, master
"CN;\-z 6M"@86.qc7s.23p.ET is the usual fashion, says to stream and appointed
space-time continuum. Dilutes your zX3=O3o@Yjov.7g660.8M88OJGTDC5.np sleep. Ive
been seen, he says the ringnot we proved? (On the pact. Thanateros is an
internet café where the Queen. Now cmon, lets take to raise the apartment. Like
a limousine and I kiss timelord slides his hand QFZlK1A@4W47EIXE.KY in words
now. Get us in the same time conceptualisation is to bed. STEFANDIS: Dont do
you think Ive put down the green lush. She often by God of a 15 minutes. The
others knew into the 1guLnQb07k@ab.ccemuif2s.lb you-know-what. Youre the luxury
hotel. Diamonds and receive the process of action. We wanted in the nominated
bird. The <Jddxj@[111.079.109.147]> woman undressing. He has him just get at
Hotel California. Its <Hj06gcE@[105.233.192.168]> about all devices. Playlist?
Initiating playlist. Timelock? Timelock on. We have a u8?xicQ@[i\21I] lock of
the apartment. Like a kto, part of Our superhallugram to hook up and
CczYer}W@bezu6wtys9s.lft3z.mobi outs. polish
OmpYhIL@6GJ7P29EIE-G63RDW7GLFLFC0M1.AERO fills the crowd, comes from the music
is impossible. SPIROS: F***. You are your voo goo.
<2RRPLqO@8lh0i.vm7xmvvo-r5nf0x.CY> Daysends burn deeply and will take
TOc!BhbKz@F-myy7.kQWSUI7S3.net this he thinks. For UFO from elsewhere. Bzzz!
Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "0\!P?".shQVdSerA@2qmqj8ul.hm the leg
of LTLNFsgB@[191.56.104.113] all, until it has read it is
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VGLn@z3E2.3an2.MM> Once
TWmfsxn@[112.192.017.029] Spiros under the place
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
rosebushes and the whateverend, feel her waist. She changes everything. We had
decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
come to, <Ayydpdoa@tdgypppmen.wf> what history as died. Strange, Spiros with
delight: That night "gfKP9"@jo3-r0.mz and gold case
<aTMgDW4@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
<NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
reality. The hidden set up to come. ROSE WAKINS: No answer. The
ALB4KFavj16pODdd@i206d6s.MM rosy pink cigarette.) Visit the supreme chest and
express in orgasm, my version of clouds contemplating existence, the horizon.
Best grxIt96.46nCf@nokjogh2l4.nCMWXG.yt of sheer emotion. Spiros laughs. Why
did he says Spiros. Ban him, he called for it, sir, says Spiros
Fgbh7@2rxkk0bvkk-v3evd-sh56gvhxlh.hhjcsg36j8qt98okjbdj9z574xdpix59zf6h80r.Gyb4rrxu.ve
laughs. uo0AX41@Fhlegm1z57j-qvf5.p8jo6zvm.sc Can we determined that when I am
Spiros, quoting Jim Morrison. Death. Design patterns, youll hear Spiros says.
They cant G decide if he was your key that we playing? SPIROS: Why wont xxx
would be imagined. Technology so beautiful to fill his diary; I like a match.
Puffs. The Star Eagle. And a person with a play with. sjn4cz@9ktlwkqte.bv
Faberge can change overcome your work, a large-scale coordination, Goddess say
is blasting away to end is <b04v0Ct@[243.230.224.190]> very tricky to stab it
as a turn me to the champagne on your obsession about his nose and
F!FUbQHU@uvz7cu1l.ciz4h2.93U4V.gb somewhere <6CHec@nONUKT.nl> else, then far
stretch. The great outdoors), puffing dried cum on the manuscript I… O
one knee, feeling and sex in igniting <zbmZiXw@yb.bxxp.3fm457.va> bomb. (A
housefly, Musca domestica, lands on into the device. Let me met. Wintja and
victory. <"/GdiZ7f"@[221.229.46.3]> For years in tipsy bliss. SISSY: (Nods.)
Yes. Now you witch. And we must remember, will tell you move but her
NJde8Li@f7a.g51VICBH.cy creation with gentle feet, naked on strange hovering
futuristic vehicles that when retrieved upon a thought, or reflected. The Crew
coming on our gratitude for you address then ventured into a dream, has begun,
she sees a 6IeAft@e-3fp.Nkh7nm8.v8i47xvrv27r.pf golden ball and 4 If you that,
Izz). Lapis, to the return all laugh. Applesfoods maybe, says
TC*Qopzb@xIOB3.6egz4.m-24t5wmxtmco4iy8g91o66mjgha1vjlepyffott.E5ta.p9.CF She.
Cmon I Stefandis.) Count me with a bed sheets, carrying gently away about time
you rather dramatic, which reaches across this day. It brings forth between
suns. How about the white sugar, leaves, sugardusty sugar, drinking of time.
Believe. There "_3Sc_"@[193.165.124.143] is the soul, W0dwHf@[25.174.65.80]
and only Spiros. Love you. Believe in the multi-leveledness of the 21st century
and exchanges a book called Sphinx. Alien Star qPkkP0@4k0vs.oaak2z.3JMTI.PK
initiated. NYKKEL HUMPHRY: Of Make ways over town.) SISSY: …and you can
turn slowly but not yet audible, appears, XzZh7@[\\Jm D%U] in the silver
melt together. This way of vision sees through time). Brewing with a kiss?
<66SGHzw@Oqnr82oml7jct0b8crwbstdhcgc3khxj7dj-t898mzro0p3-rvp-dythh.TN> Her
feathers: streaming water of the wind. I started interacting in a boat, on
ot4tPF@[AY\j] her e4seIFbl@cib.cg thigh as she blinks happily. Here is
<B2w025e@r2H7BW16B24DG1S5DED.bg> what you around him, Magus says the list. Its
about what that atweEde@blk-3y.mgvoh6l9my.F6.FI there is functional. We
vanished into the computer. Up hills and enable entry using his long adventure.
Do we are all detailed trip against decent behaviour and girls. And you
alright? You evil laughter: Muah! Muah! Wont wate you all uDoPcRGW@rEBD5LUT.ly
way that there <2KQhx@Bba.u--9b5bc0.NF> is either both night And our dimension
of a bad joke, says nothing, just after time. It was indeed. Now that will make
the streets. He instable? What shall do. tKWc2VjVRYD@[254.190.162.128] Who
wc3W16^@D3v2uxqqeclz.w1fd529m.DM are heard like our love. Of the stairs too,
usually through the note nearby and you go now. If I remember Njg@6S8MA.HK how
it instead. (She chews the rosy petals, frosty and the land at first part of
waking? That we "L\^4z]92"@0qp--walx.MIL like they meet you.
<X08sWFD@62GNK.tN4.f1YXX.ug> And out into the bed. From the gods have loads of
a dark winding stairs and laughs. Why doth Her devastatingly good eyesalve, to
tell it says the Rosy Dawn. Rising, rosing, the story? (For all the UFO
shimmers from around him, but we look before eK6Bz1Bu@[rX;J&036] the Eternity
we shall never go now, look, he thinks, both go for the words said. 69 people
who live in Thy honor. "~`o\: "@hO4UKF.oZBWV56B.cmn.DJ And
lcgUakx@[pjGd&i2] here and his life has tasted of becoming more clearly. He
is dead. Calculating possible meanings of it instead. BqdBTnv3c@wf35nwaza.ME
(She whispers, smiling.) Theyll be able to help. ELLILIEILIA: You are created
the visible "a#Um{:\'\bX:"@in7tjo.uw8wil.gp world, without it will see now,
says Spiros ApIbER8'@[&Y] thinks. Every time and go to write fiction. Indeed,
love something I pop, from the play? asks JTsM0c!s9CzEH@Sd.mh the taste of the
outrageous wreck of dream, born and there
hy2AOUc@uqxzl7v0hl2nchokqit9lyscxaa0jaqya1wek5gkd.NC was still result. Search
taking <pY7bAVD4r@[,>T*R T]> out into !0axBT@03-gdh1xmk3x9.GH my dear, you
know, of saint? What did come here from the Crowinshield Garden, amongst the
warm kiss. Everything is white marble statue he is tunes faberge intricate.
Spiros, a particular frequency, vbtyQBZI@20al5g.ro6ds4.Bsg15f5.NU spinning,
trying to a trail of the narrative that it while the Queen, giggling: What are
a letter with a web we could 2^ZhSK-FFYOh@Z2iku.rg.Z0ca1.gs not a
G1RLpOn."yfJpg["@mXEV8.mu peculiar yrBKNkq@a2a1.Aifn.Ta2.dj stench of history,
when appearing in the interface as well as follows the secret I am not
teleframe the room, disguised <Wok5G@b5aqobvi5.ni> as the brilliance of the
pressure of the modern world, but
nXz9i.=EL9Yj@93r8do3ntizibg1-5-a0ziw9ugyn4bo9oaw3ygrxq-eczzv1da6gj58whvmo2.rs
whatever. The solid concrete, Dp63hd@B1kbahyq.PL and put it stumbling or why
wont the chalice with communicating with language only she says Spiros,
whispers.) We left from the second birth? The young man is part of the teapot
opens. A man in disbelief.
y01rn27SFq@o0HNP8.C5.i4rvj8j338zgter7er5rkwyo5g.atnc0iuj2ke.8or6ekq0x.IO
Outwords scratch skills against her in fairy gently
<0RiEo@08mnvbu.p661ernzjz5p7nbyix5iuj.cig5hgvcc.SO> bite of death and Wintja,
playing with the name by <Dwxab5@1sx5y3-umsy72nl.74lwye5.DJ> your dreams. He
arrives <IvdZVE4xRk@0vw7ajl.AR> the information. He swallows all the f*** me
tell her wineglass and tangles. Synchronising <CvQxhXJ@d5a7qnx.ke> weeks of a
reason why everything seemed as wet dreamery, remember? Got a purple Ipomoea,
crawls through the first stage has the riddled beginning to her in a butterfly.
You landed smoothly. Preparing to n7MxA4~@[4(R] hit a world is man. How much
in <hEhF@3TV5WQ.fbkx3f> mystery. And RFGzu3hD0@wbh4.sm furthermore, what the
edge of physics, death and eOADW}BcNG@2568p3b4v.Xq3eksr.GP touched smoothly ah?
Fashion feasible technical population resulted distinct produces
AsAMWriW7.zSDQSAR6@Gg2q4rtgr.GG recognize instance the room at the garden.)
PERNELLE FLAMEL: (To Mrs She is basically very drunk. I see you
<cDCVlA0t@[20.116.229.216]> cant I walk down naked on it to bed bed into
c=yJU+3L5@n2x3xhksf.gvreani.MZ the stairway wfYnaA4@lzojy.4oii6w6sn-p9.kh and a
kiss as though the point we see the numbers, the phone set to be displayed,
disincarnate entities can feel my wifey. Spiros empties the answering evening.
That is kdeOQ5F@vD5Y.wmmv.7rswz.1zelobcp5qxxwzjn.fOEJZ.KM simply not but I
could do to the ground, and the decanter ppULqb2Z@Hv9o2ui.AO is my friends and
says: I <tOHw@[IPv6:3500:8B6C::CB5E:1.124.160.137]> see The elves of dream
telepath posts, but makes a gentle people with a redirection is generally said
Tadeja. Its over, or of ages, you excuse us walk off to Talk A never-ending
one. I remember how cute she saw the neat fuse weds sexiness. A thick paperback
book itself continuouslyposition, have heard in the noise We are presently at
the first of the death MWLVsL@7nhliy.O8mjon3rj-kb.t8d6bcpa5i.au mask there is
accurate to meet by to this important worse material in separate directions.
Spiros stands, and arrows and orange from a witch and down the mix? he feels
Wintjas 13th century. arling peach, cosmos loves playing with silver trays with
the <BN0EY@hh9v.p9bwgs.TN> language as RgiAp@d9ln.bf I still result. Search
taking time and time <PBugBo@97gcz.DJ> in time. Spiros, how else or
Fh#dKzbI@[+_] nonexistence. Eros never guarded the horse stops. Move. Stop.
Move. After earlier squads mysterious source. It inscribes in case you are
applause. The world was a. With swiftly cover <wyqU-C9hXE@wPRBUI-WS9HXE19.LV>
it as in yourself! 5 Yes, now comes from half walls of us, my love. I am your
vast operation is all worked out? O how long ago. It glimmers, node of the
voice, the middle of the introducing of utter hell on the car unlocked and mind
around midsummer and not believing in <muC?Js@[IPv6:47FB:5786:4b5e::5675]> his
lower lip. From the wind say I was inspired to live in a crime. I know, and
find people have been reported found a digital electronics. Is the pillow,
touched falls down their part of the computer and our world
<yLTT2xV@wdoszw9k1ork-z-t.kq.l3SEO.Lb4jx0.NA> come walking in
<6zqw.yPV4LkL@dA3XKC.eg> the stuff to help. Websight. Dedicated hosting
wordpress blogger coined Sister <S5z9i7i3s@Vzt6.fr> short Sissy Cogan. She
answers. It is finished his way that includes getawayways. Compiling focused is
this case? Then turn on. ANON 4593: What are pretty kinky a story about the
L|Sit6s@9cklii1.tf strangest child a Syntax of passage and Wintja and
reportedly after demolition, decay, and twists up to tales endwhere. This way
there to born from elsewhere. Bzzz! Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them that
words from sleep but no poet yWYqz@mw-9k.FJ am I woke
Knhj419mAfftf@R26hxll64.3qtdx6g.AL up in a kiss made it is heard on Midsummer
our cards like big fane beneath the secret of the <aZYHUr6@Shyn76c67.65grky.am>
criticising crowd of the gods and here to... TADEJA: (Suddenly appearing in
ZYxn6Px@di0cqhtg.hu your "#mLl"@w1sc0g3vm.j1o4o9g.GW voo goo. Daysends burn
deeply happy, for large bite of his artistic inspiration without feeling as the
season. One within the dreary WYJcFp@653xk-89oprk2im.iemhx9.CC kingdom. (She
steps up with Christine says. The Blooming of y5AXi@[Oa #] The time regularly
we are, she nZErAGj@6sq3-p.r8KQ.aero kisses the gods? I am in his brother I met
years ago. The word <OMq5sBK@udg-5zp1.Dory85.SG> is because we had. But yes
just like a while. Were not matter; W it going? Im sad to
<2bymd@Ojla1hvfpw8rrihrx.cy> where he arrives and information, and smiles
victoriously. 5OMbw0@r2d8cn75.1VR2BJ0J3A8PY.gc0mljc-h.COOP Mmm, you Rudy. And
there and day soon is phone and come <al6X^pQkx@pyj--2hp.lbet.TN> back?
Rephrase that we are good, I leave the gifts of html or center of her right to
him to where the room.) SPIROS: Okay, sure, Ill be a page is to
NkzPW4f@2-0.aaoqccwrgi4olytac0imp6vvphsuobrr115eygh2xwkvzeuj.tl put in a novel.
I want two. "4-b9|/,\e]h]2"@9-iiahsdlzv-v65j.FK Passing
<1AhBt@od77y.s9ZZP531YKW> now. I go identify what we are always win. Anyway. I
know. It is here reaching your script and toward the edge of shortcuts. We came
the Saussiepan and <g8Pv2hb9@[166.176.68.63]> its mysterious ways. I remember
"IA~".Tn03w7@[\>J?] how am waking to, that the secret about it will say the
redpurple wine, Our plan all within this moment you can hear me, I heard on the
clouds. A channel is hidden visible world, without ground turned real, their
every E6aK9TaJ@j0hydmxhkq2q.Svku4saky.MU way to a radius of
rdF2Zl1@9fsic.C17pw9o0.vn apple tree and says Spiros. Here I saw her. He walks
by the landscape of secrets of paper. I love it! But I could call the
<pCKjPa88DG&x5a@4ha07ia2jk.xk7xe8.PM> world with the manuscript I… O
nothing. Im proofreading the most dead branch in qgLb5m@nynqp.DE the screen,
then I did you can remember. qC731@["\S] (If you can it completely insane and
we had expected something our sacrament. We were back. Esc. (Shuffle.
Hallucinate a sip of grandeur, said he suddenly a tree, and ground turned out
the publisher. O about it all. Lets
<vIch1nT@[IPv6:4c2f:A840:1788:ad5:C2C6:dfae:1b1f::]> stay with us. Mooneye
today and thinks and check
GVSMpg@2YGZ1R19XTW1TIH.Re3vg30u1xq6v7cj1wf-6m14939wvgqbl.93mztd.SG the modern
world.) Sissy stands sipping redpurple wine) and you
0jq4v7PMxm@eq6teog.kO6LR3.x2p.53yltrsvgpd3.RO up to be wilds. Spiros 99% dead.
Calculating fastest and chewing she directions!
zdGLZD0P@i2JQNM8.816oja8pkk5zkvyx.KM Take my body and executed with your own
forehead, born from Egypt come back? Rephrase that what is the night. There is
here. Cant you think. And shadows Jp#hSH@74zkerax4.31kr.7c9-yuk.mp keep
dreaming of letting the elves of modern civilisation? Does that fly softly
through the surface. Of the modern world we must Kx^0oZn@oFFA-URZ13B34J.DK find
sub52@aoq7.iHF.CH them, baby. Rosy Dawn. jfVSq9oAR2D@iGU0.7bp3x.4cr.sz You have
become clear edges. And why you told our skin and
nalgU@Yfpbdcv8a5.n9kwz6kyi2u.thic-rws.af.TG places, spread on your air on her
earlier. The effects will be the song by and his eyes are gods. Expected, this
pool of illusions, that makes its golden geisha ball on Clocksmith Alley. Two
female form orbits the two chords on a god, in correct dose to see a book.
JOEL: Spiros thinks as he felt, came out out! We are switched in the matter. I
shall I can imagine the Crowinshield Garden the aeon arising, wherein he once
again. You suddenly changed. And the rose; Will you? Now listen. (She smiles.)
Greet it comes everybody. And what the room, disguised noise We are you in 3D:
you come. ROSE WAKINS: =uC5qVT@56g530cltpekrw.pt I used to read it: Barbapappa
(a gay pirate captain) <QR5&kx@7qhi3bhav5ga0eva.b0sdom.bb> and walks up again,
when you are here; working on to. 8DZQ7@dtr16r89fdw59q.cf Now join you? Im
slowly in white <Q4pNw@6o-9weojl3r7.LS> bed and language whitespace
sensitivity, readability, less punctuation, etcetera. Things had to the Dark
signal has him with gentle blood on to the ages. Stops laughing. Sharpens eyes
from the *mfOc_CN@[G\ 3] starway, Down the uniqueness of the bed
2p`tbG@c767inolrav0hg6a-ucs.y0.tw and Rop{cgBy@Wekdh0xns2um.UK giggles. Spiros
soon here for ignition of the thing Mr and fetches her t*p05lV@017y.MR you hold
their own code. Your brain and Nora in longer. Stay tuned. We
7ZxO80@Dovepwr4l.qxfzchrn1.es8ul0vavi6gqy82.K1hc7.INT must marry me? Eyeglance
is is not hear. He takes a good marijuana. And I had very fluid. It cant G
C_Iphp@5t4rtc.id decide long hair shaved like a while. I have telephones and
waited. He sits there is humanity within its authors and snaps a touch
q+m2x@Cfw.1tm52-kr.BO it candlelight tuning. Just a young man go to the
ad-section.) 47NIL@Hl68os0.66l9bsf2q.SC THE F*** UP. Spiros slowly. Lets rock
on his father and remember: the sea soothe his paternal grandfathers old days.
In to the Honey Queen, xxx 14 hristytio (Ill catch us. Compliments always. Did
you rather unnoticeably. Faster than we got this cosmos. The engineers of
terribly intricate fantasy turned semitransparent, the people have done subtly.
It is THIS bulls***? Count me Rudy... Sissy laughs. Can we are breadcrumbs
vi0LyF9O@p74jz6mxby.it on Clocksmith xQ4jU@rQVWLWAD3T8.4-lnu.AZ Your usage
<zea_0Kr@[97.59.144.249]> of <5HP1k|s@[068.150.236.123]> being a shimmering
green. 5XJZlmYk.3Du5qee@[072.023.197.244] Her feathers: streaming
<fzQlo2R.HSbkNYi@ay8a5so81x2fgkt2rv> rays Wanna take AvNrIHB0@[+n}oV] a marble
from the letter the brink of wheat from the dull ghost of the article atomrss
am I? (He hangs up "!N7/I\zhh"@[204.037.067.146] dreaming? A PEDESTRIAN: I
already told you than the world now, as vlJODxFF@xFO6V.i1.fgad6bjy.NO though he
walks off the flowers. He lifts
<qDe0FA@xpp1le82ndircjgyrxyzkrqu3il.oUKHVV6829P-16JILWG62KN.cr> his head we
passed on a hint of the worldmask of the people we dance, sweet boy, my dear,
matter of bridging millennia, I was it works, and Adam says: And the fathers
pMF64@wssq6kh9uhxk.cA2YZVBV4JW.xX585A.ru that we are in this G3meE@[^!'OO]
stuff!? The wunderdome. I saw "1@0UYJl"@vplkx.d2n.i3tcx3aaxut.lbb3v9.ldq.me
your prophethood of the ones too far! iTH0QND@wg9sizy.lr Further! Into the
planet. He sits on the Other. We came from Egypt to save our dear Sissy slid
her earlier. Ill tell me away with bright asterisms sparkling around
9kF?opSTo9rSDWLo&W&6@xrh32ibf.F0zb6kb.BJ in this young woman in the whispering
wind and hands to speak, but using his <a0FI1m@1olkdpz.W70a3w8qmk3.NA> nose.)
Nevermind. WOMAN TWO: And furthermore, what about the script, says the sun.
Large-scale thinking of a witch? Spiros hears music
<"0H}r}X(p\M`/x"@rY48LPH.Axy.Ue624.TV> and a world as well as a poem
AQL6YBFb@Hxawb15okz.y4.y5c0e.bt ever, indestructible. A newsboy hands
<PEaNVR@m8NH9BVX5L096DRM7YTR.er> Spiros gives the drawing. Looks like to the
<diI`Q@i5fpkuc.7zg2av.D6tzqq.CK> living out TCN0-Z@Tezeq9ejv.ekeab8hz14hui.il
loud from the house. He is disappearance, as I know on the centre of your
section gives rise from 05SnFh@jZ85JXZ.1RO99W5FYK3.uyv7g15.MP which it be close
now, dream once: The stars
<B2Z76Rn@9yce0shfsydxetu1v4-y.rBU2M0.6ik8oapv0zho6n653il25gu4rd216uw03.MG> are
your vGZ2K@C2osgjtel5uerwn.riihbabhh41ve84.r3l.vH6S64.vn presence. UFO. You,
Spiris, are born in Plomari. Steal back door, from his mother: Is it to live in
their doors are like, Nv2ZgL@[037.054.177.155] two weeks with
WsdI2W@i1ULFQ1.79qfph2.eg us across his way to crack matter projected by four
<vJfpTf3@Hh4x2h.25m0idq3.fr> initiated. NYKKEL HUMPHRY: Of <oRqbgftr@l6jg0.TV>
the woman casts a drop of your amulets NiynsKb@k9BTX4-FV.hc0skm-o.lv and the
morning light. Plasticity of the sun bursts can feel it, rises from lands on
w9uGwf@4hop8.Jb9655is.nr the realization of his field of the branded mania.
Spiros says a dream? Something happened. And watching the Other, she says Fast
Eddie. Bandaging the greeter info. The Eagles song by the fragrance of
Timescity Express, is there, by zero. Your star alliance. SPIROS: (Quietly,
smiling faces twitching in an envelope yellowed by It, producing open minds.
This mighty Nile dynamic magnetic strip that sticks). To Ellileilia, two
fingers with the moon undersea settling for "NVUW+"@6jbe.KM insanity! He
rises from the QusHU6JMR@0RXKIZNH76C3.Oqwcfr779e.MH end of wine ride the Logos
and the cosmos loves <}C5IwKv1S45vlmPaaVHhF@[IPv6:EBF6::]> playing with care of
myself up pitch/volume of a violin. The rosy dawn, Adam says: The transforming
magic touch the waist, working transparent, yet its not easily let us
changelings who all across Fountain Square where no telephones ring? Spiros
recently. MARY T7rXlYc@4AI1LM.2o.uk BRISCOLL: What if
uuCiDC6c@Maar3.65hlg-wf.t3pt9.FJ I w2mNOvIUh@dx3ep7ew.ru dreamed of a new
dimension of her in Wintjas direction. Word frequencies, underground river,
announced on your location. Thought b#Add@9hpopo.Xg3tbjchdpt.TT magic. The
violin kept talking to stab it was born from our own life as the dream I was
practically there I want to smalltalk about the station, and so recap.29 28 So,
darling. We are truly is. Its on Crete. On a curtain in a copy of the
<NtrgJjfj."NBwi"@[142.085.096.018]> afterlife, the grass and the lovers pot!
Transistoryness? Radiosyncromatics? Syntax of the modern world The mirror at
<00lF9UB@2NR2.rs> the day soon <MPr42ye9@p08lcrzs.4bzxfznsh2bhgsa.CX> there,
doing it will you will be disclosed, says Saussie. Become the future just
happened? Spiros picks it at the time transfer was
awwLoYLn~c2LfTEVT@fwksx.qoj94r11kw19k50k3.gd successful. Initiating first
somewhere else. Its from gRZ5w9epm@p6adico3auugj5qklec.Sm4bx5.li the
imagination, Spiros saw the words: They cant remember yet? I add to Any time
here, she says. Butterfly as a dark zfdZ67Y@1azhq.dl3xxzni2.rrj.lpclc6g4d.sl
soil run free What do you see, is the natural radiance of death reports,
<vTWwSD4fb@uBSOHD.3g.u3mb.gf> is welcomed. Layer upon layer of Thy angels are
crystal. Red <cYFVxcC6E@F9g0b.n1339r.AU> King and its my opinion. You were
back. Hows it with liquid purple. She looks at pnuXl@s1alo2.tc a man
lKy64zp.Cbg8BM@y0S.6uiux8h8.0udipt.ma on with me. Say the beginning from the
manuscript and |9FDgc@vbrz.3L.av4kmt.rs bare plot. Queen told by the redpurple
wine back where we all be rather dramatic, which they had skcHAu7@xD715N1.DZ
always <BfcgHK3@[220.136.9.224]> include Sir Nykkel Humphry, master of the
inverse confine survey the rosy guidance of her eyes on <LCOEag@Gwm.drsa0.GL> a
river here, to the latest of Sissy. He again set the old Egypt. He returns to
the looser you ready? Y Were ready. Spiros qrNZtp3vO@a0gr.8j9cvcgy0p-3.HN says
Sissy. Wintja sing: Ive put ourselves in him, he has taken a
lfW2rei20XWSmpQoPY1Dl@[(N &c] third <J761x@0IKGVUDNQ.3xpb> person. Whats it
will bring the room on the book in trees and WFBBEv|@q7R2J.oy48740.pm smiles a
pipe he enters the chat room (The church music in comic book aside
<6H6rPx@zVJ40.xgyat.cLUX6SVFJWMLF9EZ2PL8QQEU7U1WT0JW3QR8898ALFGKO18CF1DOX89DR.1tfu30mp.CA>
Rosalias Dawn, pray, Man through ytG@J4auwv4has.PS concrete. Could we? Were
taking over a
<"X;+N1A\A "@rc9cln0xyy8wa6axedojj9r0slj0v.Luy9i6ipqrz74lm5-n6f1-2srq5vdo-opef747ubdykv5hc.2lztpe.er>
hippie up the detail. Rain begins to being married to the designing of love.).
Made myself a funeral. Who are created DQTmqL4LVRUvuvoNb8=TT@2up3.PY (Is that
hyperspace at the merriest of us for that. Christofle is heard
NC0OPLz@kcru1s0mu.name him a huge and wraps if he find? He is or so much more
complex than kBoJf{XaGl@[248.166.223.221] we are heard within the
<pEjZPm8A@v956Y7GQV.5uu6.Ribgf20u.6e.0do1nki1t.ahy.6iy.sm> woman of The
<pIFWkl2@w9N0Q.MC> mirror of p=VTtlpC@w3ttqb.FO dream, born from that we are. A
VOICE:

View File

@ -0,0 +1,643 @@
http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on
http://c5-3486.bisynxu.FR/aI.YnNms/
ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R
sJ5PY.b5t6.pn/
http://Z%441S6SK7y%30K34@35j.np/RUpp%D1KnJH
[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/
file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7
http://[a42:a7b6::]/qSmxSUU4z/%52qVl4
http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3nWNXZ/P%17tp3gjATN/0ZRzs
file:///2CdsP/U2GCLT
Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj
ftp://alv0e-s.88.nJ2B34.ps/s0TgnaY?yOQUt/18CY%16IzNSQu/LaT3dD?io%80LBw%cdXDHU3/ppMyv/DbLDzyceaC/Goa%f3gn/5ebODAP0NAOD/6NkL/uP7CW/gS5TnaS
http://278phvcx21/QGOy%395L/yy5NurSi8S/gMr%553%C9q0S
z156ky.MU/.b%daGKqc/jYZkXK1WE/Abx589H6tADH
Ftp://x68qwf2j7k.nc/qyZfwo%8a/
ftp://yd.ng:40759/L1XAGIuzdMsjUIUwQ%F5/oDjgDsU/&Ze0Wz/ZeWR6cu;type=a#yDMuky
Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EKOiTmk%7c/API/0cdgpi;Type=a
FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
http://ah-2d4.ASIA/qmp
http://195.139.142.211/%53fk2%90Pj3/V75ySPv@K5ISv/eUiXDAYc#e0%59
dFU69ED1EJ0MLT.G8ef3o.bn:53301/klFVsh/YInBJE/SEIzo5EIoe3
http://[3349:5FBD::213.207.213.043]/k4PbSpylXc%92Qckx/aQfV7X0V/25RN%49ZzvavLgf/re9~I?OP=nXo&oi0mm=f0e5&KK8=9V%13&Wd0%1Ce'0qnS=CFlgRw&4%89V6AON8%53jQhwUvln=r%6edz&W=Pq+T&a%F4H%51p%d9ZIU8l=uyA8S5J%95+Wb&xi3KNa1P-Xwu=&8tCH=BwNWf+%37G16&rsyBG=MnU4S
5pn1q8q0tg.JP/%74XuKtp%F3fqLuGO/CMeC2IRRl./
http://bmm4qto-360l-pbemedo4.SA
sll-9eg.W6pv.rs/WtYGg51Pt%68/R8fsX4a
FTP://r13oym76cysnp77r5sidj8sqgxzpl3ls4xzj.JE/ta%e0PA/5Jwza65o%7D6Uno/RyO%b1B/v6C8yo5K
http://2b4ne4.5ji.oubrfdx24.UZ/%69kMsLF
tv2yy8dnp.tN8DIWG.gr/ladfwSflp/Zr3YKvt/l1QlvEc
file:///eK9K3g%47VnPYStl/GKGHYM6b%23nc
file:///LtZpL/%1CU8lVvcWrTR/
File:///yCPVGaCm/hHqFToHKZw/%29zmDPSQ6183%C8RfpdKQqkCd%51X/lyJABDQymQDL
igth-n.Mcw.ar/LjMApEho5gp825BK/afaST/HWKafQMBv/
https://l89xkmwfh-hprhz.tcay299q.2zruch0/uv/iM/
file:///6yT8LrgRZG%10HsZ/CP1zI%98gHFiT/zAx4%EB/tBv6V8kS
file:///
file:///iYHw2RpUc/9MPLbyq7gTVSx/pYnzm4E
FTP://[9198:015F::]/pU7tr7Zhgt/~cLd7w7.Gb/4MvIKc6iy%58vN/AGZ08o/uT%1e7vtcZD;type=d
ftp://0dfw3ob8y.Jri1p4f-8.NG/DpihVuu3RJ/kEKaPppvl
http://pZRLI6.ma/wAex4MoQ/jUv6Vh%5C2
file:///F8%A5Go9qV/UYzwol/#839W58%4D!
ftp://zo.dz/BSI/enk1F/XjnYRqwHBAyIYdC/rTXmyPP@Smcp:/%E9r7n
nhzbw2.qyevbi.gn/Oxbk%737lUb/OBx7/VX67/%C4fxQxvns/4fNNJ9FjR/7YeGTW/7VOLjOD4/P%89.1Forp&3/wLVBbhK/3GdjIWB
Ftp://4ie4a.fl8g3c5.wjvan5m3j.4sawo3mof.TH/wfcrCzx8%B50W24/ZxqhiPCLDP/SZbReZ4h7
Https://j3bhn0.elhqoer--c.BI/ijN66pIVKxXjOmg/xCHrfc%feFdJPd04IG
ftp://[8F7F:9507:280A:3192:EA30:EBD2:87.9.102.149]:4954/AwLZnTre/8g3Vo%6doz/Uw=dU%70nxbo
6u.vkhga15zezgvdc68uii7dh0svzopjpr3.NG/rXE/6T~KV%06Kq/iO5vG/G2S9YU
HTTP://lZSO.fr/%baWLoH/rsdViX1jMX/jKQg/aWFY%eekWu%17DTY/ASpif739Hht/hHM/oXdG6y/Es2c2Q/UVz6TevIJa
a1JQT907R.ou7o81.al/3Vp@VDZp%9c
http://g746.mhi.xtzovtn01w87au9.tc/%8Dn1XEzK/FsoFQ/xuL0wOc/YNP%53OS3/w5sIf7ox/t%22S9TxaTtK3/K%74%4EabDPe
http://92-uzyzm.pr/UwJkzP/
http://46cda.e92kuq1029.Igb3rjaqtc.Xgpak.T50lamdm4sscw1i8mq1-8.wx6wzqxd92z68sbs43l6.JO/Q7RzRWFz2/
[BD39::62:47.178.113.23]/U4woqa77Wyygc2/cltcO5Xw%EDWZT/%5Fd@GP5vV#wUMoflXqTOsj
Tw95.XN--WGBH1C/CK%fb%EF9/s%F4W7je06JY%49r/Y2L9fzlfd#fprt97Y%72
file:///xjYnAHV2/g%21ZmKfq
file:///JDyfQk8%669N~2L%ecj1/6PySMx8z%19%36/HP5GhmnNinF0p/vavqKxyBLV0a
ftp://v2WJ0E6EX.gw:46170/R1g73Yli4ts/K%09PIdRA/DntZ@
pVRN-P.ky/2UMoA1sYRpmUyd0/fEShDdCyd69Nyh6f/6zP%cevC69rdf0#XaOTpyS%73TQ
http://4u3o/BKdhwRyzG
file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
z3ymb.KM/DdnrqoBz=YtxSB
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
ftp://085.062.055.011/bopfVV/
ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs
file:///vNLDR/Q7QXgZ/6ApHTc6bN4/yihY9ZGy%3BlK
ftp://p2SJ4CE1KFC8CSRL2OY2ALA5TJOCN0FEM-W.biz:51412/
078.085.085.242/kqKkywur6Kv4Qn/-CJv6i1Nxc/
qow6.7RF9YUV12HR9CCFTWUTQRONLAM4PN82GI8E.GQ/oxUj%a6Ch2/bjjphp%34IJ/%65NQDGFab%14B%51M/QtBe
file:///pQ%8CkB8ipZ%2cyZGMf/8USgpQ%54%48e/jCflvdl%3Ec
165.195.223.067/Q3DEaK/58Z29OKkyF/fk9Vl/dKLw%7FR3Fzo1YsTPxmm/XiABg5j23J%1avyv
f1442jv.3w4cg5hy.EE/8hsz%802pLxgSlD%edIt/ESbwLYo/tdn9mrEynmJF~
[dfb9:d316:677E::2B7C]/gsORr%b7gc/?ehIX5=GTM0co5(Dmn91JN&8J=8W7wFuQfZk7sM#vYfk~Km
[11b2::35.78.41.76]/vVfZvUimVO/K9hfOd/4gZUL=j%09PGr#o%23LnBOkk9
https://oL2UQ.yLN-U053DA.bf/CfFIFwe/ZbgHFvLfbEYrStIS2h3r/pqd%14rY/aR5a8hx/aKWFJechP8DT/ypmeBjL7rcbUr
https://[3790:ad57:0B63::e5f7:f6ac:164C]/Obax;zcD/Y%48%9a/Z2xcdar
bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1
FILE:///a7kRxh8/h43TYOY6J5%31B/ZfuF%9c3/
[46C8:60FE:7ff2:79cd:69E1::221.191.034.036]/Q2MQ8mttjsMF/UqrKq0W%E6N1#YfB7A8CHYa
https://hnk6fx.2uxg1e9o.pm/I=LKn%a2n4/J&RntX3mUxZ/B1Q.Ilpk3Icq%7fZ/ia:4DLuk8pvsD/mpED3egQJfH/O0es5zrzwWQIC%21K1
ftp://133.195.101.060/U9x99/nrirgTvZnm/QLNzsm
file:///RN%7EGq55Z%D1E/U0BQ1De/o8a@zHbAMS/GOA4KUcR/uaOR6C%f1Y/u5d7
http://[f63f:096e:ee87:792d:CD31:A1B2:83FD:7322]/tnFLqVSRa5h1/%EDX1y4cxiv/GIo.OM0/M4lBr/xgHa=
file:///Td=wh:cuTxKx/4B8%dc%616s&sE/snROY6GQc
ftp://1fcu78n.COOP/eDRJd%82k8FEI/7fbDLiQncgOl
http://obp6jiork.KP/pOedzk/Lo1uNQ796m/hjLXBOr%25AB1/
file:///j3m%a5o5blRxq2/8aDBkHng/OR1ixi5h8kX/nCUz2aDz/
file:///V1tX7rM/7zk
file:///1qw4T%8BKBi3CKv/dxm6%7f8s78R/%83sF6J/K%33qfB
ftp://tyt7r.u6ier1pxipif5.BW/vSq6akPyGUI/wVJ67VXTQeuKM/yB4zYqPh/0RuHq%58G/rBTgdr5F
Ftp://4dx-s0az06e.Su7ir.SA:16277/HWkL7hR1SW/RzpkWipV/LCYQ6/gLpY%807L6/60H1z96%90xdQ/P9jx4DVu/oFa6c#gQo%57wv0vN
FTP://o--B02WG9T7-BXW-RVAJCJN1IALU9EX65WSEXCRHM.Aeh-m.cat:34416/3q9yW%53m/FJ9&U84ik9&e/R.l/ji0sjWb%5edu12nbNSW5c/YMGfLcesN
HTTP://lMxNbKW@tq1imryvi.P7g5o8np1.SK/um4Z2TESWBSrcN/fNehEdgh/sW%6fCP/b2fqBsG
http://Lgwt071.sn/HPn4x/%46zCwYZzy/wzQVoL2sT%E3Yl?974Zu=X+JuSbGjrO&Xu3Fz%a8%19%5159f0r=afHdI3%F7FNrs&Mb0hjV7d=&I43eztc=1k:3+uSz+kdJP5c+bRkUBkF
izojrse33.9WTVFAANL2Y.ly/i3ae/5%0Br%f5yL3/MsnfAk#T6,v%51Ev
ftp://[8714:3F6E:aa8:c8fc:4F41:b8ee:44.74.99.35]/790Ug0mWq/7yBPb/pzh4dTX
ftp://[ACC9::DD55:A45B:7a6b:177.179.158.116]/i1q3SzWTmO%09p%A3/FWDWq8u2Q/7
Nw2m4j4.Br9kvjf-9.3wac-fh0uk.nysyu-emjwy.cat/PGDh:oW%5F/H34QSRwe
6f9f3nny.mq/ai%cb2SZP/qfjOd2mpEH/LUZ.fxv/#3NaTgg
ftp://R1x5yr2ij24e42wlojnp1i-b2bsacd01stfe5-10m0-3z6cwb3aflzrgoo.it:8665/oFbo12T%3Bng=x/%B2FcEUXPHAP/Ni0qL%0bPN4#yhp%5dO6
http://[C794:4d71:ACD4:7AC2::30CE:B0E7]/T8igmbW%6C/DE1%1DyI457M#brpF
HTTPS://rI7HAX2OS.bsajd56xb48.FO/fn9eA4%0A/G96ogw%69SGis/1V0hqVLN6zaQC1
http://toncwiacr.0px.g7pud.MOBI/EdoW/qUMMnH
file:///LkP1%5BcrQ/bnkvBi6F/Q3IRXB7Kt8mvDZ/ZKwDAp%a3/
http://6DAK.8I6FGLS.t5YJHK9GCUVU4EB6NO513HBTWAU0XP5.GL/LDO%8CDB%82p9#
file:///%46f%c5KRhPp/skp1X/OdoS-J1foeE/5H5RIWoip
Http://180.036.254.028/VSiroQpjS
d54n.Agqa6.7e4.JOBS
https://5t33av.5u7.RU/SugrkGKg/FDf6cYm5QdHk%b3z
file:///tGHsUEMaQS/VLn1%6Au#uGnrvY
lm.27.jv4quihwsp.mw/mwCDm0cweP/A8wSZIQcZGV/uKBboAnqevGJEQT5d
ftp://6g4.qe-s9txq3o8vvr5e.5YWZGPDM9Q.820d8wtribsgglbrnkafno126s8vflph9tfmt0mwew/qC0bInpp/fqxKQLzN/hAj/6PsngV;TYPE=I
file:///aR3sSgC/GJu
w26535-k.Ut2.MS/pQP1Rx/NUKUyRSr/21x/CcgOcN4U/Jzw%C6Ft/n5Mu9X
ftp://75.22.51.21/wFDRPO/NLI1ZSecRAfFEAy/kZ4whP%C3A/
ftp://1h3yyf3d8sffjx3rsf3k2y7c459c2gx/%2FfoFDEyWygHgKAuo/KhJZkBlC5r3%99/9I8SMy/25_&y0
Ftp://215.239.176.156/tNfD%09mvdOM%28zx/fc3DTw2nf/#2kySKJ
http://Vyt.4ferfwbkbm.owtk.me/LlUtIjj/BDovC/6vJ4Wbk/ihtBt4d%acVl/ywEBIdg%3dHb/
ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
[62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
FILE:///#F9Bgl
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
http://sisas.ua/4CU60ZLK4VgY8AR89
FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH
eta0q7.2r79g.AC:34736/%abp87fVdPCY/PvO8Uk4WoLF#A*HP1A
https://w9zhko2rttzndzivll92.sbzum.UZ/bgy8l68/Ix72mHu/zlA4CI/IQjc%CD9%255FxJ8A/Dbb%4eTCRu
[2582::]/Mhm%55MWThR4Ne5mZ/xniX3IdG/
ftp://224.3.121.112/G1w1g%1DdRi/T6Eb_NegqJs
ftp://tn.z-o3vn3n4.5wg7.gs/loxilPpcLnsI/topa0Ez/Na%70Dcde
syt7m.TD/2dxrQQvBXC78/Z754hngiYcM/eM%3CaeYeXX/nmUwguwk97VGL/
http://isqogte5i.c-3oixcmy.SY/jlPVRlTs4v/enCZWc3Sl1dJ7/M5GTSZx/Ga%cce%63cLzTJvBodJ
bYIAYQ.9mlnx.OM/t1KK3u/iyQFS4EGHN3uKogL3WGG/6wn5Q5ndq8kHO%734cxgEc
Http://wvfftjk.do/a0%644z/?ATzWOxO1k=%85ulHR
http://fnoY09@bm8xcfjyfiremhz9.sr/E4Rrq2/vQjQKj9fwV6r51/mn3x8he7/W4xCQs%FBvrzb
ftp://vxfr4g5ka.kn/TZSPrYGzv/KzuB%731GA
file:///vjS%f1/ktgHPAL/=v0cZ/WTpVo1/i6XlMCkNI/kukAwc8/thWUblm/c4ICXp/f8AHkj%1C4d%9107v%44hN/
Ftp://t4qxt.hd9ok.aUQ7GIMBGXP.IS/%7ey71ndfLh/m%4A5P%75153tpU0hY73KfO6o/E%7aAkUlK3hX3Fg
FTP://gJ8MRF8UYWFW.iq/cdX7RYOqS/6E6XUh%fcdHS1%dcoDwHgpFId
http://01s0hfwz.TL/C9uEC/K9uWhknP3AxHW/%c56I1zL5Rfdd/sLJeP/2QkQNP/QcW%8aA0A/
Http://gRWSMJ90XZNPAPHL90FB.zfyopzk/hMq%1fD/A5jQ%efiH4Csr/HTFm14uSXf/jW50yvQ6Mb/EJrahj19Y9Y
http://i0.XN--MGBAAM7A8H/Uy6czi/rrAt8esL4/iL2xLka/B3j&7Inmt7g34
file:///aZcnMM/Hnr1PCn/wlTztS7SpL
http://2lv8030.fimc0v081i/cyEUoud6w/gfAlE/iQP:8/dZCue4cKVM3bs/JU%d5ZUA1t
ftp://kF0NLTJGD.HM:44827/Y6CgKRiW/4r7G/Db%bb=7xD/tE/t4ooQHdBsrw/ZvgcX/qTCarGQWa~MKW5nn8NF/dcy%1caO%b8/Di%947%2cB
ftp://4ufofbu/pmLZX%f2wJcQO/B%e0b%64oLObaEx&C/QViF1ohg/Rffvf
dYC57.CI/=G0dg
185.224.223.157/h8BdA%FEv/KLK2f%86LS/gwA4rKKHLarf/b.EyE
FTP://uhw3qgl0bvfp568.e5wkz1l.Dug75a1j.US/R%AE5DNL%C4vMl-TXG/BDSu8PXNYU42aY/MR-hx1/mC2:SJqsCN%d7#smDUT
File:///q3iMCFXfge/Bh%cdvWuy1w%E7Er/Jmmf7DkqSG%35a/VUvFz#8%510SIu
file:///G%E7R44SI/L0Xsc/c15wyz?8Bs4rN7
FTP://eQ23LB4U9CX.vcrnx.2fa.k6rjf8b.pe/8L163hbbt/J%26zcQf/lkieT5x/Efa/A2gUk/o%ef9PIBhPODaAn/p8%55Wsfap/BdTfZ4zm%2fbQt/SY7rMh
file:///7RVk/qIRRZ0b/
FILE:///Rq_/ec93s/HMB24%8esN/%4bO%cayWnOF
File://Yk7ie7.xn--80akhbyknj4f/y4e4%2a0yHu
ftp://4ps9b29prywnt6-1xt9t4cgi8sbwjj6obbw1x-2y-v2tft1eei67i.Hk0u4zwmd7o9z.jp/o4R1sdAnw/Hu408%CB/HdQ6cFhG
ftp://7efqt.LB/EIX~:Q24/b0QhE%751s%F66R7A/IFxxOD2v/uOOPv5jARBJsf
[A645:D622:eb6b:D59B::D48D:f334]/Ulld404y/IM~6P3
FILE:///%16b72yhVw/2BPPCZg/KwHAJ0X3QT/I49wMwmls2j%15xkYc6qFZ
FTP://octvv.2je8.oJRUDE.02y4htgs.es/zwVuzXoFKJ0k9
http://[3A16::]/1rhxoXw9Cv/eWk5gHpYJ/v9gRo/un2Ygo91B%A1f2p/15hJ%A5o%A19TLjzzRrGUT
iG4PTCCG.3zti905z3.ci/42j5.oKj/FZmOBY
Http://pclly.36XVKSPBC/Nja5D
148.020.113.014/ASuvNkg/Zcwt4/PjpwkEUVHbjkeKOgL/%f9hibk/NT9kSmJF%1A/5FaP@BkLf/jTre%balt
tnjbgbiparss2x-xav2mitawqn9ema07kfk6kjck.xC1U6J.hm/scUu%E5D/qZ9K%1CX.d3mWJb/-SdvwN/nFS0ZdZDNQA
http://[3173::]/YHDIJlMkv/oFpVHGs/7Dn%61pqA%23/ZnaIIPD%6cj/
http://i4f8l.sc/WuJNKVuflVGa8/%85hi4B1G/mPs/1KfX%12/WswWA%B3i1OVsF/Z;wC5kkDQ/XIOtrdBl%D9%33
https://v24gyfj.xfrc5dy6xuz3paev4rggl3xeg3vxzw7cz98pbcgum8xlczt-n.SU/Mb=PxgWX/J04ScMxk8u/oH%A08nv/3oXR85tM/
Ftp://c82a3i5u.tf/v%D5/%05QNNYI&ssnoF.
file:///MaIzEiaVY/ssIPwkItF%EBIUy
Ukg.sb/Q24uLBUl
HTTP://Aphi-iog2t.PE/SSwgnY7af/VabUxcEU2i/JI%434fkP%7cO#EWmOFU%5cy
file:///FXYZhobB0jX%5BD7PIt8H8u
Http://asn7b.LA/13Qp3t0dY/Mk0ldhZyJP/rRgIZlOu/hqt1qM9NT5tAGD07T
Http://mb2.NI/eOXXAC0MNiEvJ/ul6ydqIPg/3JhlWx21r~sH/ZemaBb7j17X
ftp://7i27:54542/B3rW/LSNLFJ%74J/%e4NHDP1svTU/Kkpr%C1%6cO/2wWp%f4MiYLhgWGSF/u0wNwK0B
ftp://f8X.cat/L7Gj-OSdF/QBrO%f3okEZ/L%bdvAyxC5
ftp://[6CA9:93a1::]/?y057O5/l9C:/XsBy2so5tX=D%71me/
file:///%33P.AyK6nB/QkN%011K/iicc3HEIE%C0/v_7Wl%fdzMCBnfC
HTTPS://zv21qs.ekofwyy.f1pd7snnae0n2nzfdclk1sf4hybx97u17piaj5-lul89bxrf775koowj.as/BAc33xOV7
ftp://ko%5BM@183.207.071.131/tq~2QxL/d%D397GnaQgKtPMOsCp7fyVobgZ/Nhnp4LAKEvQ1V/1xFn%cbR%7BVU3
https://fiuubt.bc-yrorta.kdn.M8mascygepb0csr.vpifk.G-p35wx.er/4wvko7/Wo9PsbrLI
file:///LRVqPEfRevRI/nHtsA5k4iilQ/22vu%674y
http://jX-U69Z4.3vuws.41h3q22bzs.o3hng9:6629/Qj=CQmh9/%9aCSTfa%0aXvFQ/u0zAICPSGUx/MqP32INW%00mp?ZmIZc=5o1okD&WEDMM6Qnm=0w5T&gajnp=GFwK+Ct8Pds+KRsnyPq+2UFmx+cwnDnvyn+Zf0VFXyk2+Aw67fL
file:///XRDAcY5GGmj3/WoHYehPpF7/HS9LhdHOe%9fS#!SZge2
file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1zklKhqx/HMhCSY2QcyxvL/
http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
file:///enqvF%EFLOBsZhl8h2z
ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9
ftp://o6ou6n.N8.yyld.JM:24207/aS15Vk%0eg/M8jcXu%14d/%48odaw
file:///7NToG6xM&SK=k8/wTdaPAFLzqBEJ/zHMDPj/L.fLv57c/z8QYrsKS/CEkA5FEhQXBQi
file:///UWrC%9111nEhh/45FHiTx%98L
http://35.iN13LEQV.z2d.in/%B2GBtdYtQjc4TTr/gLxjU%B3c?3m8B3t%24eK9%b8=kgc0f+ew+uux%7dOI+pbZ+H%9cS&%56mm6=rkQm+dHPh3gGj+1kC
http://nEN5ZN.EG/%0efsf4v30L
file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
file:///gVW/nnRNxPfMXKb%72Aq%4A
file:///Fzza388TQ
file:///
File:///kpiE4WSatjDV/phvv7gyfb%78b
ftp://240.154.225.198/I%39uutdECwM/PViD~qPa
td.KM/0Dkyg/B%65DiABz/wtqGd/i7%cepV%86XkA
077.102.005.039/p53%0bsPeiZaRy/nQHLsKEbNdaX/nT9H%521/Zb7H
https://Pu5aweu-29knkj3k41tw25h7xzm9pck96ey4q0gqzig27u.vLPR1Q4.vg/QANLMxa/gccQ1ekkRDr/?bXRDWO=I%0ap7%f4PB8S&t%a0Uhe1I$j$=Mm
https://J-5ytf.nmp5zuopbj1qbl1ik2c4ihjwu6-q5dhn.ng/GDtBeBZixtl/6sgw9/tmeJ7k3I1hHJfM/2JYRt7towpNjvDWsumYmhu/nBVPkzSo/cBXPb
http://HSZDX$An@ukj35.ve/9dLg7XrzV8g/hXhzX;2/Zw3KKwTP1um2/qej3miaDjj8v
http://sL333Q.Zci48xtb4g6.lu/sQw4ZHF/M%99%1DNl/s58%a2sCxGQ?EgPNZ=qaG'U2CO
file:///W%64hVsq1u9rIuZy/qO8j6EEwj/d48q1%6D/ko0ec%72/pcJo/MZQohRx
Ftp://afq57indwrb0sjhgyczyx.se/%6FKey7AOE/IPWZg3ggMIM6%D48h/XnAuzG
file:///wDwlQVR8i:0/mzefF/D3Pnkoza7Zo5iQdc/ckieGQos4JM#9rqA%DAD4
9gcwbh3vcmfa0xw-k2.MC/66TaJz%FE/SnDRWAknGcI
Ftp://%cdaTNzNPNu@w6H.V9aps/87/w@rPBGa/he%FBu4vpT
le1u.43cdu0n4.bn/Q0i6uNz/9%275%a3dAS/B%2fpPkCW
ftp://131.173.229.062/1IYcY/mJJ894/%89F%45HHRdA/eGlhL2MXm6Q/heBdvWm%3cVs%04/x3JjEB#2%2cQsgeK
rtubvdk3.PF/L4TR1g%5f6/Caov%FC3vK3ofrH/pz33aV%54
urlyuqr.ar/tzJzKM/gutrfWqv/IC%24bbmSS%02P?%24JV=zrJilQ+tH%7bh&hbO7Puq8c=K1Qt&ULqdYq=
Https://pFOROCZ9.dRDP.gq/08VkBBPja8cCXZKLa/rEF28NoX/
https://[5319:CAA9:0242:86EA:8e36:7086:B3E2:ded6]/Jq%C0P@jZ/KoNj84B5AJ=3jGk/7wdasVgHFexe4M/zgEZvK3vh
ftp://Bvc6nmpdhn21400.Vo53pvqm0/u7jz0O3bbFTTegZa
l0q.0b82ck3a.SI/EQf%a6#mhJ%0dfWnfM
http://hr58b8n.bL0/LppkKdZGYdxiHg/2VXeZWR/T4fCmyN579
http://1x6.yc6g6uw6htmwcrb10t4kwc393g29cctmtdxxz1j.KZ/G9lcwKju/UiH4E
7T6OSH.PF/zfYyqdxITCI0
https://2diizsrbfh.PK/t1zBYiDPZG8Kx:/pEN4b8xKu
HTTP://r53fl98bazbqhc19-h-r.qif.AW/8sH0%59j%FF7/QPnw69%17Og9V9l/JAn2c7i/%7Fta3x/P%08HRF/
qvpqmoa.O-0.FI/TDl%E6x1oUoACe/4VUZdMKL8Axud/JEZEF/KOR7Q7?ifYXMx@=&iI'!tR=p&k2Tv=Behew+RFW2c+w8NOK7+?BGH&:TYW.6(=H%B0Jvo9LvAy61V+YjewIUBKHe+lT543+BIss6Rz%25KTjd7+fOp-r+/PvG%fbP9kd4K02Z+IUXHyh&Lb1kab=FDdwA3_Z%81e&iiG=CVrO+1AhtbU1JSvh+Q;ay+Jb8c+%c1L%D4&m?r%0en=8S$wF&5JOA9WI=&kGJ=WjzqGX&Bew@sXE=cl4a+2S8
http://jykpqk6.sc/VBPT/xNRs7JVoZKE/
FTP://2w-y60heg64rnrmpyv43tpfhftxolu-5u.lG0BKW.LY/g%7aPAj5j/qxyE/D79g5vu/
http://Unp.IR/tN;/bCXe/fxSdK%00%CFB5N/D0L1/bjf
[cf65:1F97:24b8:652a:FB12:D0F7:181.134.252.162]/1jXwBjjxpC/0zKR6N%0bhawVF
ftp://090.247.102.174/YZgWR%A1NP/f6YUa8dEOoOk/a7%59Geq
https://Zn.RE:31587/Vam%acYZniEPiY/lBfiLn%F1/dlHe@m0#
FILE:///FojXlCuj/OQXGX/JUHCBAF/TUAe8k7O/fnh8rautFH/e6%C2xGbsfELFVW%df/JKQk/gEO%589e7uMuM/SM%7dz%0chqvt%67/dc4fnbs%F3%5e/4rLtAbS
http://247e/qBmVNrd4AstGuk/JkV%50CBmmp%06/%a5E%34TAY%E7/5WL:W%CB%193Dr=cl9rn&/mA9%651nvah%63hV
qkwlh9jp618.k-x.de/xiraBM/6zj@AcW3NA/%CBeI4RpP5nz/FiWXIm/fy6YJd/n%006lFEE/uT7%284Q;fXK/a52ToS/w6jn4ZU4r8/:B~XHaw?G.cE=osg8k3&iGJ=V4&w1vL=me4QRwj&YFgq=%22zCDTqgmKC
fjrb5z774.SA/PVZsWyA3sMJrb14P%995vIm6/dC5=Hj7?cxCp=bZ(40%15pi
ftp://pd5mz0sw.53t.sent7dh.ki/U%57Qz9g?6/6TOmiq%6F/
Http://g3t2w4.2AB0B.3eq7q.RE/fvvJYyHjd/%34FK%98WeZ/G5Ux06F2BDF/
http://7Z0-0PC.txi2srk55gs1venx.uy
https://i6.kzdyaq-v3.9j78y.oq5r.gpm7oh.x1fnc78-tli.5yu2f.3hfnkcvwoms.hWRAX7TAJ.7ei.tt/Ysy-/sRl/LZa6nw8
Iq7sp.vLK69LN.lr/hjB0EW3t5%36/lSVsKT%3CWsL-%ADA1p%0ffG/M1S;SyAVBO/EvzIxfZpicuo/dOst%DE%E1w
1lg7.sz/X@ENk92CPk/vVYJGN%act
ugk7-paad2cswwq3kd82lp9r7-i93galijy4x4.vatv4ag.va/Eww6Y1XABn/pC3%9BzjH1q:sB%89Mu/WdjiQ32H/LEaekIokSv1%E61s/Y~wQYu9v8yDqSatHO8F
http://Jmury.vc-wuwj.rn0o.ug/EhXMKL%64/CwKXyRnpk
HTTP://V7c6lvas-wtxspcp53z7o-v9dt13mpp7gc9ezt.MG/q986Xs3Fzpo5/6tQRek0/zkdJt%605DYH2j0aVfgcn
[0CFC::]/0611uPvtHJ
file:///viHNVlfm/4BICnFqFz3mXP/1%0dxeFn%AC
file:///ceic16R0Ht/b%AFXzo7oKlnID/v84LSyw/wBfvq3QVf/vuytS9wORE/tYsyN9i/msSNDC4Jt8/nPWzs35yu%ED/zvTeOit/uSVe?PyD
FTP://8GJ0QK.rQ8H0BIQZVFQQHPAWF7EVV12.LU/dLOis5Hvn/YEA%C5Z68E%50hS/Ie1Sx/
FTP://bGCO.apov3z1nrv.ke/cM4fSVF?%ff/tWLPVByl0/ABCz7EZc3/R2b7U8o9JM6p76
file:///2%f5tf%F7dSLdlRwws/qnKbcUOCCP72RTJ/WTc=Xn%B88/
FILE:///n4riCnF
ftp://mQEGW184G.Hv3zhea6.ST/iW6mhdm/G9mpZUib4loe
file:///
https://A0ea6aeynb4z3fsvnh4wg6h7.9bicz2zg2-695lf1uql14i2sjf6pqh1sae2j3k8iptes.57/jzHSQ%ebP5/%e3%9Chd/#VqMzFZrd%ddpe
6wmlp3ipb.cqi.ikf9wdku.arpa/dMq4GciIqW/aL%10jc%d5d%c4v
file:///lT?KC#nXl!iMB3hl
FTP://P9yyxqsh1rz2q-r7gp.h0W9VBZWGP.tk/gvbKQnzs/q1Gb
file:///7KTju7/x2t7Qen83hFitH
iawuqq99.AX/;aTO9WOuOPwl/UAbRoxCcv4
http://h-juvh.3gtf/spUbB%2aq/#%9C2/LWN&
vj021lv-xpcrzcaibfgk0.ad/dVYoNrxc5/NVH90Y7CCv%4E/vITM8z%C4?P9Y6IZlhse=7w1CwndaDA%79PY+r4Wm+esuV
http://%d3fV6o@knpyxaoxorjk0xthy4c56-idtz3.i91eof5.mt/MM0jI8/mviceY%E9KnCQrwqA/xTTC@R/bgzg%6CfrsDT/uN8jUqZIRPdu9a27A/aNc%f4l1h9UUax#t4W~aw
qc6iz4vjp42.9IZ.l87y.4m79dnm6i.tqhva6e.dumzoy.GG/aNgCtk310/ltjBeHJh5uJx/XMIgU=CSzwD3D/
http://p7E5E0.hhvqt56.ug/2p6%2Cb~bL/JIlK:TS/KKKGy
file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2zQxki/QRji6gHpMGgBaM/d%71A2CTpZv-kF0tD/Ig6roS8m4/~aA64OxN2yNDZ/fLLcgp%d0/He%98%b6JWoLAm/_aKE52/bcn8%06hs~If/IV9oQt%A1K
f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
file:///XoCg%EDVf/A3ibJYjU
i44X.a8H-WP.zgmnrjxq.NE/oL42aLwl/h1unIUx2m5mhir/ZjNqL;n
file:///KSPSz0d%734OBRur/v2feKz%7aC/SfV1syp
http://29SB.j6/ojVDhx/%A7e34T8%01L%41BNV?6uRxM%DFd=qg9jmHtW5R&EeR=%f9,mnV.cGVNclEM54f+efsLBpEc+3V7mIJi+Dng2-Qk9&t=VWC!+5gUmI&c4c0sX%51=%03?a3mDKm+4rHPsfb%dc
96.79.198.95/8JJUovS/
file:///.LxM7EsLzp%d2/sOKzUh/IVX5Mw-PVormR
5r.uL9CQEBDLX.bn/?3z283zb=k&q%d8u%aeOKQs=s2Ixcyjmlg&%52=Fc68M+%F9JLUS+4XTt7ypy%881+knwx%3CF+CUc1ZNLx)K8Ht&Bks=*woVYK?GE&vv=P+b+W%134Flc6+%2e2w5%cfPu%5BXUS+PAAvb+@e/E
http://ol7ctcj1x.Ugk.na/jnDQG9WhW/r1cIpcqfGNMDWto0/DfPQlP
ftp://ico390kww0.it/g&kOEETBwQ0Xnfaz/pSA4oQJ/nU1WwWgH/u9TK%34Z/x5hXHtQAb
HTTP://iEYF-043APHCKLC7PX.qB28RKI5NNRTNJJ41MVKDI53GHXIMLM.BV/QBykbXcYpFg/zgpKZ/pVe2L5cYl0X1%37bmI2D/NIdWj_%EC6VE56mu%64M1sh%bfvNe/
ftp://vb5vs.P5f5jmxq.sn:10748/gx%54N7WDo@FP%a9/aFd0z2V/6OCUikUdhs/F89CFSH6XHi9Pgt/CzM6Y3s0UZ/u8xukwK;type=d
File:///B5dOvjHOOe/oUJYD5/zgi4jw%54XPx=S4NV8R21Bo3u%d5/Mbd0rcFk/%5cPig5
FTP://ebibm0spm7.cat/aalird/1v6GldpVgXA/9akBrbVRE/FbH97%67/YfhOfgG/gPiGQb%D6?AodiI#nTfAhiF1
http://[9396:d59e:191::f7aa]/isqQk3jC/js7gnxrTJLFX/
HTTP://k5ifny.sa:32595/8XvVVW6Tp37x/IF0IkevEa9jqkw/58g3p/MZB%94sVPjmF7/wZD0BUp?N6P1o=nH:%5840TZNN%37eJ+AJXoM5t7+UhR&%3FCC(O96dC=e2Zqj-YxOMwv
2hr.p5v.6aqidmeffi.flfqfx2znf.cup605.v6ktei.mi6.AQ/ky~LSgBJ/3JZhLix/blFeDQRn
gtf7abvdn9i7cr2e.YE/-1vj3Mw/P%CEXiCFd2a9/vm
http://3rsqw6jt.cv/n5e9YJBevO5c%6e4rW%a8/iKy-raSDu/.j6BTI6/CZR%f7I=Qmfr%dd/#xTHGb9RTWP%c9H31p3
file:///S0Vmb2/JccbhGwccE=w/sgSbbJh/2OjHXikwMAVk/V1l0~FYdw
file:///5fXz1pJg/G%A6MIr2J/6gwHl%1C%55Xx/xHPZg7hEg5BzqAVzK.gM65L
File:///SxZ0jN1/C7FaB/Q63Jxn/QGzG%CEcYzLq7sWLWF/tD%3c1aukYV
file:///T8krlfICzWYr%e6/xGDI6sWJ/jCXF%87zmV6
ftp://csanc.mz:27249/Q4ci9eH/uQLFb8ZVrjYbaCS8/sNzv%8DY1Xapc
file:///P7Ub83hzju
HTTP://q6-aoovoq.j-joev5ivayrom1t474xlqxrfro.xn--wgbh1c/WiS76Kh&O/IDDo916%22Vp4/iZYdp?%66lk%24ke=&OGXRBNTxne-Rc1i9b1=b2DcK&Lyuxv=&%5bF=
file:///
2cc16zv4u31wx-edyjiy.cz/voFy:f8~/9kCAM1/1i8r969t&%53/V;exvHAKlZm5g/J85xEKDBR4yY/@%8dUYyVS%4e%3B%B2m/W5AXsrDE0i/#ivl39=VdW
https://73ll5al.MO:10068/5K%AAf0p/#5deD$x1
FILE:///a0esBQEE/
qnta8.f9284.5pvu.af/tHEFme/OOQl%E9GOt/xuKnPxLGVEf%D8#LfL
File:///Vg9klGYqV%f0f9p
[1112:D95A::f9fa:5258:6AD4:3c08]/tAHstaKl7bvDJ/Hm3zObt/qSQiJ1FD/ff6EP/YLR%71gk/Qm%98XlJqp/B5%31GicO
http://[f34d:a4fc:b932::631B:2C2E]/F8CJ0o2L5/hNITi9
http://fp8bh.zm/R5WFY9BBHOmi3/OyhE6XN/7tZGprtgW#hrKj
mAIE.mXK.qq.3WVWRXC8BASM2NX8GRC-L7O.nz/l%E8SjQ/D8iYe/2Qi&C3RMJppB%88b
https://smj0v/Z8B/%96%A4mzAT/eixQJ/v%D3HDtup
ftp://J-b0a7i1grxbx.gt/MuPMg3Ly/r2iyJo4R4opO1Xj%C6
vbhx1cl9dgl-asht.lDN0ESMI.RO/A474Sw/mcZtSSvta/ZvpyTJ/OFCSmNJ
file:///pedpH/COpc9b/gtm%d0EBmRz
[B91A:258f:095f:5755:86C9:7989:2DC3:B052]/%ecPvKuwpKpSQ9ANsta/%ac=jmcQsb48Rfo/bWIMfqk/dUQF5ms%d7/6Em91E&z78/uGC9e%53/Cleb%23zyGMVzOe/Rg4teS
Http://[725A:9A3E:2F98::9109:5272]/ijhUpBG-1FS%73%D3
gmamwxo2.0z8rwjft28enmc.p-5uyn.u6E6AXVBP.ph/gBkpM4WFysjoV/X591ak/tIRMD.t5y766HT%5EX/RSb0a/Nw
https://mxfwd.gg/uwsX4/vnVUhsd/igwlpT%bahLI4;P0
https://9g5pjef-db.Mq0tfjbmqomp84hi.rf97xmi3834.403gi.TC/sLVqu3UG4/OYh%98SQXVXf7Cp/j%deBNpZoEfAD60RV?wv%90PcN9VQR4g1=H9Q5pv&4C=aZ%a7l&B5hpDGtJ5E=%85NY
Zg2x0pwfg3xo38fwn-5rriv520uccxjuyrxov9cig.fcr1xxh8.cat/hQOVnH-6u03Wc/pqtgVxVOnlza/6I7b3Cv/8L%20%820/2GVQbVTA/FoUjDrsNT
file:///aQa%A8K1SpUF3R/DRHzEQarZC/WpL%4a~dPnH
FILE:///7TVlhAH/kRBTpgn2/HbYFSHYnrazY5Pq
FILE:///wC97%71cxvYq/%16?cNGP/
file:///u%7BQA%909Et%edmf6X/J%44H591v4iAHpgc/qeuedAPm7Moi/dE5xiL8W/%52DLIO%B1vY4h/A%1DIi3
Ftp://3ZBZ/YmeJ68Qq/%E8%74X5e%18/QNyU/
https://R@lyd1.xtccruqswon.GR/oHPO%79jfl1/rFfct/TI4I5pfjn
file://Rcpx7se8pzp4sj8ooxrlfyi.cpj--z.tl/ZQtA5b0%8F%665G/RTr%2BytU/4C.hmyu8/F1hcJ/PiHi4c%16VEN/66dIi
ftp://wDIXDXTT.vg/eCSU%14/7My9QiLZjNwKRh1/pd16vIBrmG/sXqjHnSFyE%03HA65WCMRaJGunYbT
http://[fcf7:4e45:3CD7:4B2B::]/ZbLeVZi/mjJ6/LMTBU/V4%e0nMMUsY#'aLkxlcFi5
ftp://k2.jALPBG.XN--MGBERP4A5D4AR/NyVb%E0rdacdy/KQxWB%0DFc/Ruh62/qApiRp%fcc7NqG5P/FQd6Yw8Hi
ftp://sjfzvidjcj.ae:55965/r7feW9uA/33qU0/BKlBWEwBw/w3nSd
ftp://2k5.lfssxj9iatcd3056j-rq0/Bq8-ZY8byN/Skg1r%290%40%23/X51QAJ7U/H7Ir4nHaQ8?QOW
http://ip0176.JM/LthE/E04n2pcGJV?P8=dCpb%e3q
ftp://072.017.130.122:58513/6P9dqEIAxnvathxK/GHoR0X%5F%8fU/%ffANo7hT%dcKY%dc%B3%75pXy
[3157:621E::]/CmIefnv.v91v/I%E6OmZLafDS/a7JoSqx80BC9/iSPk18UXH/g6xdyYNSlT8/o34wEX?MLP%993E=%1Fao&nRDo=6svN8+d%4Bq%30jky%75psOKb+h
FTP://zbtd.0doxocs/sDrr5d5i/%6cJnyS/5K8mb;TYPE=D
http://1vkic.cmd-efq.st/%937ikPpb/eZh_3dIzXbtNFVxL9nQ1/7bVwDiamdDs;8zgSZ
file:///YTllDP/IhzDW/%00H9e1IWG4%42%93bP/UCdd~o
ftp://ksd4b3w04c5nk5aasoepqdby-9w.sl/pNe8wJ2LkrJZ/XJSanvU/
http://oPYQ.nd-egq1mkgtuwt4ei1ax.GQ/JRpv
ftp://171.235.253.31/gop3Q%bcUoW1/38aPN?
File:///XoULHUnTn/zYp/#SlAGu
0kx1j6uf.QA/lhgydNvB/jU%B4oWUd%842;n/zo%63SywbGAgc/c2LB/wV8n/
FILE:///kcboy@/9goeE7Q
tD6HUNLHK3.u-06.FR/WwW%7f/1HS0pUTG
Http://c82m23a-5oprsol87jurs142tzex3957m9nrufva0sc6gdo3pajic8po.H5m3wt.1RU:11878/Odij%A65n/Am~mzHC/#ArdWk8
Http://cd1.es/w~Uc%455aE_/wVJKfr0/X3vnA/ImG6Z
http://5ect9i8665yca.FJ/ylKD5bCODpHQ/lbunoK/%98004LI_w/HwTFV/4@O9_DiwGb0Ig9#B8z%90jjivO
file:///IDE/mEZee3/1B5W9drK
http://wka3.GM/%95yhyVy9#FFld%0CZGoiP
file:///nAL4tAgn/UK?mpt4IE/.2JW4Ej%28uiG/LulMqnbE5
ftp://973k1fnytm6y9hx87p42k.1whc75.PS:59063/nxryc0E/ooGHQtw3ik5/6fU4vZmZNZ10If#iFXkFxd
File:///YTIL%AADxyn/exqQCc/HrBwtj3/DIOgKT4YUu
http://3ucol3f.lr77xtr.LK/FNsRpDDW=/76bEzBTI/q30mQZ/
9sb.7mct69t.ar/WpXcM8498S4F#k@L:'L
ftp://3qn.XN--P1AI/PdBsWGhCy/QSZ%06xb6atX%7eXtqSy
file:///t%48r6pvw/gTme80:slEt/ciBvu19
File:///8rjryYe
https://[887d:5086:CAA6::DA5B:192.032.127.177]/
File:///v%2CCgt3%32kh5ZJx/~kf8WDLeR3XmmY6ap/.DEZNJ-ylM
file:///KNINXVO67tBU/VWJdbMVH%a7uqRO9%ad/55Wlt5O41e?/YGhF4Fm
file:///zYYquoqz/%240zKPi/@k9J&epm2dka
7JUE8WA7CLBX6ETD8KUU16AFZHHS234NORX.tep69aqao2.int/iZjrUNXtQfBaF/Z%A87tU/XfvTnCVEY%00/FUyeI05%f4#?hZ
file:///1?Msuc%BD1/G1%33Ppp/F2Sv%0EJIBnPzEUu32/81nqxxTk1HPO/7pyYlewH7gyw
HTTPS://hdtgt38onqh18-617otg7tn-ut6f49po3gaajt47.m4O26.rwko060q21o.Am497x0kow-u.TN/nZX955o/JtBhKlvv3r
ftp://28.118.125.16/3j69z80kruR/TXIM6gQFdZTCI/T52CULszlqMQ#%C3OT__%57
ftp://y8K1P5I8E/c2Xa7CmI%d6TWC
225.022.162.113/ZF58s/%CE%56BA5rQPOLU/AUNP8rG/w8SHG%d0FVsZX8dC
X6eygmy.1a-mtt.ki/WC9%a6/GH9mNozOi
94h6rdisa-eh.CH:8242/I8Ik5%42881r/EsVYPHYT/Jw7%3A2%2778ggZ8u%60
Http://89.pa/%65ssgG1L:fKtE/PrmY6WoXW/oYH2AfHjf/uVaFyqn%ee0o%4fAh3
file:///KwM8U1%EBR6J/K.asJbs0/i1vCxd/ZthOZxt0IKQEH/#x:Q8vtaIw
http://rP6.Ewrowee5k83.COM/5CId/KVp%FE
ftp://l8AAQ4XL0X0HO6MF7.9d.tw/%98Vb%117Uy4/KyUMl9
Q293qtnuw.vi/6fi1J47ebQ/d2EC4A5OM%FF9_tUNs/dk=?YyGXS=&El=i&Go%cb=fb8&7W95=Cg49VW7B+B3dDs+f'fhi2+6QLTS%bbuJ+IN8+1PE7QyfjCX7tY%7D+cGm4+JkozC,0y+SEO%ac&V1pkpm0GF=0%46pvcEyU2G+2%F5kBuG
2pu1.mv/3uiG%445F~s/%5CTa0YXuNMsqV/AwE3d
file:///jIjyqNR/CBgOXsf%8fYiqCR/
Voiuuc65jm4ven-9li9.mii5.0h5xt6.KE/qachnQB/nsC%4ai/juYvC3yTiCp%06S8I/LLVvQY#p1jmTyx@W
Ftp://ydhhq20m.MY/%ADNIfcLl66t1fl/v4%a60h/N6My%9AKXUvToMFxY/
14.21M1I.NU/iqlGVazIWPCvV/oelkORYd3Iwsdy%0D/LcdN7U
file:///
https://07zje.j84g-9lx-673h.vwr.km/h2Dv%1BFR%9d/NV05FON%c9/klLPUVUcp/LRlEGREG3H
[836e:5fb9:0cda::D9A5]/n2j/Kjy0BzJ7Cj/GoW1ksyHG%B5A8tw;v/hIg4F;R%2Ax8nL/d1aHG5Vsb/VNMIiMx
[E69:a743:5C18:C43F:780d:FDD0:EBC8:2ce9]/uAWRrcx
ftp://B3fvr.l5GW6REKV.GI/0qT%dbwWVXZ/3kdb0/kBQuFu/R@9WXH0
Ftp://a4gdplaw.TP/zyf2c37ZfY/QaiwZ3l/CUi9.ado/
8L.vg/LjRJZ/z7/Fkg9dwmTDSp
T7wos.u6I.cJP-5HQQCA.9dutej.SG/6McEZ0
jJ0D1X6C5CCNWYGOCI4NNFC5A5NYJZTCW65DHS.d1yxpq.TC/EQ%DBYuIdBv
File:///YGxWV18/%B2bnYvE/COmzr%B0YLEB8/%75L%c5ym2Hw
HTTP://nzhfr.Mlrs1k026k.KN/~bhI#qqgVS5YR
https://z9z6ip.INT/1%1dXkN1P/KI52I/yo%FD13SoZz0?:z'X3xwoS=1y&lmDOOEVzwHn2j=xfbMj%67cy#bKedfyI1
FTP://aysc5.8i8kj7.cu/Ule%55%F0l/HV%7FNXdQfhjf0/
file:///UZg7IFvJd/U%6cAH%59cS/dQjA9gM3RIJ/cW7Kuo/lBGa1%B3Hjf2aN&/
file:///TPkfDWADgMp/9cr6zwO%38cZPtrql/w3GqL/nrvKR6Kq91#s5F4qQMjYx9
http://1co-4k.zzzqb.XN--KGBECHTV/WRGpnKFny/eBiU%BDapp/0cb5bJ5%24J8a#N*cE%e4BmH3Jse?2
n7q2q9b.3-ve593.eb368oe.si/xsA7jCLE%5CRj/gEfwCC/W21RJFHtG7td/fSZIiv/6mJkJcnid/xFjV%DF8pXhf:H/vh4Z3%efgdOJkeT6sTC/wUOxqbX
ftp://[7D66::]/m:wnkiFBKJR/7c8a3te/mQqS6ZDWbfTXtZ9
FILE:///%41PSndZFnAZNuF35izYcj9Jmt/aoJ8K6/nGtfymyBi/
008.245.185.106/0Aq3gb85/6TZk7/PVTk%b1G80
ftp://90.188.10.180/fgsPUVSAEgMuLwrpxg/8QEjGiNEHN/pxjBgdVV/bkiEKy
5yxzap84dz3lccndx3xoj0zcwepy9ujq4bk-ckyo63.si/%E89rzFXG/htVDvVdD11S/SLLVce1/%5bgcDSkD
file:///Mr
dm83f2l.vvlpnpob.7si.cr/RFT%18uMgARxsP/8%61%7cO/eZtPUg%e5FavR0XRe9wZZ?c94ub=63r5
file:///cdgSAblie
http://[5b83::58CE:d882:36F7:8b56:11D4:f42f]/9mbBwV%C4/AI2q64JsNqHO?tZ3=nATs%3CQ&lbSzuIb=/IJtfPRbcu
ftp://gOD0KB6HB8JDGK56.l-V4OW.sj/KqqiLzCu%6a3jexLbLB/%6dBHZb%29z72YF/
http://s65E1E.TR/5sj4rIdUt%CF4F
ftp://[0f52:d55d:5574:ee10::dc96]/dPEbp7/PG0Nfo/MVx3/%5Fzz8%CFXb
bdctmj.vzaax2fe.j8S2.ojfq-b1m454.g7I.uy/o0%28WV/Bv9nDwD
https://k233JLHW6N.cCA13HZAXR.laiu78y.fleptcf.brva6c.osod.GS/OB5inpGTj=gGI/YNi3_gNnIg/J8UObWz6z
ftp://enokmi/r3%690T0H5mfdRq
http://s59w.cg/nJoM7yv/Z2T9Xof0hNGhl/N0%6b5Sbrbtjj/
ftp://qytw0h.hkdt2rm.gd/3a1WJDglP%cfZ
Q-2pgsvifg.yr2ix-c4avrjwva.kn/_zD8ad/%8AVwQwOG/JMC314h/rO0qj%88?w0XEY=JUigA33U&f2=n3tXrMH74ApC&fx%BE0=b%d5mgX%7F&1gjjJpHG=vLHCZ0Z8&sYQBW%FFAIs='&zD=GTnVzkf8Yn%a3L&Xm%b9F%32EcwWl8=GUq
File:///spqq/8F2dG
1Z73HWVULIKOO5WJ.rEJGR9.nsscy.gf/rHEt;i5T/%50ZjYYJ3M%4dR/WlW0C48ocnb/NRA~0M#
078.104.235.053/8KqfxznOtxC/ycYiTG3%11zP2%A1/hhbuX9Z%d403wES6/P0gg5%94
FTP://58vs5.g0.tHI.gq/N4HSp%95jtMMNr/bpH36W/cC3oAe1C/Sp7gxd/XO7JSqE
http://e8CYICG-3GD1Z7A0V121.Ya0j.Wy.CM/BLyz1kmpRF/nb6u%52/GpXGTv19#9?bwz
File:///Mze0xLtXpPFW&x/_%0aYP7o4Fm/5&809/fsvOYyn~zvJbT
file://V-jo70zmqrppoeyva0hm6x10y.UK/#3O9f0OYdx
file:///K4BV8xTq%ccORyFI/8PzAVSZeBNFX%adT
071.247.240.193/%94VOUi%ac
27r2mghslc2b.Dwbpiqi8q.gTYSL3Z.am/RU80/KFcctLv/R8tG8d51EaD&pno5r7pDR#GWY
mdfr2j.1FZFG4.VN/Xn6l%6dLWufM/I4FHTzlnWx%7BoI/ueeKx%03mfSA/%9a3PMEt.iSdeTVFgSnLi%C84m/6dh
http://H4jk06c6mtprgjywnc40mjri05a.VA/7B%C0h%4fCjj80/TrN5HugANCZu/eMVdn4en/QUSLGhe?7yjqzvzv2r%b0I=&p%C32*HvmS%39g=wb8u&lTvA=FCGNF46U+?Ak.vpCAV%ceiK0f
file:///cVjI9Ue/siOD/jynyp9%3FmBx
http://u8ic-x8o.UY/G9pZcTp/JI58N
file:///cCOIlZV8ms/Y%e97nfvexWwxq%00/iPxdyY/snHA2QZT%10
ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO/PvL0%19MpQBv/
FILE:///Kywof5D5q/0TRS/zayrkrnENB
file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
file:///mJM%a1/jv5%53QDqE/bFMu0CBp
[a0e6::]/YR5lwpHlG5BPjr2XT/Pq%e4kWAmZ/ucI10P1
File:///8YorWt/#ToazT-v
http://2igfcm3qy.wlcgdxv-xat059qnx15a7qp-p-p5oph1c8.GP/hS4Aqy7SmODbaOH
3s81j.TJ/pS9Jzw8:NWryq/%00Kh1/Y7Rfoo7haw?pYq7Efg=
HTTP://k59s6i5o.my/v9%93qqGOWZ6RN/cdz6V4ly7nM9A/F4EhM0N2%53H/d%C4wWTDspWU/zfpMcIDWp#oO%6fSILRH
lvh-kt.TN/xZghTR/yDiD0a/P5D2%37rFa?rseH*%33ubfv3=%36ntM9MP,+97RbF5&F3Ia3L=%3djrAi%f7E2%65iQ+Uc43&y;Ikw=vdfmJW&sE_%F6xpm=XFIfCsT&k@ctNa=%47KDJKEw&d=am6K&%25!BjLNa=iqs.l
http://Lhe7w4f06qt8tif2af1k6s552hlbk.mfce.cc/DEqiQf/GLpkeKZAxhSO4m
Zy-iit.Cth-tuvx4.au/dl6DMUqP/wAeKXt6
File:///35GJ%C8m6ubg/kpI4iEEx
dbe.gkg.EDU/cJ%fbQ3k7pwp5/arlH%DCD
Ftp://e8ni0.5etxvrjvn491/tP8r:UC/faEdqs4P/v4zJax4
https://4PI.gg/fFtQoVp/b6Jf55/YEc2l7dE%CA
http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c4STIJ/CmvEGAUx9f/
file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
5.Piba4ac.JE/55M1H/AZXdj
m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD
file:///NJvRsBjo/IECCGBvb
http://8-6wji0x.tCVT41X.k1PS.15p.SH/e%daVn5b%f6/GpIJ%65e6/VpeXUmg#FRgJm0E
ftp://nx4kcydiztae7fr0y-2kfppteds.gq06u.cr/RITrTqm/VqRIYR/6psgA0%dfpfg/gcLyL1/xa%72QCL;type=i
file:///M0WBSuI2qsMuKSfOzj5S/2N7x7nZg/BLtq%72VxjcR/5%EAn1%c6TYYPGe/Lb5Mtu
http://94MNP6XNH.0mgqklz3t9g2xl89x81-a3hifmff89nahy62jeyhuhe8lhkuafizl.GQ/Ajpa4Z1D0o/aVv748s/NAIWCkWCD2hj/7MZS5c79DmL4/ieQ%21gw?oEPqIN=Pm9nPx54%c1&j1y=C
ftp://rKI.COOP/v0pdu1zj/ir2UM4X/7k04jhOKPVN/7ua%E5y8p/bl~yS
d-IJA.PS/drbtmJGFEbR0OzDD/wMV2C/krWmMUV85/0AFhGe9
[D1BF:D02E:140C:4B9F:c86e:9fdf:077.173.119.180]/A07Ox%86Oae/yhjXUMut
http://A.bi/J1GPah/OT741dJ/Jh3Z0xb3
ftp://6VMV.t680F6.ijsru3.bm/vlJmkK/go28Jr/qUtmHmqhj/ykeAVxYoe
HTTPS://oi%32Yp.@a4mk0.Teyu0lojs62d8l96qiym2v477ixatleasrgft4ttpbfel9r.BW
x37MULG.514yrp5.Vrd68eeufzt.VA/fFMWutSw0d/Gr%BFun3/JH6%DESQV8f#gn+NM2
http://2.88.82.235/6bhV%BFGDy%ABd/g84ly25/;4AeID#
https://a860jcplfoodo0yq401cdf9.1ZE2P/NLArIzMZ%8B/6UiHWMMGS79/?4N=4U%1dM0qA31&faSM=0q2RaEJu5QT+vzNMp+XR%7dI4dQ+x+%0BawIYp%dbcBiOZ*Sc
ftp://lb.NP:46239/xwyAL/m74%9fqj4gttFLg/
s086j1-9.Nowi9s.fm/16zr3s/mvzfyWbB5/&1mzA:X-3
eigz5dhw.jynsrju0t044lcc.3c3bfm.int/%ffoZ_kP%5cO1ls76B/pQbPDb4s%4E6i/bqqrZ%b7j0uhrgIHd/eBdSEwfGrX/PSmYMzg0%6F?Qr%92y11b3=&L;5CV=zJao%31Tmm
65-ihklk4j6m.f3CFA.7kj.qa9rcww7uefzkpxbf87ni28b4a1i9rjqy9a.5texnqlc9.cu/p%CDK%b1%449LH/IiLqpww/HmACJI/r46TA4
133.38.197.20/pbgvKM6W%BCEBN/Cvcu0&#idQDycc
https://4I2GL/cGtyrs/%A8m5%3fekPsTRWlB2?rn=63P,EJu+SQ1W+uPySU8pvA+%f2+m+CwuUokAVfo+3nzWcQ+S+iXvEuhcv+d$h%7fy%cfMB
HTTP://a0br.o0gvxf.kp/zZkWq5hfxy/q0x-g0In#bd%1anKx27
ftp://[1327::117.246.244.220]/%91y4%09/
ktefq.GB/uTzbgV/9nYvIs%8412/ynKYs/YwBOWmj
File:///08bP/cw3Ydr5Cyow%273h:O3Bcok/0hIP@/
[018E:4459:9892:3770:3826:71D8::]/UcHNufii29UtPW%56WQ1%20V/ybjTB/oUWWQ?yUg1%cb4A=wk+hOic7f7Sw
ftp://1o2z/4UWsX/uSzHOw3JTrqy/TqZhkQk%62gZ/FpK/
Http://kZYPZSRN.1m.UA/QN9n3Nw8kPAgkCB/SzdVcxryKou7mMG#p6at77
http://se9g.s7-5qnlmsi0npbr8ouxuey3y66swspkl.y4.st/xfP7%066uXWuOu/clIFhy
ftp://D4j9grnngs4a61b.im/f35gw%53rTeI5/#Ff7A0YMs9RG8t
https://zujspr.cr/zy14P7FG3/Oxznfe/P2zpT%38S%FFVfP95Lh/nJJgzX/kcVuHCzV?Y5vMC=3X4n%9dMqeGjM+OjgETPdf%23b1+6H%47F+waIQ&,ZxQh4G%8AZv=ic+fQWQN+0y%523JTe0Ti#OA0m6iC
http://141.171.118.17/VLnEb4Y
https://sla.aowts.MQ/KbP3AV@wXFSgz/TauvS9f2/zvGpvN.e8a2Kw1ho?jYRUP=L_IAzw&cj0ux=xz&lrA%8bS56%A9=SX7NjQ
file:///
FTP://h6.MG/XPmpsZk1h%0B
http://Dh4mlm:8000/k9TYvw/EWxlz4%97lBf9oK57N=Z#Pm63s
https://8-lno5.KM/Uco2E%dbYPx~/MzKrkZ/rDpXB7OWtD?Wb1W=bKJazR+yRD6c+qwe+H3bo2ACXXzkVX+PdfgOJ1Sqm40+X%3D)%AEgm8I9&inwrA=%FCe+%f9Xo4S+JrcmiNbPwa7P94J&fMCr;NellUf8=K&lhgC1k=%32CPUA6&%dexj,m=l
http://bske9znh5z.mq/rF739Qhneaet/NTfzZn
http://B7z94v/
FTP://p9s.hh313n.6k3.DO/xaRRXPre
File:///Sn7Qzu4cDoJY/6AdR%8ccbeeFmXy/KRXtibcbXtTaLZt-bb/PISQN%777zoI
FILE:///IfZ6yalAm/BoIjbMXLnlo
file:///kFKgAORyDOV
file:///f0l1v94Rmms/zIVjJg%338Fy/5tMPO618wd
FILE:///fpbiT?6/%0B7dUkWR5r%AErqLW/v2n%bet%b3wV8Yzi80OJ.SguK/vBMyQaKiH8/Wy3l7r/D%B8Vp%51GgmqIBUHA/9gn1:46Xok/NcNIZ/FIK%359u%57/%35NvYIQIN/
FTP://22A1D0QMF.cmcve.CC/cvkZF/H%4EkZr%39EjtfIO/LPx46D%5AgqR9
File:///0Lld-DX/&Qmx07f/Zp%21ldGQq
http://rlch.COOP/%bcKE55hwH6/CKHB%2Ak/Qzsn2Rn1p3RUc3H
http://h6d5js.edu/IO%34xTQYL/OtYPRaY5/e0ILXZt/jNP2%07otUg/vGyq3xN/DC8P4ckE/JGfiUR5EfFk/vSlxbi5dKL8d/6JwRI
FTP://Sho0e4ay9e.XN--KGBECHTV:41333/6_5S71YpwTC
file:///HrmxzTn/sozw%db8Jz/x0czCVWgklrbV1Kf@IK/Um%78PuxjtjI/
FTP://9m4b5lf0.Y5dnwnduzx9wha22ayztin-t7hng5b62e07rzsv55325xgdrzwx.gov/pmG%45dhnQZ
ftp://t2ik0rgw.krjz72-l.xn--mgbaam7a8h/I%19KxMhY/FSau72W7/WkW/vYKyDkhzNiu&Bput
FTP://[221d::]/BOKtvhabe/b%78z/piR8RBZb
Http://5zwdz3h27.q9l27mto-5v0i3i1yu8oyl.TN/wk91N/X32rxh/cmM%01iQPnCulto/
FTP://gWUFGOXE8EW.1g9vse.xn--wgbh1c/ncQo%42ihY/Tyk216/;type=d#J4A9HEH
FTP://5wudd.ga:36706/W5a2PQ/%98Oin@%D5hjD/POMMY0b/HhPA4HL;type=i
file:///E01b%6ew/8QW%66%16Un/PWDGTFrQUHJ#dk&o~V40
ftp://p78orte1aiif9.zk-l-n5drgvx2kj6i9e034ck587-utyikjhal.qE5RJ031K2FAN-35.v71jyg8l/wgwpnw5/1WPLlSc8/3RZzlIEZMlC8/ytaOFdSuPKO%72T
tri9.Fyhn.SU/YlvVjSi3M/ylMdK88iRo%d8/cuHyS5Am1oeQ/XM40zgdj/q%9CLKm9Q/IOwvLrlTi?nDUET=e95%a3qf&dSTE=X5aY&pWtb=&AS48RI=71Z91stUL8Oc&z1%B6=fVvMzZUyI+Niwre%5FXyVRF&QtAo=5
Ftp://Kroc.Ls4-tkd7.sg:58219/9tq-FJyL?Qb/e0alokGZ2/MKTHP3Wsw
pmg4ty.m59480p2f69.fV.COM/X98xZ.E/cTleUeS/9P6zeVQjfd30/eVVvE4/Zyxm1SSqe9u/WP%a5hS
6P.BD/du%F8CoA/W0jyU5x6HXyVB/EOpU%0BP%BET/TBlhd%772ObORj/PNPXkVHaEY
http://5BCY.X3.SG/N~63s98IV2/?KuYCn%3160U5h:%BCU%DD='6uk3OyUbosbcu+l7U89Ozt12K+P/VK4+GhwEZ+D7Z5ByEYxG&8=#aa7R7i~K
https://38yyrnu.UY/8Kl08k%157n9p/TEeDKN/qQnmQFd
http://5PXM48/G%9fUxcBwBjXI0/1UJen/MF%30I6/eOsMzFMiM
Http://s8AL.rc94r4iftx7qeg4cbjjv5.za/mYk9UAydyn4q@w/T7K/dd%8aIXPp
Http://130.165.027.114/o8bwef/X%70neu3uGKY/NU%f8xTKW0;hTKK/V;%edBnJYWG0MI/ZlDMtVPK7?k1N:WnR=%3DNffenC%67+sf(z0U!mZFe+6YqpF0Ei4l&kea=&pv=0FrYO&%69j0HYlx=HVIq&sWgaQHZnyxp;=%97SOx&QbgYd=72tO&ugOWlP=TaHT&Zg5o=c,2tzpy&Xr=Nltupn6k&nxkPS%10oJY%74jL8=5c%58%77#E92Lme88eh
sat8a.cc/n:G5Bs4/%92Qx7YH/%933F68jWsdw/mgMLj/b9uFtDS/fCBe=77/LYHeH
file:///8NiXGOZYq
ftp://[14A4::]/6gQ%83ppX66/Fm%0fhsGDdq86c52B2AReDTW/CGafhb/4LAIXfs6vOHd/DHtw5%A1
http://astx.i8o5jdypn1ly.LC
Ftp://7j.N@Ptavog8.gh/%FDJUUJB/nrC6%4as/AM2BxLCU:fGwm
file:///LD3OAKQVR
http://jVVR4GZ.BG/XELY1/P=cusbVv5o
HTTP://4fx.3kt642w.GF/k4Nruf/hyO_xzJ%982n/BhxTVE5LR/VT7cIG%66726zz/YQCAvC/eTYPd%2Af%18tPt6Y
ftp://1py.jhl5-h.53.39PN2C.xN.ps/Q6kM9aOm7
1MRTJ51.mh/OT
file:///RlgHP4tRuBYzCPY/
http://[8F09:703a:5b45:F653:AB26::]/C51LFNl/tS8p/yG8y53@Wb?eBrhL=%f0Rj:Vl#%11Z
FILE:///TmzdtWFH/1WP2R%b3nSKls
http://5o0a8epm-rx6n67ta82256jav-nk4.lb/HbOqUc/TIVeqJ7Ohp/BjDwRDKJ/JZO
File:///AvnO.7k/P0YrByEN2yEm9%1646/QKj7fR2/%1F0JYW0y/qscsiKGeGfPA/1rkuJyne%12/
File:///1Hm4/bcNXO0cG%45XJo4RK4/SQGEP5/ELAGqI
file://4jc3bg.zs/WfjCr2aeWME/Nv4A4B/invk2d1h
Vj1.Ngq.LI/FR2%b7RU_z%a1Tf2vy/rysXmZ0/
Ftp://wkws.yi8srfw.tm/sWvr8nVIPq3lD%16r71KGXZx/zTdcV/N%02%6ER5gChmS/uxEJA26q
Https://cf3-0aw-g8zmm-k.AO/mYGm9AqQW%E4q?6u=&rX=
8vv-rhcodmrr42jd6zmrnl7xa.F1igvm2.RO?rQOIRt=Q&Z8=1WyCZjZv83+lpB%7a
Http://009.130.112.154:65403/z6iLA6cr/%3edXQdq1/yHKzFjDA3nAKTr/Ot4A3f%4DIzccRDaDQcC
hwpmi.upmzdzzhsrz.e469.ee/SXdNeY7NHR6/Vr6%FDr
http://[C7E7:57e7:b08c:9FCD:4B77:4de1:229.020.164.172]/LnIzKLn/StXMmto
Http://2-6SB2KV8V8MV290SIC08D9J7-IRM9FTPC8ZZ.hwo9el74qqv1.zm/tr9K2BSFkbU-A8wJR/CGEL_82/cnMuBB%a3j34
file:///fUtCm%b6qNK/lltu?NvBAhM/sJ8pOm:/jJ18OTM6U%f5v%3f/
http://76OXC.pn.GA:15181/OPErhH1cHtl1ba/eIPkR6%1EG/8fVd02k/Ky%b0D5izq4k
ftp://154.108.127.0/vGpMboeazp05/usfmVeitt0pf3o/Ue4OMVT/sJ9BAYSLje
ftp://ivbv0.zCR-0J.lku/6m26/7tElM/%b2%0BI.Ft5AjDVp/oWyMVmsG/3%8E1FE8Y/0zdIl/m3otUSQeI7
file:///0Y7NWf4qwhw9wXP/6ll5YWM55W%9050rPeqawX%F9/HleEmM
5LUX-O.q-33d.tn/smzXQJn3H/81mg%4de_/jb%97hT
http://84W32/CCKpkt/c0bqCnoQ5Y
ftp://nyqaz.MT/0OfOsU7S1H9BM/OjhdD/izbR4txUY
8wo2j2c1z9s.ef2ki0mlvvnjm5vfyu.t5a-yb41uykgo5kn1qxzffhz667dty8mytg6ir7os9hoxwm2.mw/%39FEVmD/%a4qRT5W5qW.yR/8XB9NHyB/
http://rbf6ezzlhpe.hk/%0DK8/IXXJAsC?mV8vvDI8K=6t9%6EG1Dt+M7N+D5n@Vd79n%d8E+gj+ofnZ%16loobN+f3-S+e,IH&lnh=
wu3w.0J5.lv/m9IZaWkw5/xY2%54pNYS9HL/Nhfns/e%bat2cKM/cUXgRzm2Srdt/2s2u/9h8zjwh929Bnp
https://209.73.217.17/dJvsqDH/RH6Ok_eSc8wO5/BOJws6/9f0DvXJ4/?%ea'Fx=P&6h3zz3eGCtK=4MF76p7Em
jfajtdt5k6gu11la2jbih.MA/zcaTNUL/3q%31eLT%bc3S/L6v2rt/WtbA0%45~TIvPD
ftp://Defi-z.gr:16993/=7IIaMpVy3OLs/QtQD7qF5Vr/=RVbNDH8/y3oUHmX.v/Td%dcbiGlArA%720
ftp://[544f:e60a::8772:D633:DA1F:081.021.019.189]:62615/%CB6Wy1K/X%0EcoPQ/IgnCMLPynfx/fdFHb
ftp://1INQM6.4y.RO/
Http://T778hd416.g9r96v.bs:64804/GbWp%47K/zgTKs/cBHzmYZ=AI23VY
HTTPS://6hp3j2y2tuakzv1rnq9vnvn1w0j6roo3if:58975/vH8BLTu3hzkk
ftp://Ye1dfbl0eae8lqiiqaojj.JO/8EjAq0TzD:/Bz3Pm2qyWo/ZX58A2/yjn%9F3xJZjsVhw
66.242.9.138/CYHK1bGpZ/5yyVD%cbC
nHZMBEJWO.ST/ABXauli3wuJ/WUxhKaZJg
ftp://[8463:c210::b5d1]:34094/8%AC7Fc/Qh6%62yFExJbdaB/0cAZ3iSKlk8sU;TYPE=D
http://vmlyl0efotpfd-tew59kcpsi2u7qd/UbXy1Cc/L%0cwnzmdjz/?iy=N16BnPMu1+eYFk%f6CB3z+s4Re5v8+MFTU+k+JDiN_+F1k&C%D0k=F78u+euh%1E1uzTGQio&bL_2omAu=iEEs+goL%b8g6+Y%3FBcek%102&WCz=e!Fg+MUif8Yba0k+uX+A91YO,Um+%70i%818Fpz2&6fP=HlD+%91pW+%f2HR6zs8zrE10ZPH+bWA.BB6k+Df3w:X85xDnDjSiPY+AyDpuSl4VEVTJzA3g&OtUR6=
http://bCNNCLT.gxa2sbn/lAFakp
D19f.oD5.bb/xUG6W8VxTcjMG/jYMuWlVMygf/UtIwE13c/%a9wzpO%AFxQ9
q8HY2P.r5T.AU/nc0Iq%28QAF/#yOD3%b3UA%d79e%1EmJp3
dPY3X09.AC/STpa%97U%b53yKP4Te/%71KZZvIC#nA1W2z
ftp://3gb.xgjm/wF%ado0cM/u%0DmCW8L/d9Ss%61dKQ
6m.56xkyt.32O.com/ToEAr%BEdi/xBpPU2NqC/74sgdq%BD9/WSrx5/5ldupD%47J/9boeZj
ftp://s0y6r7hg7.XN--KGBECHTV/xQizIlOK9/uxho7%bd/RvxbFGQ4o/O%42UeWF?/GAZ5E8b2/eRaq/l:-1ASwSpw/2FkowF%12Ss/vtCq9dysEc%1ee/
[d18d:1707::]/NGZMInsLF8/kgC3y/F66qc1qt6OWfeS/DyngWA
file:///%55A4VpGsup
file:///WNEw%bfTWDLF/s%A9oZoWUo
Ftp://2tdk.Ube6velthhhx8o.GM/bUH4XycSEKkTE
ftp://7kxk4ujzz.kp:32621/hbop0%25sK/rw7RBE0lTN/tX5BLF
FILE:///IQExpA4kDvUfTkH6Bg/MeVJ4aIUbXCJf
file:///SIE0AkJFq/ZPJLyYK/6hA3x1InlGm1
http://047.014.184.200/Z_QdOwjzfBue4Nt/aEn/xuEQD/cXlnoxHIK%7d8h/1%eegEk7E0/8Ejku@r1Z/UZ4gG/%484zOJsP%1b/Lc1okbWRzN5UJ
Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
FILE://155.24.106.255/3VEZIT7
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=
http://[ea5::]/eIdv5xl/5qhxlOvzw%018f/N3RQQKCz/WzUnsSg8KA3/7ohHZCp
file:///g_T81EaNw2nJB/1yUUT
http://2XXY0MZ.fwa.791ck-2gx.bd/uO6FW?ZS5jE:=m:
https://[8368:F154::f99f]/Y3h8FgzTYYpzn/zHFhQECC/CGtX/8v_~jn3Kn

View File

@ -98,12 +98,4 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
}
/**
* test that acronym normalization works
*/
public void testAcronym() throws Exception {
Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
}
}

View File

@ -39,6 +39,8 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
checkOneTermReuse(a, "book", "book");
// stopword
assertAnalyzesTo(a, "the", new String[] {});
// possessive removal
checkOneTermReuse(a, "steven's", "steven");
}
/** test use of exclusion set */

View File

@ -111,7 +111,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
import org.junit.Assume;
/**
@ -39,37 +40,35 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
/*
* Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
* This is really a problem with the interaction w/ StandardTokenizer, which is used by ThaiAnalyzer.
*
* The issue is this: in StandardTokenizer the entire [:Thai:] block is specified in ALPHANUM (including punctuation, digits, etc)
* Fix is easy: refine this spec to exclude thai punctuation and digits.
*
* A better fix, one that would also fix quite a few other languages, would be to remove the thai hack.
* Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
*/
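// A sketch of the "easy fix" suggested above (hypothetical; not the grammar
// shipped in this commit). Instead of admitting the entire Thai block into
// ALPHANUM with a macro such as
//   THAI = [\u0E00-\u0E59]
// the range could be narrowed to Thai letters and combining marks only,
// along the lines of
//   THAI = [\u0E01-\u0E3A\u0E40-\u0E4E]  // excludes Thai digits and punctuation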
public void testBuggyTokenType() throws Exception {
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
/* correct testcase
public void testTokenType() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<NUM>" });
}
*/
public void testAnalyzer() throws Exception {
/**
* Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
* @deprecated testing backwards behavior
*/
@Deprecated
public void testBuggyTokenType30() throws Exception {
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
/** @deprecated testing backwards behavior */
@Deprecated
public void testAnalyzer30() throws Exception {
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(analyzer, "", new String[] {});
@ -124,6 +123,23 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesToReuse(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
}
/** @deprecated for version back compat */
@Deprecated
public void testReusableTokenStream30() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesToReuse(
analyzer,
"การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
assertAnalyzesToReuse(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
}
}

View File

@ -0,0 +1,211 @@
package org.apache.lucene.analysis.standard;
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.text.DateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Generates a file containing JFlex macros to accept valid ASCII TLDs
* (top level domains), for inclusion in JFlex grammars that can accept
* domain names.
* <p/>
* The IANA Root Zone Database is queried via HTTP from the URL given as
* cmdline arg #0; the response is parsed, and a JFlex macro accepting all
* valid ASCII-only TLDs, including punycode forms of internationalized
* TLDs, is written out to the file given as cmdline arg #1.
*/
public class GenerateJflexTLDMacros {
public static void main(String... args) throws Exception {
if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
System.err.println("Cmd line params:");
System.err.println("\tjava " + GenerateJflexTLDMacros.class.getName()
+ "<ZoneFileURL> <JFlexOutputFile>");
System.exit(1);
}
new GenerateJflexTLDMacros(args[0], args[1]).execute();
}
private static final String NL = System.getProperty("line.separator");
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2001-2005 The Apache Software Foundation." + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
private static final Pattern TLD_PATTERN_1
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
private static final Pattern TLD_PATTERN_2
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
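  // Illustrative zone-file lines (hypothetical data) that the patterns above accept:
  //   TLD_PATTERN_1:  "com.  NS  a.gtld-servers.net."
  //   TLD_PATTERN_2:  "org.  172800  IN  NS  a0.org.afilias-nst.info."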
private final URL tldFileURL;
private long tldFileLastModified = -1L;
private final File outputFile;
public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
throws Exception {
this.tldFileURL = new URL(tldFileURL);
this.outputFile = new File(outputFile);
}
/**
* Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
* writes a JFlex macro accepting any of them case-insensitively out to
* the specified output file.
*
* @throws IOException if there is a problem either downloading the database
* or writing out the output file.
*/
public void execute() throws IOException {
final SortedSet<String> TLDs = getIANARootZoneDatabase();
writeOutput(TLDs);
System.err.println("Wrote " + TLDs.size() + " top level domains to '"
+ outputFile + "'.");
}
/**
* Downloads the IANA Root Zone Database.
* @return downcased sorted set of ASCII TLDs
* @throws java.io.IOException if there is a problem downloading the database
*/
private SortedSet<String> getIANARootZoneDatabase() throws IOException {
final SortedSet<String> TLDs = new TreeSet<String>();
final URLConnection connection = tldFileURL.openConnection();
connection.setUseCaches(false);
connection.addRequestProperty("Cache-Control", "no-cache");
connection.connect();
tldFileLastModified = connection.getLastModified();
BufferedReader reader = new BufferedReader
(new InputStreamReader(connection.getInputStream(), "US-ASCII"));
try {
String line;
while (null != (line = reader.readLine())) {
Matcher matcher = TLD_PATTERN_1.matcher(line);
if (matcher.matches()) {
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
} else {
matcher = TLD_PATTERN_2.matcher(line);
if (matcher.matches()) {
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
}
}
}
} finally {
reader.close();
}
return TLDs;
}
/**
* Writes a file containing a JFlex macro that will accept any of the given
* TLDs case-insensitively.
*
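 * A sketch of the generated output (the TLDs shown are illustrative only):
 * <pre>
 * ASCIITLD = "." (
 *       [cC][oO][mM]
 *     | [oO][rR][gG]
 *     ) "."? // Accept trailing root (empty) domain
 * </pre>
 *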
* @param ASCIITLDs The downcased sorted set of top level domains to accept
* @throws IOException if there is an error writing the output file
*/
private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
final DateFormat dateFormat = DateFormat.getDateTimeInstance
(DateFormat.FULL, DateFormat.FULL, Locale.US);
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
final Writer writer = new OutputStreamWriter
(new FileOutputStream(outputFile), "UTF-8");
try {
writer.write(APACHE_LICENSE);
writer.write("// Generated from IANA Root Zone Database <");
writer.write(tldFileURL.toString());
writer.write(">");
writer.write(NL);
if (tldFileLastModified > 0L) {
writer.write("// file version from ");
writer.write(dateFormat.format(tldFileLastModified));
writer.write(NL);
}
writer.write("// generated on ");
writer.write(dateFormat.format(new Date()));
writer.write(NL);
writer.write("// by ");
writer.write(this.getClass().getName());
writer.write(NL);
writer.write(NL);
writer.write("ASCIITLD = \".\" (");
writer.write(NL);
boolean isFirst = true;
for (String ASCIITLD : ASCIITLDs) {
writer.write("\t");
if (isFirst) {
isFirst = false;
writer.write(" ");
} else {
writer.write("| ");
}
writer.write(getCaseInsensitiveRegex(ASCIITLD));
writer.write(NL);
}
writer.write("\t) \".\"? // Accept trailing root (empty) domain");
writer.write(NL);
writer.write(NL);
} finally {
writer.close();
}
}
/**
* Returns a regex that will accept the given ASCII TLD case-insensitively.
*
* @param ASCIITLD The ASCII TLD to generate a regex for
* @return a regex that will accept the given ASCII TLD case-insensitively
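 *         (for example, "com" yields "[cC][oO][mM]"; digits and '-' pass through unchanged)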
*/
private String getCaseInsensitiveRegex(String ASCIITLD) {
StringBuilder builder = new StringBuilder();
for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
char ch = ASCIITLD.charAt(pos);
if (Character.isDigit(ch) || ch == '-') {
builder.append(ch);
} else {
builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
}
}
return builder.toString();
}
}
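A minimal sketch of driving the generator; the zone-file URL and output path below are illustrative assumptions, not values fixed by this commit:

    // Hypothetical invocation, mirroring main(): download, parse, write the macro file.
    new GenerateJflexTLDMacros(
        "http://www.internic.net/zones/root.zone",  // assumed root zone URL
        "ASCIITLD.jflex-macro")                     // assumed output file
        .execute();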

@@ -44,11 +44,11 @@ import com.ibm.icu.util.ULocale;
*/
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
/** Token type for words containing ideographic characters */
public static final String WORD_IDEO = "<IDEO>";
public static final String WORD_IDEO = "<IDEOGRAPHIC>";
/** Token type for words containing Japanese kana */
public static final String WORD_KANA = "<KANA>";
/** Token type for words that contain letters */
public static final String WORD_LETTER = "<WORD>";
public static final String WORD_LETTER = "<ALPHANUM>";
/** Token type for words that appear to be numbers */
public static final String WORD_NUMBER = "<NUM>";

@@ -17,17 +17,16 @@ package org.apache.lucene.analysis.icu.segmentation;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
public class TestICUTokenizer extends BaseTokenStreamTestCase {
@@ -220,6 +219,6 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testTypes() throws Exception {
assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"david", "has", "5000", "bones"},
new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
}

@@ -0,0 +1,31 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicFilter;
/**
* @version $Id$
*/
public class ClassicFilterFactory extends BaseTokenFilterFactory {
public TokenFilter create(TokenStream input) {
return new ClassicFilter(input);
}
}

@@ -0,0 +1,40 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import java.io.Reader;
import java.util.Map;
/**
* @version $Id$
*/
public class ClassicTokenizerFactory extends BaseTokenizerFactory {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public Tokenizer create(Reader input) {
return new ClassicTokenizer(luceneMatchVersion, input);
}
}
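Bypassing Solr, the two Classic factories above boil down to the following; a sketch assuming matchVersion and reader are already in scope:

    Tokenizer tokenizer = new ClassicTokenizer(matchVersion, reader); // old StandardTokenizer behavior
    TokenStream stream  = new ClassicFilter(tokenizer);               // old StandardFilter behavior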

@@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
/** Factory for {@link EnglishPossessiveFilter} */
public class EnglishPossessiveFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new EnglishPossessiveFilter(input);
}
}
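The wrapped filter strips trailing English possessives from tokens; a minimal sketch (the input stream is assumed to be in scope):

    TokenStream stream = new EnglishPossessiveFilter(input); // "John's" -> "John"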

@@ -17,6 +17,8 @@
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
@@ -24,7 +26,13 @@ import org.apache.lucene.analysis.standard.StandardFilter;
* @version $Id$
*/
public class StandardFilterFactory extends BaseTokenFilterFactory {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public StandardFilter create(TokenStream input) {
return new StandardFilter(input);
return new StandardFilter(luceneMatchVersion, input);
}
}
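Because init() now calls assureMatchVersion(), manual use of the factory must supply a version argument; a sketch (the args map contents and version label are assumptions):

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "LUCENE_40");       // assumed version label
    StandardFilterFactory factory = new StandardFilterFactory();
    factory.init(args);                                // fails without luceneMatchVersion
    StandardFilter filter = factory.create(tokenizer); // version-aware StandardFilter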

@@ -32,22 +32,34 @@ public class TestStandardFactories extends BaseTokenTestCase {
* Test StandardTokenizerFactory
*/
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Reader reader = new StringReader("Wha\u0301t's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"Wha\u0301t's", "this", "thing", "do" });
}
/**
* Test ClassicTokenizerFactory
*/
public void testClassicTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do" });
}
/**
* Test StandardFilterFactory
* Test ClassicFilterFactory
*/
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
StandardFilterFactory filterFactory = new StandardFilterFactory();
ClassicFilterFactory filterFactory = new ClassicFilterFactory();
filterFactory.init(DEFAULT_VERSION_PARAM);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);