LUCENE-2167: Implement StandardTokenizer with the UAX#29 Standard
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002032 13f79535-47bb-0310-9956-ffa450edef68
parent c562b10b2e
commit 3c26a9167c
@@ -17,18 +17,7 @@ package org.apache.lucene.benchmark.quality;
* limitations under the License.
*/

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;

import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.benchmark.quality.QualityBenchmark;
import org.apache.lucene.benchmark.quality.trec.TrecJudge;
import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader;
import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;

@@ -36,6 +25,12 @@ import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;

/**
* Test that quality run does its job.
* <p>

@@ -177,6 +172,7 @@ public class TestQualityRun extends BenchmarkTestCase {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"analyzer=org.apache.lucene.analysis.standard.ClassicAnalyzer",
"docs.file=" + getWorkDirResourcePath("reuters.578.lines.txt.bz2"),
"content.source.log.step=2500",
"doc.term.vector=false",
@@ -9,6 +9,12 @@ API Changes

* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)

* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
  the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
  as well as tokenizing URLs and email addresses according to the relevant
  RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
  behavior. (Steven Rowe, Robert Muir, Uwe Schindler)

New Features

* LUCENE-2413: Consolidated Solr analysis components into common.
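A hedged illustration of what the new entry means in practice. The demo class, field name, and sample text below are invented; StandardAnalyzer, ClassicAnalyzer, and Version.LUCENE_31 are taken from the diffs in this commit, and no exact token output is asserted, since the TokenStream consumer contract has shifted across Lucene releases.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class Uax29ChangeDemo {
  public static void main(String[] args) throws Exception {
    String text = "Visit http://lucene.apache.org/ or mail dev@lucene.apache.org";
    // New behavior: UAX#29 word breaks, plus whole <URL>/<EMAIL> tokens.
    dump(new StandardAnalyzer(Version.LUCENE_31), text);
    // Pre-3.1 behavior, preserved under the new Classic* names.
    dump(new ClassicAnalyzer(Version.LUCENE_31), text);
  }

  static void dump(Analyzer analyzer, String text) throws Exception {
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}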
@@ -52,3 +52,8 @@ See http://project.carrot2.org/license.html.

The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.
@@ -38,7 +38,7 @@

<target name="compile-core" depends="jflex-notice, common.compile-core"/>

-<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-wiki-tokenizer"/>
+<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>

<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">

@@ -49,27 +49,61 @@
nobak="on"/>
</target>

-<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
+<target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>

-<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex"
+<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
-<jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex"
+<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>

<target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>

<target name="clean-jflex">
<delete>
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
</delete>
</target>

<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>

<target name="gen-tlds" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="."
fork="true"
failonerror="true">
<classpath>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
<arg value="${tld.zones}"/>
<arg value="${tld.output}"/>
</java>
</target>

<target name="compile-tools">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
<classpath refid="classpath"/>
</compile>
</target>
</project>
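The gen-tlds target above feeds the IANA root zone URL and an output path to GenerateJflexTLDMacros, whose implementation lives in src/tools/java and is not part of this diff. Purely as a hypothetical sketch of the idea (the class, the parsing details, and the zone-file field layout below are assumptions, not the real tool's code):

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.TreeSet;

// Hypothetical sketch only: not the actual GenerateJflexTLDMacros source.
public class TldSketch {
  public static void main(String[] args) throws Exception {
    TreeSet<String> tlds = new TreeSet<String>();
    BufferedReader in = new BufferedReader(new FileReader(args[0])); // a saved root.zone
    String line;
    while ((line = in.readLine()) != null) {
      // Root zone entries look roughly like "com.  172800  IN  NS  a.gtld-servers.net."
      String[] parts = line.split("\\s+");
      if (parts.length > 3 && "NS".equals(parts[3])) {
        tlds.add(parts[0].replaceAll("\\.$", "").toLowerCase());
      }
    }
    in.close();
    for (String tld : tlds) {
      // Emit one case-insensitive JFlex alternative per TLD, e.g. "[cC][oO][mM]".
      StringBuilder alt = new StringBuilder();
      for (char c : tld.toCharArray()) {
        if (Character.isLetter(c)) {
          alt.append('[').append(c).append(Character.toUpperCase(c)).append(']');
        } else {
          alt.append(c);
        }
      }
      System.out.println("   | " + alt);
    }
  }
}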
@@ -132,7 +132,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
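The Bulgarian hunk above is the template for the long run of near-identical hunks below: each language analyzer now threads matchVersion into StandardFilter, so the filter can choose between classic normalization and the new UAX#29 grammar. A minimal sketch of the resulting pattern (DemoAnalyzer is invented for illustration; the component classes are the ones in this commit):

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

// Hypothetical analyzer showing the common post-LUCENE-2167 chain.
public final class DemoAnalyzer extends StopwordAnalyzerBase {
  public DemoAnalyzer(Version matchVersion, Set<?> stopwords) {
    super(matchVersion, stopwords);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    // The commit's one-line change: StandardFilter now receives matchVersion.
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    return new TokenStreamComponents(source, result);
  }
}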
@@ -218,7 +218,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
-result = new StandardFilter(result);
+result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(excltable != null && !excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);

@@ -247,7 +247,7 @@ public final class CzechAnalyzer extends ReusableAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stoptable);
if (matchVersion.onOrAfter(Version.LUCENE_31)) {

@@ -120,7 +120,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -237,7 +237,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);

@@ -135,7 +135,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
-result = new StandardFilter(result);
+result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new GreekStemFilter(result);

@@ -104,6 +104,9 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
// prior to this we get the classic behavior, standardfilter does it for us.
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new EnglishPossessiveFilter(result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
@@ -0,0 +1,52 @@
package org.apache.lucene.analysis.en;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * TokenFilter that removes possessives (trailing 's) from words.
 */
public final class EnglishPossessiveFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public EnglishPossessiveFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] buffer = termAtt.buffer();
    final int bufferLength = termAtt.length();

    if (bufferLength >= 2 &&
        buffer[bufferLength-2] == '\'' &&
        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S'))
      termAtt.setLength(bufferLength - 2); // Strip last 2 characters off

    return true;
  }
}
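A usage sketch for the new filter. WhitespaceTokenizer stands in for any tokenizer here; the demo class and sample text are invented, and the Version-taking WhitespaceTokenizer constructor is assumed from the trunk API of the time.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class PossessiveDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer =
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("Bob's O'Neill's car"));
    TokenStream stream = new EnglishPossessiveFilter(tokenizer);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
      // Only a trailing 's is stripped: Bob / O'Neill / car
      System.out.println(term.toString());
    }
    stream.close();
  }
}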
@@ -120,7 +120,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -120,7 +120,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -240,7 +240,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);

@@ -120,7 +120,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -119,7 +119,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) {

@@ -120,7 +120,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -246,7 +246,7 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
Reader aReader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())

@@ -120,7 +120,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -120,7 +120,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -124,7 +124,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

@@ -175,7 +175,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-TokenStream result = new StandardFilter(source);
+TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.snowball;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.apache.lucene.analysis.util.CharArraySet;

@@ -80,7 +81,11 @@ public final class SnowballAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
-result = new StandardFilter(result);
+result = new StandardFilter(matchVersion, result);
// remove the possessive 's for english stemmers
if (matchVersion.onOrAfter(Version.LUCENE_31) &&
(name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
result = new EnglishPossessiveFilter(result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
result = new TurkishLowerCaseFilter(result);

@@ -108,7 +113,7 @@ public final class SnowballAnalyzer extends Analyzer {
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
-streams.result = new StandardFilter(streams.source);
+streams.result = new StandardFilter(matchVersion, streams.source);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
streams.result = new TurkishLowerCaseFilter(streams.result);
@@ -0,0 +1,318 @@
/*
 * Copyright 2001-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Tuesday, September 14, 2010 11:34:20 AM UTC
// generated on Wednesday, September 15, 2010 7:00:44 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

ASCIITLD = "." (
      [aA][cC]
    | [aA][dD]
    | [aA][eE]
    | [aA][eE][rR][oO]
    | [aA][fF]
    | [aA][gG]
    | [aA][iI]
    | [aA][lL]
    | [aA][mM]
    | [aA][nN]
    | [aA][oO]
    | [aA][qQ]
    | [aA][rR]
    | [aA][rR][pP][aA]
    | [aA][sS]
    | [aA][sS][iI][aA]
    | [aA][tT]
    | [aA][uU]
    | [aA][wW]
    | [aA][xX]
    | [aA][zZ]
    | [bB][aA]
    | [bB][bB]
    | [bB][dD]
    | [bB][eE]
    | [bB][fF]
    | [bB][gG]
    | [bB][hH]
    | [bB][iI]
    | [bB][iI][zZ]
    | [bB][jJ]
    | [bB][mM]
    | [bB][nN]
    | [bB][oO]
    | [bB][rR]
    | [bB][sS]
    | [bB][tT]
    | [bB][vV]
    | [bB][wW]
    | [bB][yY]
    | [bB][zZ]
    | [cC][aA]
    | [cC][aA][tT]
    | [cC][cC]
    | [cC][dD]
    | [cC][fF]
    | [cC][gG]
    | [cC][hH]
    | [cC][iI]
    | [cC][kK]
    | [cC][lL]
    | [cC][mM]
    | [cC][nN]
    | [cC][oO]
    | [cC][oO][mM]
    | [cC][oO][oO][pP]
    | [cC][rR]
    | [cC][uU]
    | [cC][vV]
    | [cC][xX]
    | [cC][yY]
    | [cC][zZ]
    | [dD][eE]
    | [dD][jJ]
    | [dD][kK]
    | [dD][mM]
    | [dD][oO]
    | [dD][zZ]
    | [eE][cC]
    | [eE][dD][uU]
    | [eE][eE]
    | [eE][gG]
    | [eE][rR]
    | [eE][sS]
    | [eE][tT]
    | [eE][uU]
    | [fF][iI]
    | [fF][jJ]
    | [fF][kK]
    | [fF][mM]
    | [fF][oO]
    | [fF][rR]
    | [gG][aA]
    | [gG][bB]
    | [gG][dD]
    | [gG][eE]
    | [gG][fF]
    | [gG][gG]
    | [gG][hH]
    | [gG][iI]
    | [gG][lL]
    | [gG][mM]
    | [gG][nN]
    | [gG][oO][vV]
    | [gG][pP]
    | [gG][qQ]
    | [gG][rR]
    | [gG][sS]
    | [gG][tT]
    | [gG][uU]
    | [gG][wW]
    | [gG][yY]
    | [hH][kK]
    | [hH][mM]
    | [hH][nN]
    | [hH][rR]
    | [hH][tT]
    | [hH][uU]
    | [iI][dD]
    | [iI][eE]
    | [iI][lL]
    | [iI][mM]
    | [iI][nN]
    | [iI][nN][fF][oO]
    | [iI][nN][tT]
    | [iI][oO]
    | [iI][qQ]
    | [iI][rR]
    | [iI][sS]
    | [iI][tT]
    | [jJ][eE]
    | [jJ][mM]
    | [jJ][oO]
    | [jJ][oO][bB][sS]
    | [jJ][pP]
    | [kK][eE]
    | [kK][gG]
    | [kK][hH]
    | [kK][iI]
    | [kK][mM]
    | [kK][nN]
    | [kK][pP]
    | [kK][rR]
    | [kK][wW]
    | [kK][yY]
    | [kK][zZ]
    | [lL][aA]
    | [lL][bB]
    | [lL][cC]
    | [lL][iI]
    | [lL][kK]
    | [lL][rR]
    | [lL][sS]
    | [lL][tT]
    | [lL][uU]
    | [lL][vV]
    | [lL][yY]
    | [mM][aA]
    | [mM][cC]
    | [mM][dD]
    | [mM][eE]
    | [mM][gG]
    | [mM][hH]
    | [mM][iI][lL]
    | [mM][kK]
    | [mM][lL]
    | [mM][mM]
    | [mM][nN]
    | [mM][oO]
    | [mM][oO][bB][iI]
    | [mM][pP]
    | [mM][qQ]
    | [mM][rR]
    | [mM][sS]
    | [mM][tT]
    | [mM][uU]
    | [mM][uU][sS][eE][uU][mM]
    | [mM][vV]
    | [mM][wW]
    | [mM][xX]
    | [mM][yY]
    | [mM][zZ]
    | [nN][aA]
    | [nN][aA][mM][eE]
    | [nN][cC]
    | [nN][eE]
    | [nN][eE][tT]
    | [nN][fF]
    | [nN][gG]
    | [nN][iI]
    | [nN][lL]
    | [nN][oO]
    | [nN][pP]
    | [nN][rR]
    | [nN][uU]
    | [nN][zZ]
    | [oO][mM]
    | [oO][rR][gG]
    | [pP][aA]
    | [pP][eE]
    | [pP][fF]
    | [pP][gG]
    | [pP][hH]
    | [pP][kK]
    | [pP][lL]
    | [pP][mM]
    | [pP][nN]
    | [pP][rR]
    | [pP][rR][oO]
    | [pP][sS]
    | [pP][tT]
    | [pP][wW]
    | [pP][yY]
    | [qQ][aA]
    | [rR][eE]
    | [rR][oO]
    | [rR][sS]
    | [rR][uU]
    | [rR][wW]
    | [sS][aA]
    | [sS][bB]
    | [sS][cC]
    | [sS][dD]
    | [sS][eE]
    | [sS][gG]
    | [sS][hH]
    | [sS][iI]
    | [sS][jJ]
    | [sS][kK]
    | [sS][lL]
    | [sS][mM]
    | [sS][nN]
    | [sS][oO]
    | [sS][rR]
    | [sS][tT]
    | [sS][uU]
    | [sS][vV]
    | [sS][yY]
    | [sS][zZ]
    | [tT][cC]
    | [tT][dD]
    | [tT][eE][lL]
    | [tT][fF]
    | [tT][gG]
    | [tT][hH]
    | [tT][jJ]
    | [tT][kK]
    | [tT][lL]
    | [tT][mM]
    | [tT][nN]
    | [tT][oO]
    | [tT][pP]
    | [tT][rR]
    | [tT][rR][aA][vV][eE][lL]
    | [tT][tT]
    | [tT][vV]
    | [tT][wW]
    | [tT][zZ]
    | [uU][aA]
    | [uU][gG]
    | [uU][kK]
    | [uU][sS]
    | [uU][yY]
    | [uU][zZ]
    | [vV][aA]
    | [vV][cC]
    | [vV][eE]
    | [vV][gG]
    | [vV][iI]
    | [vV][nN]
    | [vV][uU]
    | [wW][fF]
    | [wW][sS]
    | [xX][nN]--0[zZ][wW][mM]56[dD]
    | [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
    | [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
    | [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
    | [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
    | [xX][nN]--[fF][iI][qQ][sS]8[sS]
    | [xX][nN]--[fF][iI][qQ][zZ]9[sS]
    | [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
    | [xX][nN]--[gG]6[wW]251[dD]
    | [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
    | [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
    | [xX][nN]--[jJ]6[wW]193[gG]
    | [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
    | [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
    | [xX][nN]--[kK][pP][rR][wW]13[dD]
    | [xX][nN]--[kK][pP][rR][yY]57[dD]
    | [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
    | [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
    | [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
    | [xX][nN]--[oO]3[cC][wW]4[hH]
    | [xX][nN]--[pP]1[aA][iI]
    | [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
    | [xX][nN]--[wW][gG][bB][hH]1[cC]
    | [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
    | [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
    | [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
    | [yY][eE]
    | [yY][tT]
    | [zZ][aA]
    | [zZ][mM]
    | [zZ][wW]
    ) "."? // Accept trailing root (empty) domain
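Each alternative above spells out case-insensitivity one character pair at a time because the grammar does not rely on a global case-insensitive flag. The same idea in plain Java regex form, purely as a hypothetical sketch listing only a handful of the TLDs enumerated above:

import java.util.regex.Pattern;

public class TldMatchDemo {
  // A few entries only; the generated macro enumerates every IANA TLD.
  private static final Pattern ASCII_TLD =
      Pattern.compile("\\.(?:ac|aero|arpa|com|org)\\.?", Pattern.CASE_INSENSITIVE);

  public static void main(String[] args) {
    System.out.println(ASCII_TLD.matcher(".COM").matches());   // true
    System.out.println(ASCII_TLD.matcher(".arpa.").matches()); // true: trailing root dot accepted
  }
}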
@@ -0,0 +1,140 @@
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating ClassicAnalyzer:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 *
 * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
 * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
 * as specified by UAX#29.
 */
public final class ClassicAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
   */
  private final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
  useful for searching. */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match See {@link
   * <a href="#version">above</a>}
   * @param stopWords stop words */
  public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  }

  /** Builds an analyzer with the default stop words ({@link
   * #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match See {@link
   * <a href="#version">above</a>}
   */
  public ClassicAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(File)
   * @param matchVersion Lucene version to match See {@link
   * <a href="#version">above</a>}
   * @param stopwords File to read stop words from */
  public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader)
   * @param matchVersion Lucene version to match See {@link
   * <a href="#version">above</a>}
   * @param stopwords Reader to read stop words from */
  public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream or
   * reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    src.setReplaceInvalidAcronym(replaceInvalidAcronym);
    TokenStream tok = new ClassicFilter(src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}
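A construction sketch for the new class. The demo class, field name, and sample text are invented; no exact token output is asserted, since stopword filtering and the classic grammar decide what comes out.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ClassicAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    ClassicAnalyzer analyzer = new ClassicAnalyzer(Version.LUCENE_31); // default English stopwords
    analyzer.setMaxTokenLength(255); // the default; longer tokens are discarded
    TokenStream ts = analyzer.tokenStream("body", new StringReader("XY&Z Corp's I.B.M. deal"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.close();
  }
}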
@@ -0,0 +1,73 @@
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/** Normalizes tokens extracted with {@link ClassicTokenizer}. */

public class ClassicFilter extends TokenFilter {

  /** Construct filtering <i>in</i>. */
  public ClassicFilter(TokenStream in) {
    super(in);
  }

  private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
  private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];

  // this filters uses attribute type
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** Returns the next token in the stream, or null at EOS.
   * <p>Removes <tt>'s</tt> from the end of words.
   * <p>Removes dots from acronyms.
   */
  @Override
  public final boolean incrementToken() throws java.io.IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] buffer = termAtt.buffer();
    final int bufferLength = termAtt.length();
    final String type = typeAtt.type();

    if (type == APOSTROPHE_TYPE && // remove 's
        bufferLength >= 2 &&
        buffer[bufferLength-2] == '\'' &&
        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
      // Strip last 2 characters off
      termAtt.setLength(bufferLength - 2);
    } else if (type == ACRONYM_TYPE) { // remove dots
      int upto = 0;
      for(int i=0;i<bufferLength;i++) {
        char c = buffer[i];
        if (c != '.')
          buffer[upto++] = c;
      }
      termAtt.setLength(upto);
    }

    return true;
  }
}
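A sketch of the filter's two normalizations, assuming the classic grammar types the inputs as <ACRONYM> and <APOSTROPHE>. The demo class and sample text are invented.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ClassicFilterDemo {
  public static void main(String[] args) throws Exception {
    ClassicTokenizer tokenizer =
        new ClassicTokenizer(Version.LUCENE_31, new StringReader("I.B.M. Bob's"));
    TokenStream stream = new ClassicFilter(tokenizer); // strips acronym dots and trailing 's
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // expected: IBM, then Bob
    }
    stream.close();
  }
}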
@@ -0,0 +1,234 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.standard;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/** A grammar-based tokenizer constructed with JFlex
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *       dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *       the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
 * <p>Many applications have specific tokenizer needs. If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating ClassicTokenizer:
 * <ul>
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 *
 * ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
 * As of 3.1, {@link StandardTokenizer} implements Unicode text segmentation,
 * as specified by UAX#29.
 */

public final class ClassicTokenizer extends Tokenizer {
  /** A private instance of the JFlex-constructed scanner */
  private StandardTokenizerInterface scanner;

  public static final int ALPHANUM = 0;
  public static final int APOSTROPHE = 1;
  public static final int ACRONYM = 2;
  public static final int COMPANY = 3;
  public static final int EMAIL = 4;
  public static final int HOST = 5;
  public static final int NUM = 6;
  public static final int CJ = 7;

  /**
   * @deprecated this solves a bug where HOSTs that end with '.' are identified
   * as ACRONYMs.
   */
  @Deprecated
  public static final int ACRONYM_DEP = 8;

  /** String token types that correspond to token type int constants */
  public static final String [] TOKEN_TYPES = new String [] {
    "<ALPHANUM>",
    "<APOSTROPHE>",
    "<ACRONYM>",
    "<COMPANY>",
    "<EMAIL>",
    "<HOST>",
    "<NUM>",
    "<CJ>",
    "<ACRONYM_DEP>"
  };

  private boolean replaceInvalidAcronym;

  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

  /** Set the max allowed token length. Any token longer
   * than this is skipped. */
  public void setMaxTokenLength(int length) {
    this.maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  /**
   * Creates a new instance of the {@link ClassicTokenizer}. Attaches
   * the <code>input</code> to the newly created JFlex scanner.
   *
   * @param input The input reader
   *
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
  public ClassicTokenizer(Version matchVersion, Reader input) {
    super();
    init(input, matchVersion);
  }

  /**
   * Creates a new ClassicTokenizer with a given {@link AttributeSource}.
   */
  public ClassicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
    super(source);
    init(input, matchVersion);
  }

  /**
   * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
   */
  public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(factory);
    init(input, matchVersion);
  }

  private final void init(Reader input, Version matchVersion) {
    this.scanner = new ClassicTokenizerImpl(input);

    if (matchVersion.onOrAfter(Version.LUCENE_24)) {
      replaceInvalidAcronym = true;
    } else {
      replaceInvalidAcronym = false;
    }
    this.input = input;
  }

  // this tokenizer generates three attributes:
  // term offset, positionIncrement and type
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;

    while(true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(posIncr);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == ClassicTokenizer.ACRONYM_DEP) {
          if (replaceInvalidAcronym) {
            typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]);
            termAtt.setLength(termAtt.length() - 1); // remove extra '.'
          } else {
            typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM]);
          }
        } else {
          typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        posIncr++;
    }
  }

  @Override
  public final void end() {
    // set final offset
    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    scanner.yyreset(reader);
  }

  /**
   * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, ClassicTokenizer mischaracterized as acronyms tokens like www.abc.com
   * when they should have been labeled as hosts instead.
   * @return true if ClassicTokenizer now returns these tokens as Hosts, otherwise false
   *
   * @deprecated Remove in 3.X and make true the only valid value
   */
  @Deprecated
  public boolean isReplaceInvalidAcronym() {
    return replaceInvalidAcronym;
  }

  /**
   *
   * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
   * @deprecated Remove in 3.X and make true the only valid value
   *
   * See https://issues.apache.org/jira/browse/LUCENE-1068
   */
  @Deprecated
  public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
    this.replaceInvalidAcronym = replaceInvalidAcronym;
  }
}
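A token-iteration sketch showing the type constants in action. The demo class and sample text are invented, and the exact tokens produced by the classic grammar are not asserted.

import java.io.StringReader;

import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class ClassicTokenizerDemo {
  public static void main(String[] args) throws Exception {
    ClassicTokenizer tokenizer = new ClassicTokenizer(
        Version.LUCENE_31, new StringReader("visit lucene.apache.org or mail dev@lucene.apache.org"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
      // The classic grammar labels hostnames <HOST> and email addresses <EMAIL>.
      System.out.println(term.toString() + " " + type.type());
    }
    tokenizer.end();
    tokenizer.close();
  }
}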
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:50 */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/15/10 3:01 AM */

package org.apache.lucene.analysis.standard;

@@ -21,7 +21,7 @@ package org.apache.lucene.analysis.standard;

/*

-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
+WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!

*/

@@ -33,10 +33,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 17.05.10 14:50 from the specification file
- * <tt>C:/Users/Uwe Schindler/Projects/lucene/newtrunk/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex</tt>
+ * on 9/15/10 3:01 AM from the specification file
+ * <tt>c:/Users/us/IdeaProjects/lucene/test-dev-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
-class StandardTokenizerImplOrig implements StandardTokenizerInterface {
+class ClassicTokenizerImpl implements StandardTokenizerInterface {

/** This character denotes the end of file */
public static final int YYEOF = -1;

@@ -383,7 +383,7 @@ public final void getText(CharTermAttribute t) {
 *
 * @param in the java.io.Reader to read input from.
 */
-StandardTokenizerImplOrig(java.io.Reader in) {
+ClassicTokenizerImpl(java.io.Reader in) {
  this.zzReader = in;
}

@@ -393,7 +393,7 @@ public final void getText(CharTermAttribute t) {
 *
 * @param in the java.io.Inputstream to read input from.
 */
-StandardTokenizerImplOrig(java.io.InputStream in) {
+ClassicTokenizerImpl(java.io.InputStream in) {
  this(new java.io.InputStreamReader(in));
}
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.standard;

/*

-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
+WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!

*/

@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

%%

-%class StandardTokenizerImplOrig
+%class ClassicTokenizerImpl
%implements StandardTokenizerInterface
%unicode 3.0
%integer
@@ -39,10 +39,12 @@ import java.util.Set;
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
- *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
- *        supplementary characters in stopwords
- *   <li> As of 2.9, StopFilter preserves position
- *        increments
+ *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
+ *        and StopFilter correctly handles Unicode 4.0 supplementary characters
+ *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
+ *        are the pre-3.1 implementations of StandardTokenizer and
+ *        StandardAnalyzer.
+ *   <li> As of 2.9, StopFilter preserves position increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>

@@ -122,7 +124,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
src.setMaxTokenLength(maxTokenLength);
src.setReplaceInvalidAcronym(replaceInvalidAcronym);
-TokenStream tok = new StandardFilter(src);
+TokenStream tok = new StandardFilter(matchVersion, src);
tok = new LowerCaseFilter(matchVersion, tok);
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@@ -17,33 +17,45 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

/** Normalizes tokens extracted with {@link StandardTokenizer}. */

public final class StandardFilter extends TokenFilter {

/** Construct filtering <i>in</i>. */
/**
 * Normalizes tokens extracted with {@link StandardTokenizer}.
 */
public class StandardFilter extends TokenFilter {
private final Version matchVersion;

public StandardFilter(TokenStream in) {
-super(in);
+this(Version.LUCENE_30, in);
}

private static final String APOSTROPHE_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM];

public StandardFilter(Version matchVersion, TokenStream in) {
super(in);
this.matchVersion = matchVersion;
}

private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];

// this filters uses attribute type
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

/** Returns the next token in the stream, or null at EOS.
 * <p>Removes <tt>'s</tt> from the end of words.
 * <p>Removes dots from acronyms.
 */
@Override
-public final boolean incrementToken() throws java.io.IOException {
+public final boolean incrementToken() throws IOException {
if (matchVersion.onOrAfter(Version.LUCENE_31))
return input.incrementToken(); // TODO: add some niceties for the new grammar
else
return incrementTokenClassic();
}

public final boolean incrementTokenClassic() throws IOException {
if (!input.incrementToken()) {
return false;
}
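StandardFilter is now version-gated: from LUCENE_31 on it passes tokens through untouched (the UAX#29 grammar emits no <APOSTROPHE>/<ACRONYM> types to normalize, per the TODO above), while the no-arg constructor defaults to LUCENE_30 and so preserves the classic behavior for existing callers. A construction-only sketch (the class name is invented):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class StandardFilterVersions {
  public static void main(String[] args) {
    // Pre-3.1 versions keep the classic 's / acronym-dot stripping.
    TokenStream classic = new StandardFilter(Version.LUCENE_30,
        new StandardTokenizer(Version.LUCENE_30, new StringReader("Bob's I.B.M.")));
    // LUCENE_31 and later: currently a pass-through over the UAX#29 grammar.
    TokenStream current = new StandardFilter(Version.LUCENE_31,
        new StandardTokenizer(Version.LUCENE_31, new StringReader("Bob's I.B.M.")));
  }
}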
@@ -17,39 +17,42 @@

* limitations under the License.
*/

package org.apache.lucene.analysis.standard;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/** A grammar-based tokenizer constructed with JFlex
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *       dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *       the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
import java.io.IOException;
import java.io.Reader;

/** A grammar-based tokenizer constructed with JFlex.
 * <p>
 * As of Lucene version 3.1, this class implements the Word Break rules from the
 * Unicode Text Segmentation algorithm, as specified in
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
 * <p/>
 * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
 * characters (characters above the Basic Multilingual Plane, which contains
 * those up to and including U+FFFF), this scanner will not recognize them
 * properly. If you need to be able to process text containing supplementary
 * characters, consider using the ICU4J-backed implementation in contrib/icu
 * ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
 * instead of this class, since the ICU4J-backed implementation does not have
 * this limitation.
 * <p>Many applications have specific tokenizer needs. If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
- * compatibility when creating StandardAnalyzer:
+ * compatibility when creating StandardTokenizer:
 * <ul>
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
 *        If you use a previous version number, you get the exact behavior of
 *        {@link ClassicTokenizer} for backwards compatibility.
 * </ul>
 */

@@ -58,12 +61,22 @@ public final class StandardTokenizer extends Tokenizer {
private StandardTokenizerInterface scanner;

public static final int ALPHANUM = 0;
/** @deprecated */
@Deprecated
public static final int APOSTROPHE = 1;
/** @deprecated */
@Deprecated
public static final int ACRONYM = 2;
/** @deprecated */
@Deprecated
public static final int COMPANY = 3;
public static final int EMAIL = 4;
/** @deprecated */
@Deprecated
public static final int HOST = 5;
public static final int NUM = 6;
/** @deprecated */
@Deprecated
public static final int CJ = 7;

/**

@@ -73,6 +86,11 @@ public final class StandardTokenizer extends Tokenizer {
@Deprecated
public static final int ACRONYM_DEP = 8;

public static final int URL = 9;
public static final int SOUTHEAST_ASIAN = 10;
public static final int IDEOGRAPHIC = 11;
public static final int HIRAGANA = 12;

/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",

@@ -83,7 +101,11 @@ public final class StandardTokenizer extends Tokenizer {
"<HOST>",
"<NUM>",
"<CJ>",
-"<ACRONYM_DEP>"
+"<ACRONYM_DEP>",
"<URL>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>"
};

private boolean replaceInvalidAcronym;

@@ -132,7 +154,7 @@ public final class StandardTokenizer extends Tokenizer {

private final void init(Reader input, Version matchVersion) {
this.scanner = matchVersion.onOrAfter(Version.LUCENE_31) ?
-new StandardTokenizerImpl31(input) : new StandardTokenizerImplOrig(input);
+new StandardTokenizerImpl(input) : new ClassicTokenizerImpl(input);
if (matchVersion.onOrAfter(Version.LUCENE_24)) {
replaceInvalidAcronym = true;
} else {
File diff suppressed because it is too large
|
@ -0,0 +1,260 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||
* <p/>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
* <li><NUM>: A number</li>
|
||||
* <li><URL>: A URL</li>
|
||||
* <li><EMAIL>: An email address</li>
|
||||
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
||||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||
* <li><HIRAGANA>: A single hiragana character</li>
|
||||
* </ul>
|
||||
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
|
||||
* characters (characters above the Basic Multilingual Plane, which contains
|
||||
* those up to and including U+FFFF), this scanner will not recognize them
|
||||
* properly. If you need to be able to process text containing supplementary
|
||||
* characters, consider using the ICU4J-backed implementation in contrib/icu
|
||||
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
|
||||
* instead of this class, since the ICU4J-backed implementation does not have
|
||||
* this limitation.
|
||||
*/
|
||||
%%
|
||||
|
||||
%unicode 5.2
|
||||
%integer
|
||||
%final
|
||||
%public
|
||||
%class StandardTokenizerImpl
|
||||
%implements StandardTokenizerInterface
|
||||
%function getNextToken
|
||||
%char
|
||||
|
||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
|
||||
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||
NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
|
||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
||||
|
||||
|
||||
// URL and E-mail syntax specifications:
|
||||
//
|
||||
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
|
||||
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
|
||||
// RFC-1123: Requirements for Internet Hosts - Application and Support
|
||||
// RFC-1738: Uniform Resource Locators (URL)
|
||||
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
|
||||
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
|
||||
// RFC-5321: Simple Mail Transfer Protocol
|
||||
// RFC-5322: Internet Message Format
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
|
||||
|
||||
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
|
||||
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
|
||||
|
||||
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
|
||||
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
|
||||
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
|
||||
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
|
||||
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
|
||||
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
|
||||
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
|
||||
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
|
||||
|
||||
URIunreserved = [-._~A-Za-z0-9]
|
||||
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
|
||||
URIsubDelims = [!$&'()*+,;=]
|
||||
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
|
||||
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
|
||||
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||
URIport = ":" [0-9]{1,5}
|
||||
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
|
||||
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
|
||||
|
||||
URIauthorityStrict = {URIhostStrict} {URIport}?
|
||||
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
|
||||
|
||||
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
|
||||
HTTPpath = ("/" {HTTPsegment})*
|
||||
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
|
||||
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
|
||||
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
|
||||
|
||||
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
|
||||
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
|
||||
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
|
||||
FTPscheme = [fF][tT][pP] "://"
|
||||
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
|
||||
|
||||
FILEscheme = [fF][iI][lL][eE] "://"
|
||||
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
|
||||
|
||||
URL = {HTTPurl} | {FTPurl} | {FILEurl}
|
||||
|
||||
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
|
||||
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
|
||||
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
|
||||
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
|
||||
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
|
||||
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
|
||||
// in the {EMAILbracketedHost} definition without incurring any size penalties,
|
||||
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
|
||||
// The IP address regexes are included in {EMAILbracketedHost} simply as a
|
||||
// reminder that they are acceptable bracketed host forms.
|
||||
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
|
||||
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
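The {IPv4DecimalOctet} macro above caps each octet at 255. A quick sanity check, translating that macro and {IPv4Address} verbatim into java.util.regex (the class name and test strings are illustrative):

import java.util.regex.Pattern;

public class IPv4MacroCheck {
  // Verbatim translation of {IPv4DecimalOctet} and {IPv4Address}.
  static final String OCTET =
      "0{0,2}[0-9]|0?[1-9][0-9]|1[0-9][0-9]|2(?:[0-4][0-9]|5[0-5])";
  static final Pattern IPV4 =
      Pattern.compile("(?:" + OCTET + ")(?:\\.(?:" + OCTET + ")){3}");

  public static void main(String[] args) {
    for (String s : new String[] {"192.168.0.1", "255.255.255.255", "256.1.1.1", "1.2.3"}) {
      System.out.println(s + " -> " + IPV4.matcher(s).matches());
    }
    // Prints true, true, false, false: octets above 255 and
    // short dotted quads are rejected.
  }
}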
|
||||
|
||||
%{
|
||||
/** Alphanumeric sequences */
|
||||
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
|
||||
|
||||
/** Numbers */
|
||||
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
|
||||
|
||||
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
|
||||
public static final int URL_TYPE = StandardTokenizer.URL;
|
||||
|
||||
/** E-mail addresses */
|
||||
public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
|
||||
|
||||
/**
|
||||
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||
* together as a single token rather than broken up, because the logic
|
||||
* required to break them at word boundaries is too complex for UAX#29.
|
||||
* {@see Unicode Line Breaking Algorithm http://www.unicode.org/reports/tr14/#SA}
|
||||
*/
|
||||
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
|
||||
|
||||
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
|
||||
|
||||
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
return yychar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills CharTermAttribute with the current token text.
|
||||
*/
|
||||
public final void getText(CharTermAttribute t) {
|
||||
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
// UAX#29 WB1. sot ÷
|
||||
// WB2. ÷ eot
|
||||
//
|
||||
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
|
||||
|
||||
{URL} { return URL_TYPE; }
|
||||
{EMAIL} { return EMAIL_TYPE; }
|
||||
|
||||
// UAX#29 WB8. Numeric × Numeric
|
||||
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||
| {MidNumericEx} {NumericEx}
|
||||
| {NumericEx})*
|
||||
{ExtendNumLetEx}*
|
||||
{ return NUMERIC_TYPE; }
|
||||
|
||||
|
||||
// UAX#29 WB5. ALetter × ALetter
|
||||
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
||||
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
||||
// WB9. ALetter × Numeric
|
||||
// WB10. Numeric × ALetter
|
||||
// WB13. Katakana × Katakana
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
|
||||
{ExtendNumLetEx}*
|
||||
{ return WORD_TYPE; }
|
||||
|
||||
|
||||
// From UAX #29:
|
||||
//
|
||||
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
||||
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
||||
// boundary property values based on criteria outside of the scope of this
|
||||
// annex. That means that satisfactory treatment of languages like Chinese
|
||||
// or Thai requires special handling.
|
||||
//
|
||||
// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
|
||||
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
||||
//
|
||||
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
|
||||
// Lao, etc.) are kept together. This grammar does the same below.
|
||||
//
|
||||
// See also the Unicode Line Breaking Algorithm:
|
||||
//
|
||||
// http://www.unicode.org/reports/tr14/#SA
|
||||
//
|
||||
\p{LB:Complex_Context}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||
|
||||
// UAX#29 WB14. Any ÷ Any
|
||||
//
|
||||
\p{Script:Han} { return IDEOGRAPHIC_TYPE; }
|
||||
\p{Script:Hiragana} { return HIRAGANA_TYPE; }
|
||||
|
||||
|
||||
// UAX#29 WB3. CR × LF
|
||||
// WB3a. (Newline | CR | LF) ÷
|
||||
// WB3b. ÷ (Newline | CR | LF)
|
||||
// WB14. Any ÷ Any
|
||||
//
|
||||
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
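Putting the grammar above to work: a hedged sketch that runs the 3.1 StandardTokenizer (backed by this scanner) over mixed input and prints each token's type. The class name and input are made up; the expected types assume the {URL} and {EMAIL} rules win over the word rule, as ordered above:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class TokenTypeSketch {
  public static void main(String[] args) throws Exception {
    StandardTokenizer ts = new StandardTokenizer(Version.LUCENE_31,
        new StringReader("mail admin@example.com or visit http://example.com/docs"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    while (ts.incrementToken()) {
      // Expected: <ALPHANUM> for the plain words, <EMAIL> for the
      // address, and <URL> for the link.
      System.out.println(term.toString() + " -> " + type.type());
    }
    ts.end();
    ts.close();
  }
}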
|
|
@ -1,134 +0,0 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
|
||||
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
|
||||
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
%%
|
||||
|
||||
%class StandardTokenizerImpl31
|
||||
%implements StandardTokenizerInterface
|
||||
%unicode 4.0
|
||||
%integer
|
||||
%function getNextToken
|
||||
%pack
|
||||
%char
|
||||
|
||||
%{
|
||||
|
||||
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = StandardTokenizer.ACRONYM;
|
||||
public static final int COMPANY = StandardTokenizer.COMPANY;
|
||||
public static final int EMAIL = StandardTokenizer.EMAIL;
|
||||
public static final int HOST = StandardTokenizer.HOST;
|
||||
public static final int NUM = StandardTokenizer.NUM;
|
||||
public static final int CJ = StandardTokenizer.CJ;
|
||||
/**
|
||||
* @deprecated this solves a bug where HOSTs that end with '.' are identified
|
||||
* as ACRONYMs.
|
||||
*/
|
||||
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
|
||||
|
||||
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
return yychar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills CharTermAttribute with the current token text.
|
||||
*/
|
||||
public final void getText(CharTermAttribute t) {
|
||||
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
THAI = [\u0E00-\u0E59]
|
||||
|
||||
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
|
||||
ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
|
||||
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
// use a post-filter to remove possessives
|
||||
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
|
||||
|
||||
// acronyms: U.S.A., I.B.M., etc.
|
||||
// use a post-filter to remove dots
|
||||
ACRONYM = {LETTER} "." ({LETTER} ".")+
|
||||
|
||||
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
|
||||
|
||||
// company names like AT&T and Excite@Home.
|
||||
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
|
||||
|
||||
// email addresses
|
||||
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
|
||||
|
||||
// hostname
|
||||
HOST = {ALPHANUM} ((".") {ALPHANUM})+
|
||||
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// every other segment must have at least one digit
|
||||
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
|
||||
| {HAS_DIGIT} {P} {ALPHANUM}
|
||||
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
|
||||
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
|
||||
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
|
||||
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
|
||||
|
||||
// punctuation
|
||||
P = ("_"|"-"|"/"|"."|",")
|
||||
|
||||
// at least one digit
|
||||
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
|
||||
|
||||
ALPHA = ({LETTER})+
|
||||
|
||||
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
|
||||
LETTER = !(![:letter:]|{CJ})
|
||||
|
||||
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
|
||||
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
|
||||
|
||||
WHITESPACE = \r\n | [ \r\n\t\f]
|
||||
|
||||
%%
|
||||
|
||||
{ALPHANUM} { return ALPHANUM; }
|
||||
{APOSTROPHE} { return APOSTROPHE; }
|
||||
{ACRONYM} { return ACRONYM; }
|
||||
{COMPANY} { return COMPANY; }
|
||||
{EMAIL} { return EMAIL; }
|
||||
{HOST} { return HOST; }
|
||||
{NUM} { return NUM; }
|
||||
{CJ} { return CJ; }
|
||||
{ACRONYM_DEP} { return ACRONYM_DEP; }
|
||||
|
||||
/** Ignore the rest */
|
||||
. | {WHITESPACE} { /* ignore */ }
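The grammar removed above survives unchanged as ClassicTokenizer. A sketch, assuming the ClassicTokenizer(Version, Reader) constructor, of the company and product-number cases its comments describe; the input and expected types are illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class ClassicGrammarSketch {
  public static void main(String[] args) throws Exception {
    ClassicTokenizer ts = new ClassicTokenizer(Version.LUCENE_31,
        new StringReader("AT&T ships model X3-45B today"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    while (ts.incrementToken()) {
      // Expected: "AT&T" as <COMPANY>, and "X3-45B" as <NUM> -- the
      // hyphenated token contains digits, so it is kept whole.
      System.out.println(term.toString() + " -> " + type.type());
    }
    ts.end();
    ts.close();
  }
}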
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:50 */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/15/10 3:01 AM */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -19,33 +19,51 @@ package org.apache.lucene.analysis.standard;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
|
||||
the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
|
||||
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
||||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 17.05.10 14:50 from the specification file
|
||||
* <tt>C:/Users/Uwe Schindler/Projects/lucene/newtrunk/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex</tt>
|
||||
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* <p/>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
* <li><NUM>: A number</li>
|
||||
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
||||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||
* <li><HIRAGANA>: A single hiragana character</li>
|
||||
* </ul>
|
||||
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
|
||||
* characters (characters above the Basic Multilingual Plane, which contains
|
||||
* those up to and including U+FFFF), this scanner will not recognize them
|
||||
* properly. If you need to be able to process text containing supplementary
|
||||
* characters, consider using the ICU4J-backed implementation in contrib/icu
|
||||
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
|
||||
* instead of this class, since the ICU4J-backed implementation does not have
|
||||
* this limitation.
|
||||
*/
|
||||
class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
||||
|
||||
public final class UAX29Tokenizer extends Tokenizer {
|
||||
|
||||
/** This character denotes the end of file */
|
||||
public static final int YYEOF = -1;
|
||||
private static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
|
||||
/** lexical states */
|
||||
public static final int YYINITIAL = 0;
|
||||
private static final int YYINITIAL = 0;
|
||||
|
||||
/**
|
||||
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
|
||||
|
@ -61,68 +79,113 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
* Translates characters to character classes
|
||||
*/
|
||||
private static final String ZZ_CMAP_PACKED =
|
||||
"\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
|
||||
"\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
|
||||
"\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
|
||||
"\5\0\27\12\1\0\37\12\1\0\u013f\12\31\0\162\12\4\0\14\12"+
|
||||
"\16\0\5\12\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12"+
|
||||
"\1\0\1\12\1\0\24\12\1\0\54\12\1\0\46\12\1\0\5\12"+
|
||||
"\4\0\202\12\10\0\105\12\1\0\46\12\2\0\2\12\6\0\20\12"+
|
||||
"\41\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"+
|
||||
"\56\0\32\12\5\0\13\12\25\0\12\2\4\0\2\12\1\0\143\12"+
|
||||
"\1\0\1\12\17\0\2\12\7\0\2\12\12\2\3\12\2\0\1\12"+
|
||||
"\20\0\1\12\1\0\36\12\35\0\3\12\60\0\46\12\13\0\1\12"+
|
||||
"\u0152\0\66\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2"+
|
||||
"\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12"+
|
||||
"\3\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12\4\0\12\2"+
|
||||
"\2\12\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0"+
|
||||
"\2\12\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0"+
|
||||
"\12\2\2\0\3\12\20\0\11\12\1\0\3\12\1\0\26\12\1\0"+
|
||||
"\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0"+
|
||||
"\2\12\4\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0"+
|
||||
"\7\12\1\0\2\12\1\0\5\12\3\0\1\12\36\0\2\12\1\0"+
|
||||
"\3\12\4\0\12\2\1\0\1\12\21\0\1\12\1\0\6\12\3\0"+
|
||||
"\3\12\1\0\4\12\3\0\2\12\1\0\1\12\1\0\2\12\3\0"+
|
||||
"\2\12\3\0\3\12\3\0\10\12\1\0\3\12\55\0\11\2\25\0"+
|
||||
"\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\46\0"+
|
||||
"\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12\1\0"+
|
||||
"\12\12\1\0\5\12\3\0\1\12\40\0\1\12\1\0\2\12\4\0"+
|
||||
"\12\2\25\0\10\12\1\0\3\12\1\0\27\12\1\0\20\12\46\0"+
|
||||
"\2\12\4\0\12\2\25\0\22\12\3\0\30\12\1\0\11\12\1\0"+
|
||||
"\1\12\2\0\7\12\71\0\1\1\60\12\1\1\2\12\14\1\7\12"+
|
||||
"\11\1\12\2\47\0\2\12\1\0\1\12\2\0\2\12\1\0\1\12"+
|
||||
"\2\0\1\12\6\0\4\12\1\0\7\12\1\0\3\12\1\0\1\12"+
|
||||
"\1\0\1\12\2\0\2\12\1\0\4\12\1\0\2\12\11\0\1\12"+
|
||||
"\2\0\5\12\1\0\1\12\11\0\12\2\2\0\2\12\42\0\1\12"+
|
||||
"\37\0\12\2\26\0\10\12\1\0\42\12\35\0\4\12\164\0\42\12"+
|
||||
"\1\0\5\12\1\0\2\12\25\0\12\2\6\0\6\12\112\0\46\12"+
|
||||
"\12\0\51\12\7\0\132\12\5\0\104\12\5\0\122\12\6\0\7\12"+
|
||||
"\1\0\77\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\1\12"+
|
||||
"\1\0\4\12\2\0\47\12\1\0\1\12\1\0\4\12\2\0\37\12"+
|
||||
"\1\0\1\12\1\0\4\12\2\0\7\12\1\0\1\12\1\0\4\12"+
|
||||
"\2\0\7\12\1\0\7\12\1\0\27\12\1\0\37\12\1\0\1\12"+
|
||||
"\1\0\4\12\2\0\7\12\1\0\47\12\1\0\23\12\16\0\11\2"+
|
||||
"\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0\32\12\5\0\113\12"+
|
||||
"\25\0\15\12\1\0\4\12\16\0\22\12\16\0\22\12\16\0\15\12"+
|
||||
"\1\0\3\12\17\0\64\12\43\0\1\12\4\0\1\12\3\0\12\2"+
|
||||
"\46\0\12\2\6\0\130\12\10\0\51\12\127\0\35\12\51\0\12\2"+
|
||||
"\36\12\2\0\5\12\u038b\0\154\12\224\0\234\12\4\0\132\12\6\0"+
|
||||
"\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"+
|
||||
"\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"+
|
||||
"\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"+
|
||||
"\6\12\4\0\15\12\5\0\3\12\1\0\7\12\164\0\1\12\15\0"+
|
||||
"\1\12\202\0\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0"+
|
||||
"\5\12\6\0\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0"+
|
||||
"\3\12\1\0\7\12\3\0\3\12\5\0\5\12\u0ebb\0\2\12\52\0"+
|
||||
"\5\12\5\0\2\12\3\0\1\13\126\13\6\13\3\13\1\13\132\13"+
|
||||
"\1\13\4\13\5\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
|
||||
"\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
|
||||
"\u0773\0\u2ba4\12\u215c\0\u012e\13\2\13\73\13\225\13\7\12\14\0\5\12"+
|
||||
"\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"+
|
||||
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
|
||||
"\2\0\66\12\50\0\14\12\164\0\5\12\1\0\207\12\23\0\12\2"+
|
||||
"\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
|
||||
"\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
|
||||
"\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
|
||||
"\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
|
||||
"\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
|
||||
"\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
|
||||
"\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
|
||||
"\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
|
||||
"\1\0\7\2\234\1\13\0\46\1\2\0\1\1\7\0\47\1\1\0"+
|
||||
"\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
|
||||
"\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
|
||||
"\2\0\13\2\6\0\52\1\24\2\1\0\12\3\1\0\1\3\1\6"+
|
||||
"\1\0\2\1\1\2\143\1\1\0\1\1\17\2\2\1\2\2\1\0"+
|
||||
"\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1\1\2"+
|
||||
"\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1\11\2"+
|
||||
"\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1\11\2"+
|
||||
"\1\1\3\2\1\1\5\2\322\0\4\2\66\1\2\0\1\2\1\1"+
|
||||
"\21\2\1\0\1\1\5\2\2\0\12\1\2\2\2\0\12\3\1\0"+
|
||||
"\2\1\6\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0"+
|
||||
"\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1"+
|
||||
"\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1"+
|
||||
"\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0\6\1"+
|
||||
"\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1"+
|
||||
"\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2"+
|
||||
"\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2\3\1"+
|
||||
"\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0"+
|
||||
"\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0"+
|
||||
"\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\3"+
|
||||
"\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
|
||||
"\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
|
||||
"\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0"+
|
||||
"\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1"+
|
||||
"\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1"+
|
||||
"\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2"+
|
||||
"\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0\10\1"+
|
||||
"\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1"+
|
||||
"\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0"+
|
||||
"\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1"+
|
||||
"\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2"+
|
||||
"\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1"+
|
||||
"\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1\1\0"+
|
||||
"\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2\1\0\4\2"+
|
||||
"\11\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0\6\1\2\0"+
|
||||
"\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
|
||||
"\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0"+
|
||||
"\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11\10\12\1\0"+
|
||||
"\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0\1\11\2\0"+
|
||||
"\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0\1\11\1\0"+
|
||||
"\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12\1\0\2\12"+
|
||||
"\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0\12\3\2\0"+
|
||||
"\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0\1\2\1\0"+
|
||||
"\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2"+
|
||||
"\1\0\2\2\4\1\4\0\10\2\1\0\44\2\11\0\1\2\71\0"+
|
||||
"\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12\1\11"+
|
||||
"\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12\12\3"+
|
||||
"\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1\1\0"+
|
||||
"\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0"+
|
||||
"\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0"+
|
||||
"\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1\4\0"+
|
||||
"\1\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0"+
|
||||
"\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\2"+
|
||||
"\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0\3\1"+
|
||||
"\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11\1\12"+
|
||||
"\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0\51\1"+
|
||||
"\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0\14\2"+
|
||||
"\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12\7\11"+
|
||||
"\2\12\6\0\13\3\3\0\2\11\40\0\27\1\5\2\4\0\65\11"+
|
||||
"\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3\6\0\16\11"+
|
||||
"\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0\11\2\14\0"+
|
||||
"\3\2\36\1\12\2\3\0\2\1\12\3\106\0\44\1\24\2\10\0"+
|
||||
"\12\3\3\0\3\1\12\3\44\1\122\0\3\2\1\0\25\2\4\1"+
|
||||
"\1\2\4\1\1\2\15\0\300\1\47\2\26\0\3\2\u0116\1\2\0"+
|
||||
"\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1\1\0"+
|
||||
"\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1\1\0"+
|
||||
"\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1\4\0"+
|
||||
"\15\1\5\0\3\1\1\0\7\1\17\0\4\2\10\0\2\7\12\0"+
|
||||
"\1\7\2\0\1\5\2\0\5\2\20\0\2\10\3\0\1\6\17\0"+
|
||||
"\1\10\13\0\5\2\5\0\6\2\1\0\1\1\15\0\1\1\20\0"+
|
||||
"\5\1\73\0\41\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
|
||||
"\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
|
||||
"\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0"+
|
||||
"\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1\6\0"+
|
||||
"\4\1\3\2\16\0\46\1\12\0\66\1\11\0\1\1\20\0\27\1"+
|
||||
"\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
|
||||
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\57\0\1\1"+
|
||||
"\120\0\32\13\1\0\131\13\14\0\326\13\57\0\1\1\1\0\1\13"+
|
||||
"\31\0\11\13\6\2\1\0\5\4\2\0\3\13\1\1\1\1\4\0"+
|
||||
"\126\14\2\0\2\2\2\4\3\14\133\4\1\0\4\4\5\0\51\1"+
|
||||
"\3\0\136\1\21\0\30\1\70\0\20\4\320\0\57\4\1\0\130\4"+
|
||||
"\250\0\u19b6\13\112\0\u51cc\13\64\0\u048d\1\103\0\56\1\2\0\u010d\1"+
|
||||
"\3\0\20\1\12\3\2\1\24\0\40\1\2\0\15\1\4\2\11\0"+
|
||||
"\2\2\1\0\31\1\10\0\120\1\2\2\45\0\11\1\2\0\147\1"+
|
||||
"\2\0\2\1\156\0\7\1\1\2\3\1\1\2\4\1\1\2\27\1"+
|
||||
"\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0"+
|
||||
"\22\2\6\1\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1"+
|
||||
"\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3"+
|
||||
"\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3"+
|
||||
"\6\0\33\11\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12"+
|
||||
"\5\11\2\12\1\11\1\12\1\11\30\0\5\11\340\0\43\1\10\2"+
|
||||
"\1\0\2\2\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1"+
|
||||
"\u2104\0\u012e\13\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1"+
|
||||
"\5\0\1\1\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1"+
|
||||
"\1\0\2\1\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1"+
|
||||
"\2\0\66\1\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6"+
|
||||
"\13\0\7\2\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0"+
|
||||
"\1\6\1\5\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7"+
|
||||
"\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1"+
|
||||
"\4\0\1\10\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1"+
|
||||
"\2\0\6\1\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
|
||||
|
||||
/**
|
||||
* Translates characters to character classes
|
||||
|
@ -135,13 +198,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
private static final int [] ZZ_ACTION = zzUnpackAction();
|
||||
|
||||
private static final String ZZ_ACTION_PACKED_0 =
|
||||
"\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
|
||||
"\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
|
||||
"\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
|
||||
"\1\4";
|
||||
"\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
|
||||
"\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
|
||||
|
||||
private static int [] zzUnpackAction() {
|
||||
int [] result = new int[51];
|
||||
int [] result = new int[16];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -166,16 +227,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
|
||||
|
||||
private static final String ZZ_ROWMAP_PACKED_0 =
|
||||
"\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
|
||||
"\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
|
||||
"\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
|
||||
"\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
|
||||
"\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
|
||||
"\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
|
||||
"\0\u0268\0\u0276\0\u0284";
|
||||
"\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
|
||||
"\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
|
||||
|
||||
private static int [] zzUnpackRowMap() {
|
||||
int [] result = new int[51];
|
||||
int [] result = new int[16];
|
||||
int offset = 0;
|
||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -198,49 +254,21 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
||||
|
||||
private static final String ZZ_TRANS_PACKED_0 =
|
||||
"\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
|
||||
"\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
|
||||
"\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
|
||||
"\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
|
||||
"\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
|
||||
"\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
|
||||
"\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
|
||||
"\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
|
||||
"\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
|
||||
"\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
|
||||
"\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
|
||||
"\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
|
||||
"\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
|
||||
"\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
|
||||
"\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
|
||||
"\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
|
||||
"\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
|
||||
"\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
|
||||
"\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
|
||||
"\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
|
||||
"\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
|
||||
"\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
|
||||
"\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
|
||||
"\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
|
||||
"\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
|
||||
"\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
|
||||
"\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
|
||||
"\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
|
||||
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
|
||||
"\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
|
||||
"\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
|
||||
"\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
|
||||
"\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
|
||||
"\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
|
||||
"\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
|
||||
"\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
|
||||
"\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
|
||||
"\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
|
||||
"\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
|
||||
"\1\11\2\52\1\0\1\24\3\0";
|
||||
"\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
|
||||
"\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
|
||||
"\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
|
||||
"\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
|
||||
"\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
|
||||
"\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
|
||||
"\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
|
||||
"\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
|
||||
"\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
|
||||
"\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
|
||||
"\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
|
||||
"\2\0";
|
||||
|
||||
private static int [] zzUnpackTrans() {
|
||||
int [] result = new int[658];
|
||||
int [] result = new int[169];
|
||||
int offset = 0;
|
||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -278,11 +306,11 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||
|
||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||
"\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
|
||||
"\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
|
||||
"\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
|
||||
"\1\1\2\0";
|
||||
|
||||
private static int [] zzUnpackAttribute() {
|
||||
int [] result = new int[51];
|
||||
int [] result = new int[16];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -350,35 +378,124 @@ class StandardTokenizerImpl31 implements StandardTokenizerInterface {
|
|||
private boolean zzEOFDone;
|
||||
|
||||
/* user code: */
|
||||
/** Alphanumeric sequences */
|
||||
public static final String WORD_TYPE = "<ALPHANUM>";
|
||||
|
||||
/** Numbers */
|
||||
public static final String NUMERIC_TYPE = "<NUM>";
|
||||
|
||||
/**
|
||||
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||
* together as a single token rather than broken up, because the logic
|
||||
* required to break them at word boundaries is too complex for UAX#29.
|
||||
* {@see Unicode Line Breaking Algorithm http://www.unicode.org/reports/tr14/#SA}
|
||||
*/
|
||||
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
|
||||
|
||||
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
|
||||
|
||||
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt
|
||||
= addAttribute(PositionIncrementAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
||||
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||
private int posIncr;
|
||||
|
||||
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = StandardTokenizer.ACRONYM;
|
||||
public static final int COMPANY = StandardTokenizer.COMPANY;
|
||||
public static final int EMAIL = StandardTokenizer.EMAIL;
|
||||
public static final int HOST = StandardTokenizer.HOST;
|
||||
public static final int NUM = StandardTokenizer.NUM;
|
||||
public static final int CJ = StandardTokenizer.CJ;
|
||||
/**
|
||||
* @deprecated this solves a bug where HOSTs that end with '.' are identified
|
||||
* as ACRONYMs.
|
||||
*/
|
||||
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
|
||||
|
||||
/**
|
||||
* @param source The AttributeSource to use
|
||||
* @param input The input reader
|
||||
*/
|
||||
public UAX29Tokenizer(AttributeSource source, Reader input) {
|
||||
super(source, input);
|
||||
zzReader = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param factory The AttributeFactory to use
|
||||
* @param input The input reader
|
||||
*/
|
||||
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
|
||||
super(factory, input);
|
||||
zzReader = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the max allowed token length. Any token longer than this is skipped.
|
||||
* @param length the new max allowed token length
|
||||
*/
|
||||
public void setMaxTokenLength(int length) {
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
|
||||
/**
|
||||
* Returns the max allowed token length. Any token longer than this is
|
||||
* skipped.
|
||||
* @return the max allowed token length
|
||||
*/
|
||||
public int getMaxTokenLength() {
|
||||
return maxTokenLength;
|
||||
}
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
return yychar;
|
||||
}
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(yychar + yylength());
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills CharTermAttribute with the current token text.
|
||||
*/
|
||||
public final void getText(CharTermAttribute t) {
|
||||
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
@Override
|
||||
public void reset(Reader reader) throws IOException {
|
||||
super.reset(reader);
|
||||
yyreset(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// This method is required because of two JFlex limitations:
|
||||
// 1. No way to insert code at the beginning of the generated scanning
|
||||
// get-next-token method; and
|
||||
// 2. No way to declare @Override on the generated scanning method.
|
||||
clearAttributes();
|
||||
posIncr = 1;
|
||||
return getNextToken();
|
||||
}
|
||||
|
||||
/**
|
||||
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
|
||||
* the current match, the TypeAttribute from the passed-in tokenType, and
|
||||
* the PositionIncrementAttribute to one, unless the immediately previous
|
||||
* token(s) was/were skipped because maxTokenLength was exceeded, in which
|
||||
* case the PositionIncrementAttribute is set to one plus the number of
|
||||
* skipped overly long tokens.
|
||||
* <p/>
|
||||
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
|
||||
* and false is returned.
|
||||
*
|
||||
* @param tokenType The type of the matching token
|
||||
* @return true if there is a token available (not too long); false otherwise
|
||||
*/
|
||||
private boolean populateAttributes(String tokenType) {
|
||||
boolean isTokenAvailable = false;
|
||||
if (yylength() > maxTokenLength) {
|
||||
// When we skip a too-long token, we treat it like a stopword, introducing
|
||||
// a position increment gap
|
||||
++posIncr;
|
||||
} else {
|
||||
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
|
||||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
offsetAtt.setOffset(correctOffset(yychar),
|
||||
correctOffset(yychar + yylength()));
|
||||
typeAtt.setType(tokenType);
|
||||
isTokenAvailable = true;
|
||||
}
|
||||
return isTokenAvailable;
|
||||
}
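The position-increment bookkeeping above means a skipped over-length token behaves like a stopword, leaving a gap. A hypothetical sketch (token text and max length chosen purely for illustration):

import java.io.StringReader;

import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class MaxTokenLengthSketch {
  public static void main(String[] args) throws Exception {
    UAX29Tokenizer ts =
        new UAX29Tokenizer(new StringReader("ok unreasonablylongtoken fine"));
    ts.setMaxTokenLength(5);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr =
        ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // Expected: "ok" with posIncr=1, then "fine" with posIncr=2 --
      // the skipped over-length token leaves a one-position gap.
      System.out.println(term.toString() + " posIncr=" + posIncr.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}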
|
||||
|
||||
|
||||
/**
|
||||
|
@ -387,7 +504,8 @@ public final void getText(CharTermAttribute t) {
|
|||
*
|
||||
* @param in the java.io.Reader to read input from.
|
||||
*/
|
||||
StandardTokenizerImpl31(java.io.Reader in) {
|
||||
public UAX29Tokenizer(java.io.Reader in) {
|
||||
super(in);
|
||||
this.zzReader = in;
|
||||
}
|
||||
|
||||
|
@ -397,7 +515,7 @@ public final void getText(CharTermAttribute t) {
|
|||
*
|
||||
* @param in the java.io.Inputstream to read input from.
|
||||
*/
|
||||
StandardTokenizerImpl31(java.io.InputStream in) {
|
||||
public UAX29Tokenizer(java.io.InputStream in) {
|
||||
this(new java.io.InputStreamReader(in));
|
||||
}
|
||||
|
||||
|
@ -411,7 +529,7 @@ public final void getText(CharTermAttribute t) {
|
|||
char [] map = new char[0x10000];
|
||||
int i = 0; /* index in packed string */
|
||||
int j = 0; /* index in unpacked array */
|
||||
while (i < 1234) {
|
||||
while (i < 2138) {
|
||||
int count = packed.charAt(i++);
|
||||
char value = packed.charAt(i++);
|
||||
do map[j++] = value; while (--count > 0);
|
||||
|
@ -477,7 +595,7 @@ public final void getText(CharTermAttribute t) {
|
|||
/**
|
||||
* Closes the input stream.
|
||||
*/
|
||||
public final void yyclose() throws java.io.IOException {
|
||||
private final void yyclose() throws java.io.IOException {
|
||||
zzAtEOF = true; /* indicate end of file */
|
||||
zzEndRead = zzStartRead; /* invalidate buffer */
|
||||
|
||||
|
@ -498,7 +616,7 @@ public final void getText(CharTermAttribute t) {
|
|||
*
|
||||
* @param reader the new input stream
|
||||
*/
|
||||
public final void yyreset(java.io.Reader reader) {
|
||||
private final void yyreset(java.io.Reader reader) {
|
||||
zzReader = reader;
|
||||
zzAtBOL = true;
|
||||
zzAtEOF = false;
|
||||
|
@ -515,7 +633,7 @@ public final void getText(CharTermAttribute t) {
|
|||
/**
|
||||
* Returns the current lexical state.
|
||||
*/
|
||||
public final int yystate() {
|
||||
private final int yystate() {
|
||||
return zzLexicalState;
|
||||
}
|
||||
|
||||
|
@ -525,7 +643,7 @@ public final void getText(CharTermAttribute t) {
|
|||
*
|
||||
* @param newState the new lexical state
|
||||
*/
|
||||
public final void yybegin(int newState) {
|
||||
private final void yybegin(int newState) {
|
||||
zzLexicalState = newState;
|
||||
}
|
||||
|
||||
|
@ -533,7 +651,7 @@ public final void getText(CharTermAttribute t) {
|
|||
/**
|
||||
* Returns the text matched by the current regular expression.
|
||||
*/
|
||||
public final String yytext() {
|
||||
private final String yytext() {
|
||||
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
|
||||
}
|
||||
|
||||
|
@ -549,7 +667,7 @@ public final void getText(CharTermAttribute t) {
|
|||
*
|
||||
* @return the character at position pos
|
||||
*/
|
||||
public final char yycharat(int pos) {
|
||||
private final char yycharat(int pos) {
|
||||
return zzBuffer[zzStartRead+pos];
|
||||
}
|
||||
|
||||
|
@ -557,7 +675,7 @@ public final void getText(CharTermAttribute t) {
|
|||
/**
|
||||
* Returns the length of the matched text region.
|
||||
*/
|
||||
public final int yylength() {
|
||||
private final int yylength() {
|
||||
return zzMarkedPos-zzStartRead;
|
||||
}
|
||||
|
||||
|
@ -597,7 +715,7 @@ public final void getText(CharTermAttribute t) {
|
|||
* @param number the number of characters to be read again.
|
||||
* This number must not be greater than yylength()!
|
||||
*/
|
||||
public void yypushback(int number) {
|
||||
private void yypushback(int number) {
|
||||
if ( number > yylength() )
|
||||
zzScanError(ZZ_PUSHBACK_2BIG);
|
||||
|
||||
|
@ -612,7 +730,7 @@ public final void getText(CharTermAttribute t) {
|
|||
* @return the next token
|
||||
* @exception java.io.IOException if any I/O-Error occurs
|
||||
*/
|
||||
public int getNextToken() throws java.io.IOException {
|
||||
private boolean getNextToken() throws java.io.IOException {
|
||||
int zzInput;
|
||||
int zzAction;
|
||||
|
||||
|
@ -685,49 +803,35 @@ public final void getText(CharTermAttribute t) {
|
|||
|
||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||
case 5:
|
||||
{ return NUM;
|
||||
{ if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
|
||||
}
|
||||
case 7: break;
|
||||
case 1:
|
||||
{ /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
|
||||
}
|
||||
case 8: break;
|
||||
case 3:
|
||||
{ if (populateAttributes(NUMERIC_TYPE)) return true;
|
||||
}
|
||||
case 9: break;
|
||||
case 6:
|
||||
{ if (populateAttributes(HIRAGANA_TYPE)) return true;
|
||||
}
|
||||
case 10: break;
|
||||
case 4:
|
||||
{ if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
|
||||
}
|
||||
case 11: break;
|
||||
case 9:
|
||||
{ return ACRONYM;
|
||||
case 2:
|
||||
{ if (populateAttributes(WORD_TYPE)) return true;
|
||||
}
|
||||
case 12: break;
|
||||
case 7:
|
||||
{ return COMPANY;
|
||||
}
|
||||
case 13: break;
|
||||
case 10:
|
||||
{ return EMAIL;
|
||||
}
|
||||
case 14: break;
|
||||
case 1:
|
||||
{ /* ignore */
|
||||
}
|
||||
case 15: break;
|
||||
case 6:
|
||||
{ return APOSTROPHE;
|
||||
}
|
||||
case 16: break;
|
||||
case 3:
|
||||
{ return CJ;
|
||||
}
|
||||
case 17: break;
|
||||
case 8:
|
||||
{ return ACRONYM_DEP;
|
||||
}
|
||||
case 18: break;
|
||||
case 2:
|
||||
{ return ALPHANUM;
|
||||
}
|
||||
case 19: break;
|
||||
case 4:
|
||||
{ return HOST;
|
||||
}
|
||||
case 20: break;
|
||||
default:
|
||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||
zzAtEOF = true;
|
||||
return YYEOF;
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
zzScanError(ZZ_NO_MATCH);
|
|
@ -0,0 +1,273 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
||||
/**
|
||||
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* <p/>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
* <li><NUM>: A number</li>
|
||||
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
||||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||
* <li><HIRAGANA>: A single hiragana character</li>
|
||||
* </ul>
|
||||
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
|
||||
* characters (characters above the Basic Multilingual Plane, which contains
|
||||
* those up to and including U+FFFF), this scanner will not recognize them
|
||||
* properly. If you need to be able to process text containing supplementary
|
||||
* characters, consider using the ICU4J-backed implementation in contrib/icu
|
||||
* ({@link org.apache.lucene.analysis.icu.segmentation.ICUTokenizer})
|
||||
* instead of this class, since the ICU4J-backed implementation does not have
|
||||
* this limitation.
|
||||
*/
|
||||
%%
|
||||
|
||||
%unicode 5.2
|
||||
%final
|
||||
%public
|
||||
%apiprivate
|
||||
%class UAX29Tokenizer
|
||||
%extends Tokenizer
|
||||
%type boolean
|
||||
%function getNextToken
|
||||
%char
|
||||
|
||||
%init{
|
||||
super(in);
|
||||
%init}
|
||||
|
||||
// WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
|
||||
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||
NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
|
||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
||||
|
||||
%{
|
||||
/** Alphanumeric sequences */
|
||||
public static final String WORD_TYPE = "<ALPHANUM>";
|
||||
|
||||
/** Numbers */
|
||||
public static final String NUMERIC_TYPE = "<NUM>";
|
||||
|
||||
/**
|
||||
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||
* together as a single token rather than broken up, because the logic
|
||||
* required to break them at word boundaries is too complex for UAX#29.
|
||||
* {@see Unicode Line Breaking Algorithm http://www.unicode.org/reports/tr14/#SA}
|
||||
*/
|
||||
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
|
||||
|
||||
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
|
||||
|
||||
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt
|
||||
= addAttribute(PositionIncrementAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
||||
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||
private int posIncr;
|
||||
|
||||
|
||||
/**
|
||||
* @param source The AttributeSource to use
|
||||
* @param input The input reader
|
||||
*/
|
||||
public UAX29Tokenizer(AttributeSource source, Reader input) {
|
||||
super(source, input);
|
||||
zzReader = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param factory The AttributeFactory to use
|
||||
* @param input The input reader
|
||||
*/
|
||||
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
|
||||
super(factory, input);
|
||||
zzReader = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the max allowed token length. Any token longer than this is skipped.
|
||||
* @param length the new max allowed token length
|
||||
*/
|
||||
public void setMaxTokenLength(int length) {
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the max allowed token length. Any token longer than this is
|
||||
* skipped.
|
||||
* @return the max allowed token length
|
||||
*/
|
||||
public int getMaxTokenLength() {
|
||||
return maxTokenLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(yychar + yylength());
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader reader) throws IOException {
|
||||
super.reset(reader);
|
||||
yyreset(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// This method is required because of two JFlex limitations:
|
||||
// 1. No way to insert code at the beginning of the generated scanning
|
||||
// get-next-token method; and
|
||||
// 2. No way to declare @Override on the generated scanning method.
|
||||
clearAttributes();
|
||||
posIncr = 1;
|
||||
return getNextToken();
|
||||
}
|
||||
|
||||
/**
|
||||
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
|
||||
* the current match, the TypeAttribute from the passed-in tokenType, and
|
||||
* the PositionIncrementAttribute to one, unless the immediately previous
|
||||
* token(s) was/were skipped because maxTokenLength was exceeded, in which
|
||||
* case the PositionIncrementAttribute is set to one plus the number of
|
||||
* skipped overly long tokens.
|
||||
* <p/>
|
||||
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
|
||||
* and false is returned.
|
||||
*
|
||||
* @param tokenType The type of the matching token
|
||||
* @return true if there is a token available (not too long); false otherwise
|
||||
*/
|
||||
private boolean populateAttributes(String tokenType) {
|
||||
boolean isTokenAvailable = false;
|
||||
if (yylength() > maxTokenLength) {
|
||||
// When we skip a too-long token, we treat it like a stopword, introducing
|
||||
// a position increment gap
|
||||
++posIncr;
|
||||
} else {
|
||||
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
|
||||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
offsetAtt.setOffset(correctOffset(yychar),
|
||||
correctOffset(yychar + yylength()));
|
||||
typeAtt.setType(tokenType);
|
||||
isTokenAvailable = true;
|
||||
}
|
||||
return isTokenAvailable;
|
||||
}
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
// WB1. sot ÷
|
||||
// WB2. ÷ eot
|
||||
//
|
||||
<<EOF>> { return false; }
|
||||
|
||||
|
||||
// WB8. Numeric × Numeric
|
||||
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||
| {MidNumericEx} {NumericEx}
|
||||
| {NumericEx})*
|
||||
{ExtendNumLetEx}*
|
||||
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
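// e.g., "1,234.56" matches here as a single NUMERIC_TYPE token, since ','
// is MidNum and '.' is MidNumLet, so WB11/WB12 forbid breaks between digits.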
|
||||
|
||||
|
||||
// WB5. ALetter × ALetter
|
||||
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
||||
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
||||
// WB9. ALetter × Numeric
|
||||
// WB10. Numeric × ALetter
|
||||
// WB13. Katakana × Katakana
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
|
||||
{ExtendNumLetEx}*
|
||||
{ if (populateAttributes(WORD_TYPE)) return true; }
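// e.g., "can't" matches here as a single WORD_TYPE token: the apostrophe is
// a mid-letter character, so WB6/WB7 forbid a break between the letters.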
|
||||
|
||||
|
||||
// From UAX #29:
|
||||
//
|
||||
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
||||
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
||||
// boundary property values based on criteria outside of the scope of this
|
||||
// annex. That means that satisfactory treatment of languages like Chinese
|
||||
// or Thai requires special handling.
|
||||
//
|
||||
// In Unicode 5.2, only one character has the \p{Line_Break = Contingent_Break}
|
||||
// property: U+FFFC OBJECT REPLACEMENT CHARACTER.
|
||||
//
|
||||
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
|
||||
// Lao, etc.) are kept together. This grammar does the same below.
|
||||
//
|
||||
// See also the Unicode Line Breaking Algorithm:
|
||||
//
|
||||
// http://www.unicode.org/reports/tr14/#SA
|
||||
//
|
||||
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
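// e.g., a run of Thai characters is emitted as one SOUTH_EAST_ASIAN_TYPE
// token instead of being broken character-by-character.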
|
||||
|
||||
// WB14. Any ÷ Any
|
||||
//
|
||||
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
|
||||
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
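// Han and Hiragana fall through to WB14, so each character becomes its own
// token, which downstream filters are free to recombine.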
|
||||
|
||||
|
||||
// WB3. CR × LF
|
||||
// WB3a. (Newline | CR | LF) ÷
|
||||
// WB3b. ÷ (Newline | CR | LF)
|
||||
// WB14. Any ÷ Any
|
||||
//
|
||||
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
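A minimal driver sketch for the tokenizer defined above (illustrative only, not
part of this commit; the demo class name is hypothetical, the factory constant is
the standard Lucene default, and the expected tokens follow the grammar comments
above):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

public class UAX29TokenizerDemo {
  public static void main(String[] args) throws IOException {
    UAX29Tokenizer tokenizer = new UAX29Tokenizer(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        new StringReader("can't stop 1,234.56"));
    // The tokenizer registered these attributes in its field initializers above.
    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
      // Expected per the rules above: can't and stop as WORD_TYPE tokens,
      // 1,234.56 as a single NUMERIC_TYPE token.
      System.out.println(term.toString() + " / " + type.type());
    }
    tokenizer.end();   // records the final offset
    tokenizer.close();
  }
}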
@ -17,9 +17,43 @@
|
|||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
A fast grammar-based tokenizer constructed with JFlex.
|
||||
<p>The <code>org.apache.lucene.analysis.standard</code> package contains three
|
||||
fast grammar-based tokenizers constructed with JFlex:</p>
|
||||
<ul>
|
||||
<li><code><a href="StandardTokenizer.html">StandardTokenizer</a></code>:
|
||||
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
|
||||
Segmentation algorithm, as specified in
|
||||
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
|
||||
<code>StandardTokenizer</code>,
|
||||
<code><a href="StandardFilter">StandardFilter</a></code>,
|
||||
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
|
||||
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
|
||||
When the <code>Version</code> specified in the constructor is lower than
|
||||
3.1, the <code><a href="ClassicTokenizer.html">ClassicTokenizer</a></code>
|
||||
implementation is invoked.</li>
|
||||
<li><code><a href="ClassicTokenizer.html">ClassicTokenizer</a></code>:
|
||||
this class was formerly (prior to Lucene 3.1) named
|
||||
<code>StandardTokenizer</code>. (Its tokenization rules are not
|
||||
based on the Unicode Text Segmentation algorithm.)
|
||||
<code><a href="ClassicAnalyzer">ClassicAnalyzer</a></code> includes
|
||||
<code>ClassicTokenizer</code>,
|
||||
<code><a href="StandardFilter">StandardFilter</a></code>,
|
||||
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
|
||||
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
|
||||
</li>
|
||||
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:
|
||||
implements the Word Break rules from the Unicode Text Segmentation
|
||||
algorithm, as specified in
|
||||
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
Unlike <code>StandardTokenizer</code>, it does <b>not</b> tokenize URLs
|
||||
and email addresses as single tokens; they are instead split up into
|
||||
tokens according to the UAX#29 word break rules.
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
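To make the StandardTokenizer/UAX29Tokenizer contrast above concrete, a small
sketch (illustrative only; the demo class name is hypothetical and the expected
tokens follow the package description above):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

public class UrlEmailTokenizationDemo {
  static void dump(TokenStream ts) throws IOException {
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.print("[" + term + "] ");
    }
    ts.end();
    ts.close();
    System.out.println();
  }

  public static void main(String[] args) throws IOException {
    // StandardTokenizer (as of this change) keeps the address whole:
    // expected [mail] [test@example.com]
    dump(new StandardTokenizer(Version.LUCENE_31,
        new StringReader("mail test@example.com")));

    // UAX29Tokenizer applies only the UAX#29 word break rules: the '@'
    // splits the address, while the MidNumLet '.' keeps the host together.
    // expected [mail] [test] [example.com]
    dump(new UAX29Tokenizer(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        new StringReader("mail test@example.com")));
  }
}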
|
||||
|
|
|
@ -120,7 +120,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
|
|
|
@ -58,7 +58,7 @@ public final class ThaiAnalyzer extends ReusableAnalyzerBase {
|
|||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new ThaiWordFilter(matchVersion, result);
|
||||
|
|
|
@ -123,7 +123,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new TurkishLowerCaseFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
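The three hunks above make the same one-line change: StandardFilter now takes
the match Version, like the tokenizer and the other filters in the chain.
Schematically (a sketch assuming the matchVersion and stopwords fields of the
analyzers above; the Turkish and Thai variants swap in their own filters):

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer source = new StandardTokenizer(matchVersion, reader);
  // StandardFilter is now version-aware, matching the rest of the chain:
  TokenStream result = new StandardFilter(matchVersion, source);
  result = new LowerCaseFilter(matchVersion, result);
  result = new StopFilter(matchVersion, result, stopwords);
  return new TokenStreamComponents(source, result);
}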
|
|
|
@ -0,0 +1,267 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<meta name="robots" content="index,nofollow">
|
||||
|
||||
<title>Resources - Lucene-java Wiki</title>
|
||||
<script type="text/javascript" src="/moin_static184/common/js/common.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
<!--
|
||||
var search_hint = "Search";
|
||||
//-->
|
||||
</script>
|
||||
|
||||
|
||||
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin_static184/modernized/css/common.css">
|
||||
<link rel="stylesheet" type="text/css" charset="utf-8" media="screen" href="/moin_static184/modernized/css/screen.css">
|
||||
<link rel="stylesheet" type="text/css" charset="utf-8" media="print" href="/moin_static184/modernized/css/print.css">
|
||||
<link rel="stylesheet" type="text/css" charset="utf-8" media="projection" href="/moin_static184/modernized/css/projection.css">
|
||||
|
||||
<!-- css only for MS IE6/IE7 browsers -->
|
||||
<!--[if lt IE 8]>
|
||||
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin_static184/modernized/css/msie.css">
|
||||
<![endif]-->
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="Start" href="/lucene-java/FrontPageEN">
|
||||
<link rel="Alternate" title="Wiki Markup" href="/lucene-java/Resources?action=raw">
|
||||
<link rel="Alternate" media="print" title="Print View" href="/lucene-java/Resources?action=print">
|
||||
<link rel="Appendix" title="IntroductionToApacheLucene.jp.jpg" href="/lucene-java/Resources?action=AttachFile&do=view&target=IntroductionToApacheLucene.jp.jpg">
|
||||
<link rel="Appendix" title="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" href="/lucene-java/Resources?action=AttachFile&do=view&target=SuchmaschinenEntwickelnMitApacheLucene.de.jpg">
|
||||
<link rel="Appendix" title="building.search.applications.png" href="/lucene-java/Resources?action=AttachFile&do=view&target=building.search.applications.png">
|
||||
<link rel="Appendix" title="lia3d.jpg" href="/lucene-java/Resources?action=AttachFile&do=view&target=lia3d.jpg">
|
||||
<link rel="Search" href="/lucene-java/FindPage">
|
||||
<link rel="Index" href="/lucene-java/TitleIndex">
|
||||
<link rel="Glossary" href="/lucene-java/WordIndex">
|
||||
<link rel="Help" href="/lucene-java/HelpOnFormatting">
|
||||
</head>
|
||||
|
||||
<body lang="en" dir="ltr">
|
||||
|
||||
<div id="header">
|
||||
|
||||
<form id="searchform" method="get" action="/lucene-java/Resources">
|
||||
<div>
|
||||
<input type="hidden" name="action" value="fullsearch">
|
||||
<input type="hidden" name="context" value="180">
|
||||
<label for="searchinput">Search:</label>
|
||||
<input id="searchinput" type="text" name="value" value="" size="20"
|
||||
onfocus="searchFocus(this)" onblur="searchBlur(this)"
|
||||
onkeyup="searchChange(this)" onchange="searchChange(this)" alt="Search">
|
||||
<input id="titlesearch" name="titlesearch" type="submit"
|
||||
value="Titles" alt="Search Titles">
|
||||
<input id="fullsearch" name="fullsearch" type="submit"
|
||||
value="Text" alt="Search Full Text">
|
||||
</div>
|
||||
</form>
|
||||
<script type="text/javascript">
|
||||
<!--// Initialize search form
|
||||
var f = document.getElementById('searchform');
|
||||
f.getElementsByTagName('label')[0].style.display = 'none';
|
||||
var e = document.getElementById('searchinput');
|
||||
searchChange(e);
|
||||
searchBlur(e);
|
||||
//-->
|
||||
</script>
|
||||
|
||||
<div id="logo"><a href="/lucene-java/FrontPageEN">Lucene-java Wiki</a></div>
|
||||
<div id="username"><a href="/lucene-java/Resources?action=login" id="login" rel="nofollow">Login</a></div>
|
||||
<h1 id="locationline">
|
||||
|
||||
<span id="pagelocation"><a class="backlink" href="/lucene-java/Resources?action=fullsearch&context=180&value=linkto%3A%22Resources%22" rel="nofollow" title="Click to do a full-text search for this title">Resources</a></span>
|
||||
</h1>
|
||||
|
||||
|
||||
<ul id="navibar">
|
||||
<li class="wikilink"><a href="/lucene-java/FrontPageEN">FrontPageEN</a></li><li class="wikilink"><a href="/lucene-java/RecentChanges">RecentChanges</a></li><li class="wikilink"><a href="/lucene-java/FindPage">FindPage</a></li><li class="wikilink"><a href="/lucene-java/HelpContents">HelpContents</a></li><li class="current"><a href="/lucene-java/Resources">Resources</a></li>
|
||||
</ul>
|
||||
|
||||
<div id="pageline"><hr style="display:none;"></div>
|
||||
|
||||
<ul class="editbar"><li><span class="disabled">Immutable Page</span></li><li class="toggleCommentsButton" style="display:none;"><a href="#" class="nbcomment" onClick="toggleComments();return false;">Comments</a></li><li><a class="nbinfo" href="/lucene-java/Resources?action=info" rel="nofollow">Info</a></li><li>
|
||||
<form class="actionsmenu" method="GET" action="/lucene-java/Resources">
|
||||
<div>
|
||||
<label>More Actions:</label>
|
||||
<select name="action"
|
||||
onchange="if ((this.selectedIndex != 0) &&
|
||||
(this.options[this.selectedIndex].disabled == false)) {
|
||||
this.form.submit();
|
||||
}
|
||||
this.selectedIndex = 0;">
|
||||
<option value="raw">Raw Text</option>
|
||||
<option value="print">Print View</option>
|
||||
<option value="RenderAsDocbook">Render as Docbook</option>
|
||||
<option value="refresh">Delete Cache</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="SpellCheck">Check Spelling</option>
|
||||
<option value="LikePages">Like Pages</option>
|
||||
<option value="LocalSiteMap">Local Site Map</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="RenamePage" disabled class="disabled">Rename Page</option>
|
||||
<option value="CopyPage">Copy Page</option>
|
||||
<option value="DeletePage" disabled class="disabled">Delete Page</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="MyPages">My Pages</option>
|
||||
<option value="show" disabled class="disabled">Subscribe User</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="show" disabled class="disabled">Remove Spam</option>
|
||||
<option value="show" disabled class="disabled">Revert to this revision</option>
|
||||
<option value="show" disabled class="disabled">Package Pages</option>
|
||||
<option value="SyncPages">Sync Pages</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="Load">Load</option>
|
||||
<option value="Save">Save</option>
|
||||
</select>
|
||||
<input type="submit" value="Do">
|
||||
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
<!--// Init menu
|
||||
actionsMenuInit('More Actions:');
|
||||
//-->
|
||||
</script>
|
||||
</form>
|
||||
</li></ul>
|
||||
|
||||
</div>
|
||||
|
||||
<div id="page" lang="en" dir="ltr">
|
||||
<div dir="ltr" id="content" lang="en"><span class="anchor" id="top"></span>
|
||||
<span class="anchor" id="line-2"></span><p class="line867"><div class="table-of-contents"><p class="table-of-contents-heading">Contents<ol><li>
|
||||
<a href="#Introductions">Introductions</a></li><li>
|
||||
<a href="#Blogs">Blogs</a></li><li>
|
||||
<a href="#Books">Books</a></li><li>
|
||||
<a href="#Articles">Articles</a></li><li>
|
||||
<a href="#Interviews">Interviews</a></li><li>
|
||||
<a href="#Papers">Papers</a></li><li>
|
||||
<a href="#Presentations">Presentations</a></li><li>
|
||||
<a href="#Training">Training</a></li><li>
|
||||
<a href="#Corpora">Corpora</a></li><li>
|
||||
<a href="#Other">Other</a></li></ol></div> <span class="anchor" id="line-3"></span><span class="anchor" id="line-4"></span><p class="line867">
|
||||
<h1 id="Introductions">Introductions</h1>
|
||||
<span class="anchor" id="line-5"></span><span class="anchor" id="line-6"></span><ul><li><p class="line862">The API documentation contains <a class="http" href="http://lucene.apache.org/java/3_0_1/api/all/overview-summary.html#overview_description">a short and simple code example</a> that shows the basic way to index and search <span class="anchor" id="line-7"></span></li><li><p class="line862">The <a class="http" href="http://lucene.apache.org/java/3_0_1/gettingstarted.html">Getting Started Guide</a> that describes the demos that come with Lucene <span class="anchor" id="line-8"></span><span class="anchor" id="line-9"></span><span class="anchor" id="line-10"></span></li></ul><p class="line867">
|
||||
<h1 id="Blogs">Blogs</h1>
|
||||
<span class="anchor" id="line-11"></span><span class="anchor" id="line-12"></span><ul><li><p class="line891"><a class="http" href="http://lucene.grantingersoll.com">Grant's Grunts: Lucene edition</a> - Grant Ingersoll's thoughts on the Lucene ecosystem. <span class="anchor" id="line-13"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/blog/">Lucid Imagination's Blog</a> - Many of the Lucene and Solr committers blog here about how to use Lucene and Solr <span class="anchor" id="line-14"></span></li><li><p class="line891"><a class="http" href="http://blog.sematext.com/">Sematext Blog</a> - Search and Analytics covering Lucene, Solr, Nutch, Hadoop, HBase, and more <span class="anchor" id="line-15"></span><span class="anchor" id="line-16"></span><span class="anchor" id="line-17"></span></li></ul><p class="line867">
|
||||
<h1 id="Books">Books</h1>
|
||||
<span class="anchor" id="line-18"></span><span class="anchor" id="line-19"></span><ul><li><p class="line891"><img alt="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" class="external_image" src="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" title="http://www.manning.com/hatcher3/hatcher3_cover150.jpg" /> "<a class="http" href="http://www.manning.com/hatcher3/">Lucene in Action, Second Edition"</a> by Erik Hatcher, Otis Gospodnetić, and Michael McCandless <span class="anchor" id="line-20"></span></li><li><p class="line891"><img alt="building.search.applications.png" class="attachment" src="/lucene-java/Resources?action=AttachFile&do=get&target=building.search.applications.png" title="building.search.applications.png" /> "<a class="http" href="http://www.amazon.com/Building-Search-Applications-Lucene-Lingpipe/dp/0615204252/">Building Search Applications: Lucene, LingPipe, and Gate</a>" by Manu Konchady; Mustru Publishing; June 2008; ISBN 978-0615204253 <span class="anchor" id="line-21"></span></li><li><p class="line891"><img alt="IntroductionToApacheLucene.jp.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&do=get&target=IntroductionToApacheLucene.jp.jpg" title="IntroductionToApacheLucene.jp.jpg" /> "<a class="http" href="http://www.amazon.co.jp/exec/obidos/ASIN/4774127809/503-9461699-1775907">Apache Lucene 入門 ~Java・オープンソース・全文検索システムの構築</a>" 関口 宏司 ; 技術評論社 ; 2006/05/17 ; ISBN: 4774127809 (<span class="u">Introduction to Apache Lucene: Construction of Java Open Source Full Text Retrieval Systems</span> by Koshi Sekiguti ; Gijutsu-Hyohron Co., Ltd.) <span class="anchor" id="line-22"></span></li><li><p class="line891"><img alt="lia3d.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&do=get&target=lia3d.jpg" title="lia3d.jpg" /> "<a class="http" href="http://www.lucenebook.com">Lucene In Action</a>" by Erik Hatcher, Otis Gospodnetić; Manning Publications; December 2004; ISBN 1932394281 (also available from <a class="http" href="http://www.amazon.com/exec/obidos/ASIN/1932394281">Amazon.com</a>) <span class="anchor" id="line-23"></span></li><li><p class="line891"><img alt="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" class="attachment" src="/lucene-java/Resources?action=AttachFile&do=get&target=SuchmaschinenEntwickelnMitApacheLucene.de.jpg" title="SuchmaschinenEntwickelnMitApacheLucene.de.jpg" /> Manfred Hardt, Dr. Fabian Theis: "<a class="http" href="http://www.amazon.de/Suchmaschinen-entwickeln-mit-Apache-Lucene/dp/3935042450">Suchmaschinen entwickeln mit Apache Lucene</a>"; Software & Support Verlag, Frankfurt/Main, Germany; September 2004; ISBN 3935042450 (<span class="u">Developing Search Engines with Apache Lucene</span>) <span class="anchor" id="line-24"></span><span class="anchor" id="line-25"></span></li></ul><p class="line867">
|
||||
<h1 id="Articles">Articles</h1>
|
||||
<span class="anchor" id="line-26"></span><span class="anchor" id="line-27"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Getting-Started-with-Lucene/">Getting Started with Lucene</a> (by Grant Ingersoll) <br>
|
||||
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-28"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Optimizing-Findability-in-Lucene-and-Solr/">Optimizing Findability in Lucene and Solr</a> (by Grant Ingersoll)<br>
|
||||
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-29"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Debugging-Relevance-Issues-in-Search/">Debugging Relevance Issues in Search</a> (by Grant Ingersoll)<br>
|
||||
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-30"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Scaling-Lucene-and-Solr/">Scaling Lucene and Solr</a> (by Mark Miller)<br>
|
||||
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-31"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Introduction-to-Apache-Lucene-and-Solr/">Introduction to Apache Lucene and Solr</a> (by Marc Krellenstein)<br>
|
||||
(<em>Published: January 2009 - article</em>) <span class="anchor" id="line-32"></span></li><li><p class="line891"><a class="http" href="http://cephas.net/blog/2008/03/30/how-morelikethis-works-in-lucene/">How MoreLikeThis Works in Lucene</a> (by Aaron Johnson)<br>
|
||||
(<em>Last updated: March 2008 - blog entry</em>) <span class="anchor" id="line-33"></span></li><li><p class="line891"><a class="http" href="http://schmidt.devlib.org/software/lucene-wikipedia.html">Lucene Wikipedia indexer</a> (by Marco Schmidt)<br>
|
||||
(<em>Last updated: November 2007 - tutorial</em>) <span class="anchor" id="line-34"></span></li><li><p class="line891"><a class="http" href="http://marceloochoa.blogspot.com/2007/09/running-lucene-inside-your-oracle-jvm.html">Running Lucene inside your Oracle JVM</a> (by Marcelo Ochoa)<br>
|
||||
(<em>Last updated: September 2007 - blog entry</em>) <span class="anchor" id="line-35"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2007/05/24/using-the-lucene-query-parser-without-lucene.html">Using the Lucene Query Parser Without Lucene</a> (by Marcin Maciukiewicz and Daniel Owsiański)<br>
|
||||
(<em>Published: May 2007 - article</em>) <span class="anchor" id="line-36"></span></li><li><p class="line891"><a class="http" href="http://www.javaworld.com/javaworld/jw-09-2006/jw-0925-lucene.html">Integrate advanced search functionalities into your apps</a> (by John Ferguson Smart)<br>
|
||||
(<em>Published: September 2006 - article</em>) <span class="anchor" id="line-37"></span></li><li><p class="line891"><a class="http" href="http://www-128.ibm.com/developerworks/java/library/wa-lucene2/index.html?ca=drs-">Beef up Web search applications with Lucene</a> (by Deng Peng Zhou)<br>
|
||||
(<em>Published: August 2006 - article</em>) <span class="anchor" id="line-38"></span></li><li><p class="line891"><a class="http" href="http://www.freesearch.pe.kr/tag/Lucene">Lecture & Etc : Lucene index file format for Korean</a> (by Jeon Hee-Won)<br>
|
||||
(<em>Published: July 2006 - article</em>) <span class="anchor" id="line-39"></span></li><li>Cai Ziegler: "Suche nach Suche -- Apaches Lucene: eigene Suche und Indizierung"; iX 6/2006, Seite 120; Heise Zeitschriften Verlag, Hannover, Germany <span class="anchor" id="line-40"></span></li><li><p class="line891"><a class="http" href="http://www-128.ibm.com/developerworks/java/library/wa-lucene/index.html">Delve inside the Lucene indexing mechanism</a> (by Deng Peng Zhou)<br>
|
||||
(<em>Published: June 2006 - article</em>) <span class="anchor" id="line-41"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html">Using Lucene to Search Java Source Code</a> (by Renuka Sindhgatta)<br>
|
||||
(<em>Published: January 2006 - article</em>) <span class="anchor" id="line-42"></span></li><li><p class="line891"><a class="http" href="http://www.jroller.com/page/wakaleo/?anchor=lucene_a_tutorial_introduction_to">Lucene : a tutorial introduction to full-text indexing in Java</a> (by John Ferguson Smart)<br>
|
||||
(<em>Published: October 2005 - article</em>) <span class="anchor" id="line-43"></span></li><li>Daniel Naber: "Herr der Suche -- Eigene Anwendungen mit Volltextsuche erweitern"; c't 7/2005, Seite 196; Heise Zeitschriften Verlag, Hannover, Germany <span class="anchor" id="line-44"></span></li><li><p class="line891"><a class="http" href="http://blog.dev.sf.net/index.php?/archives/10-Behind-the-Scenes-of-the-SourceForge.net-Search-System.html">Behind the Scenes of the SourceForge.net Search System</a> (by Chris Conrad)<br>
|
||||
(<em>Last updated: June 2005 - blog entry</em>) <span class="anchor" id="line-45"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2005/08/09/didyoumean.html">Did You Mean: Lucene?</a> (by Tom White)<br>
|
||||
(<em>Published: August 2005 - article</em>) <span class="anchor" id="line-46"></span></li><li><p class="line891"><a class="http" href="http://www.developer.com/java/other/article.php/3490471">Meet Lucene</a> (by Otis Gospodnetić, Eric Hatcher)<br>
|
||||
(<em>Published: March 2005 - article</em>) <span class="anchor" id="line-47"></span></li><li><p class="line891"><a class="http" href="http://www.theserverside.com/tt/articles/article.tss?l=ILoveLucene">I Love Lucene</a> (by Dion Almaer)<br>
|
||||
(<em>Published: January 2005 - article</em>) <span class="anchor" id="line-48"></span></li><li><p class="line891"><a class="http" href="http://javaboutique.internet.com/tutorials/HTMLParser/article.html">Unweaving a Tangled Web With HTMLParser and Lucene</a> (by Keld H. Hansen)<br>
|
||||
(<em>Last updated: October 2004 - tutorial</em>) <span class="anchor" id="line-49"></span></li><li><p class="line891"><a class="http" href="http://bilgidata.com/localhost/bilgidata/yazi.jsp@dosya=a_lucene.xml.html">Lucene Introduction in Turkish</a> Java Bazlı Arama Motoru - Lusin (by Burak Bayramlı)<br>
|
||||
(<em>Last updated: August 2004 - tutorial</em>) <span class="anchor" id="line-50"></span></li><li><p class="line891"><a class="http" href="http://www.chedong.com/tech/lucene.html">Lucene Introduction in Chinese</a> Lucene:基于Java的全文检索引擎简介 (by Che Dong; 作者: 车东)<br>
|
||||
(<em>Last updated: May 2004 - tutorial</em>) <span class="anchor" id="line-51"></span></li><li><p class="line891"><a class="http" href="http://javatechniques.com/public/java/docs/basics/lucene-memory-search.html">Lucene In-Memory Text Search</a> (by Philip Isenhour)<br>
|
||||
(<em>Last updated: May 2004 - tutorial</em>) <span class="anchor" id="line-52"></span></li><li><p class="line891"><a class="http" href="http://www.javaranch.com/newsletter/200404/Lucene.html">The Lucene Search Engine: Adding Search to Your Applications</a> (by Thomas Paul)<br>
|
||||
(<em>Published: April 2004 - article</em>) <span class="anchor" id="line-53"></span></li><li><p class="line891"><a class="http" href="http://www.darksleep.com/lucene/">Lucene Tutorial</a> (by Steven J. Owens)<br>
|
||||
(<em>Last updated: March 2004 - tutorial</em>) <span class="anchor" id="line-54"></span></li><li><p class="line891"><a class="http" href="http://www-igm.univ-mlv.fr/~dr/XPOSE2003/lucene/articleLucene.html">Lucene Introduction in French</a> Exposés Système sur le thème de l'opensource : Analyse de la structure de Lucene. (by Sun Seng TAN)<br>
|
||||
(<em>Last updated: February 2004 - tutorial</em>) <span class="anchor" id="line-55"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">QueryParser Rules</a> (by Erik Hatcher)<br>
|
||||
(<em>Published November 2003 - article</em>) <span class="anchor" id="line-56"></span></li><li><p class="line891"><a class="http" href="http://builder.com.com/5100-6389-5054799.html">Give your Web site its own search engine using Lucene</a> (by Jeffrey Linwood)<br>
|
||||
(<em>Published July 2003 - article</em>) <span class="anchor" id="line-57"></span></li><li><p class="line891"><a class="http" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Intro</a> (by Erik Hatcher)<br>
|
||||
(<em>Published: July 2003 - article</em>) <span class="anchor" id="line-58"></span></li><li><p class="line891"><a class="http" href="http://www-106.ibm.com/developerworks/library/j-lucene/">Parsing, indexing, and searching XML with Digester and Lucene</a> (by Otis Gospodnetić)<br>
|
||||
(<em>Published June 2003 - article</em>) <span class="anchor" id="line-59"></span></li><li><p class="line891"><a class="http" href="http://www.xml.com/pub/a/ws/2003/05/13/email.html">Using Python, Jython, and Lucene to Search Outlook Email</a> (by Jon Udell)<br>
|
||||
(<em>Published: May 2003 - article</em>) <span class="anchor" id="line-60"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2003/03/05/lucene.html">Advanced Text Indexing with Lucene</a> (by Otis Gospodnetić)<br>
|
||||
(<em>Published: March 2003 - article</em>) <span class="anchor" id="line-61"></span></li><li><p class="line891"><a class="http" href="http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html">Introduction to Text Indexing with Apache Jakarta Lucene</a> (by Otis Gospodnetić)<br>
|
||||
(<em>Published: January 2003 - article</em>) <span class="anchor" id="line-62"></span></li><li><p class="line862">Manfred Hardt: "Suchmaschinen entwickeln mit Java und Lucene - Wo war denn noch gleich ... ?"; JavaMagazin 9/2002; Software & Support Verlag, Frankfurt/Main, Germany <span class="anchor" id="line-63"></span></li><li><p class="line891"><a class="http" href="http://javangelist.snipsnap.org/space/Lucene-Mini-Tutorial">Lucene Mini-Tutorial</a> (by funzel)<br>
|
||||
(<em>Last updated: April 2002 - tutorial</em>) <span class="anchor" id="line-64"></span></li><li><p class="line891"><a class="http" href="http://www.javaworld.com/javaworld/jw-09-2000/jw-0915-lucene.html">The Lucene search engine Powerful flexible and free</a> (by Brian Goetz)<br>
|
||||
(<em>Published September 2000 - article</em>) <span class="anchor" id="line-65"></span><span class="anchor" id="line-66"></span></li></ul><p class="line867">
|
||||
<h1 id="Interviews">Interviews</h1>
|
||||
<span class="anchor" id="line-67"></span><span class="anchor" id="line-68"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&task=view&id=109">Interview with Lucene creator Doug Cutting</a> Podcast. Summary: Doug talks about the creation of Lucene, Nutch and Hadoop. (<em>Published January 2009</em>) <span class="anchor" id="line-69"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&task=view&id=108">Interview with Lucene/Solr committer Chris Hostetter</a> Podcast. Summary: Chris talks about Solr, Lucene and their usage at CNET. (<em>Published January 2009</em>) <span class="anchor" id="line-70"></span></li><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/index.php?option=com_content&task=view&id=113">Interview with Lucene/Solr committer Ryan McKinley</a> Podcast. Summary: Ryan discusses Solr, Lucene and geospatial searching with Lucene (<a class="nonexistent" href="/lucene-java/LocalLucene/LocalSolr">LocalLucene/LocalSolr</a>) and his usage of Lucene/Solr throughout his career. (<em>Published January 2009</em>) <span class="anchor" id="line-71"></span><span class="anchor" id="line-72"></span><span class="anchor" id="line-73"></span><span class="anchor" id="line-74"></span></li></ul><p class="line867">
|
||||
<h1 id="Papers">Papers</h1>
|
||||
<span class="anchor" id="line-75"></span><span class="anchor" id="line-76"></span><ul><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/publications.html">http://lucene.sourceforge.net/publications.html</a> Doug Cuttings papers from the old Lucene web site <span class="anchor" id="line-77"></span><span class="anchor" id="line-78"></span></li></ul><p class="line867">
|
||||
<h1 id="Presentations">Presentations</h1>
|
||||
<span class="anchor" id="line-79"></span><ul><li><p class="line891"><a class="http" href="http://people.apache.org/~buschmi/apachecon/AdvancedIndexingLuceneAtlanta07.ppt">Advanced Indexing Techniques with Apache Lucene - Payloads</a> presented by Michael Busch at <a class="http" href="http://www.us.apachecon.com/us2007/">ApacheCon U.S. 2007</a><br>
|
||||
(<em>Presented November 2007 - PDF slide show</em>) <span class="anchor" id="line-80"></span></li><li><p class="line891"><a class="http" href="http://people.apache.org/~yonik/presentations/lucene_intro.pdf">Full-Text Search with Lucene</a> presented by Yonik Seeley at <a class="http" href="http://www.eu.apachecon.com">ApacheCon Europe 2007</a>.<br>
|
||||
(<em>Presented May 2007 - PDF slide show</em>) <span class="anchor" id="line-81"></span></li><li><p class="line891"><a class="http" href="http://www.cnlp.org/presentations/slides/AdvancedLuceneEU.pdf">Advanced Lucene</a> presented by Grant Ingersoll of <a class="http" href="http://www.cnlp.org">CNLP</a> at <a class="http" href="http://www.eu.apachecon.com">ApacheCon Europe 2007</a>. Covers term vectors, query tips and tricks and Lucene performance tuning related to indexing, searching and document retrieval.<br>
|
||||
(<em>Presented May 2007 - PDF slide show</em>) <span class="anchor" id="line-82"></span></li><li><p class="line891"><a class="http" href="http://blogs.atlassian.com/rebelutionary/downloads/tssjs2007-lucene-generic-data-indexing.pdf">Lucene: Generic Data Indexing</a> presented by Mike Cannon-Brookes, CEO, <a class="http" href="http://www.atlassian.com/">Atlassian Software Systems</a> at <a class="http" href="http://javasymposium.techtarget.com/lasvegas/index.html">TSSJS Las Vegas 2007</a>. Covers how Atlassian use Lucene as a generic indexing framework for indexing and finding arbitrary collections of complex objects.<br>
|
||||
(<em>Presented March 2007 - PDF slide show</em>) <span class="anchor" id="line-83"></span></li><li><p class="line891"><a class="http" href="http://www.cnlp.org/apachecon2005/AdvancedLucene.ppt">Advanced Lucene</a> presented by Grant Ingersoll of the <a class="http" href="http://www.cnlp.org">Center for Natural Language Processing</a> at <a class="http" href="http://www.apachecon.com">ApacheCon 2005</a>. Covers term vectors, span queries, using Lucene in a basic question answering system, and several Lucene case studies from <a class="http" href="http://www.cnlp.org">http://www.cnlp.org</a>. The accompanying <a class="http" href="http://www.cnlp.org/apachecon2005">CNLP ApacheCon 2005 Information website</a> contains many working examples using term vectors and span queries. <span class="anchor" id="line-84"></span></li><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/talks/pisa/">Lucene lecture at The University of Pisa</a> (by Doug Cutting)<br>
|
||||
(<em>Presented November 2004 - lecture notes</em>) <span class="anchor" id="line-85"></span></li><li><p class="line891"><a class="http" href="http://conferences.oreillynet.com/presentations/os2003/hatcher_erik_lucene.pdf">Introducing Lucene</a> (by Erik Hatcher)<br>
|
||||
(<em>Presented at OS2003, July 2003 - PDF slide show</em>) <span class="anchor" id="line-86"></span></li><li><p class="line891"><a class="http" href="http://lucene.sourceforge.net/talks/inktomi/">The Lucene Search Engine: Inktomi Seminar</a> (by Doug Cutting)<br>
|
||||
(<em>Presented June, 2000 - seminar notes</em>) <span class="anchor" id="line-87"></span><span class="anchor" id="line-88"></span></li></ul><p class="line867">
|
||||
<h1 id="Training">Training</h1>
|
||||
<span class="anchor" id="line-89"></span><span class="anchor" id="line-90"></span><ul><li><p class="line891"><a class="http" href="http://www.lucidimagination.com/How-We-Can-Help/Training/">http://www.lucidimagination.com/How-We-Can-Help/Training/</a> - Training on Lucene created by Lucene committers and contributors (Grant Ingersoll, Erik Hatcher and the rest of the team at Lucid Imagination). <span class="anchor" id="line-91"></span></li><li><p class="line891"><a class="http" href="http://www.lucenebootcamp.com">Lucene Boot Camp</a> - Training by Lucene committer Grant Ingersoll. Offered exclusively at <a class="http" href="http://www.apachecon.com">ApacheCon</a>. <span class="anchor" id="line-92"></span><span class="anchor" id="line-93"></span></li></ul><p class="line867">
|
||||
<h1 id="Corpora">Corpora</h1>
|
||||
<span class="anchor" id="line-94"></span><ul><li><p class="line862">DMOZ RDF dump - <a class="http" href="http://rdf.dmoz.org/">http://rdf.dmoz.org/</a> <span class="anchor" id="line-95"></span></li><li><p class="line862">CMU newsgroups - <a class="http" href="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html">http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html</a> <span class="anchor" id="line-96"></span></li><li><p class="line862">CMU webpages - <a class="http" href="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/">http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/</a> <span class="anchor" id="line-97"></span></li><li><p class="line862">Reuters - <a class="http" href="http://www.daviddlewis.com/resources/testcollections/reuters21578">http://www.daviddlewis.com/resources/testcollections/reuters21578</a> <span class="anchor" id="line-98"></span></li><li><p class="line862">Enron emails - <a class="http" href="http://www-2.cs.cmu.edu/~enron/">http://www-2.cs.cmu.edu/~enron/</a> <span class="anchor" id="line-99"></span></li><li><p class="line862">JRC-ACQUIS Multilingual Parallel Corpus - <a class="http" href="http://wt.jrc.it/lt/Acquis/">http://wt.jrc.it/lt/Acquis/</a> <span class="anchor" id="line-100"></span><span class="anchor" id="line-101"></span></li></ul><p class="line867">
|
||||
<h1 id="Other">Other</h1>
|
||||
<span class="anchor" id="line-102"></span><ul><li><p class="line891"><a class="http" href="http://www.java201.com/resources/browse/38-all.html">Lucene Resources</a> - Articles, Books, FAQs, Forums, Presentations, Wiki. <span class="anchor" id="line-103"></span></li><li><p class="line891"><a class="http" href="http://www.nabble.com/Web-Search-f2787.html">Lucene Search Forum</a> - hosted by <a class="http" href="http://www.nabble.com">Nabble</a> archiving all Lucene and Nutch mailing lists into a searchable archive/forum. The search is coded using Lucene. <span class="anchor" id="line-104"></span></li><li><p class="line891"><a class="http" href="http://www.lucenetutorial.com">LuceneTutorial.com</a> - Tips and tricks, sample applications, code samples, best practices. <span class="anchor" id="line-105"></span></li></ul><span class="anchor" id="bottom"></span></div><p id="pageinfo" class="info" lang="en" dir="ltr">Resources (last edited 2010-05-03 22:31:43 by <span title="SteveRowe @ ist-h335-d03.syr.edu[128.230.84.100]"><a class="nonexistent" href="/lucene-java/SteveRowe" title="SteveRowe @ ist-h335-d03.syr.edu[128.230.84.100]">SteveRowe</a></span>)</p>
|
||||
|
||||
<div id="pagebottom"></div>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="footer">
|
||||
<ul class="editbar"><li><span class="disabled">Immutable Page</span></li><li class="toggleCommentsButton" style="display:none;"><a href="#" class="nbcomment" onClick="toggleComments();return false;">Comments</a></li><li><a class="nbinfo" href="/lucene-java/Resources?action=info" rel="nofollow">Info</a></li><li>
|
||||
<form class="actionsmenu" method="GET" action="/lucene-java/Resources">
|
||||
<div>
|
||||
<label>More Actions:</label>
|
||||
<select name="action"
|
||||
onchange="if ((this.selectedIndex != 0) &&
|
||||
(this.options[this.selectedIndex].disabled == false)) {
|
||||
this.form.submit();
|
||||
}
|
||||
this.selectedIndex = 0;">
|
||||
<option value="raw">Raw Text</option>
|
||||
<option value="print">Print View</option>
|
||||
<option value="RenderAsDocbook">Render as Docbook</option>
|
||||
<option value="refresh">Delete Cache</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="SpellCheck">Check Spelling</option>
|
||||
<option value="LikePages">Like Pages</option>
|
||||
<option value="LocalSiteMap">Local Site Map</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="RenamePage" disabled class="disabled">Rename Page</option>
|
||||
<option value="CopyPage">Copy Page</option>
|
||||
<option value="DeletePage" disabled class="disabled">Delete Page</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="MyPages">My Pages</option>
|
||||
<option value="show" disabled class="disabled">Subscribe User</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="show" disabled class="disabled">Remove Spam</option>
|
||||
<option value="show" disabled class="disabled">Revert to this revision</option>
|
||||
<option value="show" disabled class="disabled">Package Pages</option>
|
||||
<option value="SyncPages">Sync Pages</option>
|
||||
<option value="show" disabled class="disabled">------------------------</option>
|
||||
<option value="Load">Load</option>
|
||||
<option value="Save">Save</option>
|
||||
</select>
|
||||
<input type="submit" value="Do">
|
||||
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
<!--// Init menu
|
||||
actionsMenuInit('More Actions:');
|
||||
//-->
|
||||
</script>
|
||||
</form>
|
||||
</li></ul>
|
||||
|
||||
<ul id="credits">
|
||||
<li><a href="http://moinmo.in/" title="This site uses the MoinMoin Wiki software.">MoinMoin Powered</a></li><li><a href="http://moinmo.in/Python" title="MoinMoin is written in Python.">Python Powered</a></li><li><a href="http://moinmo.in/GPL" title="MoinMoin is GPL licensed.">GPL licensed</a></li><li><a href="http://validator.w3.org/check?uri=referer" title="Click here to validate this page.">Valid HTML 4.01</a></li>
|
||||
</ul>
|
||||
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
http://www.w3.org/TR/html4/strict.dtd
|
||||
http://lucene.apache.org/java/3_0_1/api/all/overview-summary.html#overview_description
|
||||
http://lucene.apache.org/java/3_0_1/gettingstarted.html
|
||||
http://lucene.grantingersoll.com
|
||||
http://www.lucidimagination.com/blog/
|
||||
http://blog.sematext.com/
|
||||
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
|
||||
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
|
||||
http://www.manning.com/hatcher3/hatcher3_cover150.jpg
|
||||
http://www.manning.com/hatcher3/
|
||||
http://www.amazon.com/Building-Search-Applications-Lucene-Lingpipe/dp/0615204252/
|
||||
http://www.amazon.co.jp/exec/obidos/ASIN/4774127809/503-9461699-1775907
|
||||
http://www.lucenebook.com
|
||||
http://www.amazon.com/exec/obidos/ASIN/1932394281
|
||||
Amazon.com
|
||||
http://www.amazon.de/Suchmaschinen-entwickeln-mit-Apache-Lucene/dp/3935042450
|
||||
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Getting-Started-with-Lucene/
|
||||
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Optimizing-Findability-in-Lucene-and-Solr/
|
||||
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Debugging-Relevance-Issues-in-Search/
|
||||
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Scaling-Lucene-and-Solr/
|
||||
http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Introduction-to-Apache-Lucene-and-Solr/
|
||||
http://cephas.net/blog/2008/03/30/how-morelikethis-works-in-lucene/
|
||||
http://schmidt.devlib.org/software/lucene-wikipedia.html
|
||||
http://marceloochoa.blogspot.com/2007/09/running-lucene-inside-your-oracle-jvm.html
|
||||
http://www.onjava.com/pub/a/onjava/2007/05/24/using-the-lucene-query-parser-without-lucene.html
|
||||
http://www.javaworld.com/javaworld/jw-09-2006/jw-0925-lucene.html
|
||||
http://www-128.ibm.com/developerworks/java/library/wa-lucene2/index.html?ca=drs-
|
||||
http://www.freesearch.pe.kr/tag/Lucene
|
||||
http://www-128.ibm.com/developerworks/java/library/wa-lucene/index.html
|
||||
http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html
|
||||
http://www.jroller.com/page/wakaleo/?anchor=lucene_a_tutorial_introduction_to
|
||||
http://blog.dev.sf.net/index.php?/archives/10-Behind-the-Scenes-of-the-SourceForge.net-Search-System.html
|
||||
SourceForge.net
|
||||
http://today.java.net/pub/a/today/2005/08/09/didyoumean.html
|
||||
http://www.developer.com/java/other/article.php/3490471
|
||||
http://www.theserverside.com/tt/articles/article.tss?l=ILoveLucene
|
||||
http://javaboutique.internet.com/tutorials/HTMLParser/article.html
|
||||
http://bilgidata.com/localhost/bilgidata/yazi.jsp@dosya=a_lucene.xml.html
|
||||
http://www.chedong.com/tech/lucene.html
|
||||
http://javatechniques.com/public/java/docs/basics/lucene-memory-search.html
|
||||
http://www.javaranch.com/newsletter/200404/Lucene.html
|
||||
http://www.darksleep.com/lucene/
|
||||
http://www-igm.univ-mlv.fr/~dr/XPOSE2003/lucene/articleLucene.html
|
||||
http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html
|
||||
http://builder.com.com/5100-6389-5054799.html
|
||||
http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
|
||||
http://www-106.ibm.com/developerworks/library/j-lucene/
|
||||
http://www.xml.com/pub/a/ws/2003/05/13/email.html
|
||||
http://www.onjava.com/pub/a/onjava/2003/03/05/lucene.html
|
||||
http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html
|
||||
http://javangelist.snipsnap.org/space/Lucene-Mini-Tutorial
|
||||
http://www.javaworld.com/javaworld/jw-09-2000/jw-0915-lucene.html
|
||||
http://www.lucidimagination.com/index.php?option=com_content&task=view&id=109
|
||||
http://www.lucidimagination.com/index.php?option=com_content&task=view&id=108
|
||||
http://www.lucidimagination.com/index.php?option=com_content&task=view&id=113
|
||||
http://lucene.sourceforge.net/publications.html
|
||||
http://lucene.sourceforge.net/publications.html
|
||||
http://people.apache.org/~buschmi/apachecon/AdvancedIndexingLuceneAtlanta07.ppt
|
||||
http://www.us.apachecon.com/us2007/
|
||||
http://people.apache.org/~yonik/presentations/lucene_intro.pdf
|
||||
http://www.eu.apachecon.com
|
||||
http://www.cnlp.org/presentations/slides/AdvancedLuceneEU.pdf
|
||||
http://www.cnlp.org
|
||||
http://www.eu.apachecon.com
|
||||
http://blogs.atlassian.com/rebelutionary/downloads/tssjs2007-lucene-generic-data-indexing.pdf
|
||||
http://www.atlassian.com/
|
||||
http://javasymposium.techtarget.com/lasvegas/index.html
|
||||
http://www.cnlp.org/apachecon2005/AdvancedLucene.ppt
|
||||
http://www.cnlp.org
|
||||
http://www.apachecon.com
|
||||
http://www.cnlp.org
|
||||
http://www.cnlp.org
|
||||
http://www.cnlp.org/apachecon2005
|
||||
http://lucene.sourceforge.net/talks/pisa/
|
||||
http://conferences.oreillynet.com/presentations/os2003/hatcher_erik_lucene.pdf
|
||||
http://lucene.sourceforge.net/talks/inktomi/
|
||||
http://www.lucidimagination.com/How-We-Can-Help/Training/
|
||||
http://www.lucidimagination.com/How-We-Can-Help/Training/
|
||||
http://www.lucenebootcamp.com
|
||||
http://www.apachecon.com
|
||||
http://rdf.dmoz.org/
|
||||
http://rdf.dmoz.org/
|
||||
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
|
||||
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
|
||||
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/
|
||||
http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/
|
||||
http://www.daviddlewis.com/resources/testcollections/reuters21578
|
||||
http://www.daviddlewis.com/resources/testcollections/reuters21578
|
||||
http://www-2.cs.cmu.edu/~enron/
|
||||
http://www-2.cs.cmu.edu/~enron/
|
||||
http://wt.jrc.it/lt/Acquis/
|
||||
http://wt.jrc.it/lt/Acquis/
|
||||
http://www.java201.com/resources/browse/38-all.html
|
||||
http://www.nabble.com/Web-Search-f2787.html
|
||||
http://www.nabble.com
|
||||
http://www.lucenetutorial.com
|
||||
LuceneTutorial.com
|
||||
ist-h335-d03.syr.edu
|
||||
128.230.84.100
|
||||
ist-h335-d03.syr.edu
|
||||
128.230.84.100
|
||||
http://moinmo.in/
|
||||
http://moinmo.in/Python
|
||||
http://moinmo.in/GPL
|
||||
http://validator.w3.org/check?uri=referer
|
|
@ -0,0 +1,311 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
* <p/>
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
|
||||
public void testMaxTermLength() throws Exception {
|
||||
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
sa.setMaxTokenLength(5);
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
}
|
||||
|
||||
public void testMaxTermLength2() throws Exception {
|
||||
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
|
||||
sa.setMaxTokenLength(5);
|
||||
|
||||
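// "toolong" exceeds the 5-char limit and is skipped, so "xy" carries a position increment of 2: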
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
|
||||
}
|
||||
|
||||
public void testMaxTermLength3() throws Exception {
|
||||
char[] chars = new char[255];
|
||||
for(int i=0;i<255;i++)
|
||||
chars[i] = 'a';
|
||||
String longTerm = new String(chars, 0, 255);
|
||||
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
}
|
||||
|
||||
public void testAlphanumeric() throws Exception {
|
||||
// alphanumeric tokens
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
|
||||
assertAnalyzesTo(a, "2B", new String[]{"2b"});
|
||||
}
|
||||
|
||||
public void testUnderscores() throws Exception {
|
||||
// underscores are delimiters, but not in email addresses (below)
|
||||
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
|
||||
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
|
||||
}
|
||||
|
||||
public void testDelimiters() throws Exception {
|
||||
// other delimiters: "-", "/", ","
|
||||
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
|
||||
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
|
||||
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
|
||||
}
|
||||
|
||||
public void testApostrophes() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
// possessives are actually removed by StandardFilter, not the tokenizer
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
assertAnalyzesTo(a, "she's", new String[]{"she"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
|
||||
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
|
||||
}
|
||||
|
||||
public void testTSADash() throws Exception {
|
||||
// t and s had been stopwords in Lucene <= 2.0, which made it impossible
|
||||
// to correctly search for these terms:
|
||||
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
|
||||
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
|
||||
// 'a' is still a stopword:
|
||||
assertAnalyzesTo(a, "a-class", new String[]{"class"});
|
||||
}
|
||||
|
||||
public void testCompanyNames() throws Exception {
|
||||
// company names
|
||||
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
|
||||
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
|
||||
}
|
||||
|
||||
public void testLucene1140() throws Exception {
|
||||
try {
|
||||
ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
} catch (NullPointerException e) {
|
||||
fail("Should not throw an NPE and it did");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void testDomainNames() throws Exception {
|
||||
// Current Lucene should not show the bug
|
||||
ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
|
||||
// domain names
|
||||
assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
|
||||
// Note the trailing '.'; see https://issues.apache.org/jira/browse/LUCENE-1068.
|
||||
// the following should be recognized as HOST:
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
|
||||
// 2.3 should show the bug
|
||||
a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||
|
||||
// 2.4 should not show the bug
|
||||
a2 = new ClassicAnalyzer(Version.LUCENE_24);
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
}
|
||||
|
||||
public void testEMailAddresses() throws Exception {
|
||||
// email addresses, possibly with underscores, periods, etc
|
||||
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
|
||||
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
|
||||
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
|
||||
}
|
||||
|
||||
public void testNumeric() throws Exception {
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// every other segment must have at least one digit
|
||||
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
|
||||
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
|
||||
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
|
||||
}
|
||||
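The "every other segment" rule is easiest to see outside the grammar. A rough approximation (a sketch, not the actual JFlex <NUM> production) that accepts the inputs above:

    static boolean looksLikeNum(String s) {
      String[] segments = s.split("[.\\-/]");
      boolean previousHadDigit = false, anyDigit = false;
      for (int i = 0; i < segments.length; i++) {
        boolean hasDigit = false;
        for (int j = 0; j < segments[i].length(); j++) {
          if (Character.isDigit(segments[i].charAt(j))) { hasDigit = true; break; }
        }
        // no two adjacent segments may both lack a digit
        if (i > 0 && !hasDigit && !previousHadDigit) return false;
        anyDigit |= hasDigit;
        previousHadDigit = hasDigit;
      }
      return anyDigit && segments.length > 1;
    }

Under this approximation "a1-b-c3" qualifies (the digit-free "b" sits between two digit-bearing segments) while "some-dashed-phrase" does not.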
|
||||
public void testTextWithNumbers() throws Exception {
|
||||
// numbers
|
||||
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
|
||||
}
|
||||
|
||||
public void testVariousText() throws Exception {
|
||||
// various
|
||||
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
|
||||
}
|
||||
|
||||
public void testAcronyms() throws Exception {
|
||||
// acronyms have their dots stripped
|
||||
assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
|
||||
}
|
||||
|
||||
public void testCPlusPlusHash() throws Exception {
|
||||
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
|
||||
assertAnalyzesTo(a, "C++", new String[]{"c"});
|
||||
assertAnalyzesTo(a, "C#", new String[]{"c"});
|
||||
}
|
||||
|
||||
public void testKorean() throws Exception {
|
||||
// Korean words
|
||||
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
}
|
||||
|
||||
// Compliance with the "old" JavaCC-based analyzer, see:
|
||||
// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
|
||||
|
||||
public void testComplianceFileName() throws Exception {
|
||||
assertAnalyzesTo(a, "2004.jpg",
|
||||
new String[]{"2004.jpg"},
|
||||
new String[]{"<HOST>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericIncorrect() throws Exception {
|
||||
assertAnalyzesTo(a, "62.46",
|
||||
new String[]{"62.46"},
|
||||
new String[]{"<HOST>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericLong() throws Exception {
|
||||
assertAnalyzesTo(a, "978-0-94045043-1",
|
||||
new String[]{"978-0-94045043-1"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericFile() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"78academyawards/rules/rule02.html",
|
||||
new String[]{"78academyawards/rules/rule02.html"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericWithUnderscores() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
|
||||
new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericWithDash() throws Exception {
|
||||
assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceManyTokens() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
|
||||
+ "safari-0-sheikh-zayed-grand-mosque.jpg",
|
||||
new String[]{"money.cnn.com", "magazines", "fortune",
|
||||
"fortune", "archive/2007/03/19/8402357", "index.htm",
|
||||
"safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
|
||||
new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<HOST>"});
|
||||
}
|
||||
|
||||
public void testJava14BWCompatibility() throws Exception {
|
||||
ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
|
||||
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure we skip wicked long terms.
|
||||
*/
|
||||
public void testWickedLongTerm() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));
|
||||
|
||||
char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
|
||||
Arrays.fill(chars, 'x');
|
||||
Document doc = new Document();
|
||||
final String bigTerm = new String(chars);
|
||||
|
||||
// This produces a too-long term:
|
||||
String contents = "abc xyz x" + bigTerm + " another term";
|
||||
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
|
||||
// Make sure we can add another normal document
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir, true);
|
||||
|
||||
// Make sure all terms < max size were indexed
|
||||
assertEquals(2, reader.docFreq(new Term("content", "abc")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "term")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "another")));
|
||||
|
||||
// Make sure position is still incremented when
|
||||
// massive term is skipped:
|
||||
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getDeletedDocs(reader),
|
||||
"content",
|
||||
new BytesRef("another"));
|
||||
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, tps.freq());
|
||||
assertEquals(3, tps.nextPosition());
|
||||
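// i.e. positions were abc=0, xyz=1, skipped big term=2, another=3:
// the too-long term is dropped but still advances the position.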
|
||||
// Make sure the doc that has the massive term is in
|
||||
// the index:
|
||||
assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
|
||||
|
||||
reader.close();
|
||||
|
||||
// Make sure we can add a document with exactly the
|
||||
// maximum length term, and search on that term:
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
|
||||
ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
|
||||
sa.setMaxTokenLength(100000);
|
||||
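// raise the limit (the default is 255) so the MAX_TERM_LENGTH-sized term
// survives tokenization instead of being skipped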
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
reader = IndexReader.open(dir, true);
|
||||
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
|
||||
reader.close();
|
||||
|
||||
dir.close();
|
||||
}
|
||||
}
|
|
@ -1,35 +1,33 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
* <p/>
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -38,277 +36,365 @@ import org.apache.lucene.util.BytesRef;
|
|||
*/
|
||||
|
||||
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
private Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
|
||||
public void testMaxTermLength() throws Exception {
|
||||
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
sa.setMaxTokenLength(5);
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
|
||||
public void testHugeDoc() throws IOException {
|
||||
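// NOTE: 4094 leading spaces presumably push "testing 1234" across the
// scanner's internal buffer boundary (an assumption about intent; the
// patch itself does not say why 4094).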
StringBuilder sb = new StringBuilder();
|
||||
char whitespace[] = new char[4094];
|
||||
Arrays.fill(whitespace, ' ');
|
||||
sb.append(whitespace);
|
||||
sb.append("testing 1234");
|
||||
String input = sb.toString();
|
||||
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
||||
}
|
||||
|
||||
public void testMaxTermLength2() throws Exception {
|
||||
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
|
||||
sa.setMaxTokenLength(5);
|
||||
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
|
||||
}
|
||||
private Analyzer a = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents
|
||||
(String fieldName, Reader reader) {
|
||||
|
||||
public void testMaxTermLength3() throws Exception {
|
||||
char[] chars = new char[255];
|
||||
for(int i=0;i<255;i++)
|
||||
chars[i] = 'a';
|
||||
String longTerm = new String(chars, 0, 255);
|
||||
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
}
|
||||
|
||||
public void testAlphanumeric() throws Exception {
|
||||
// alphanumeric tokens
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
|
||||
assertAnalyzesTo(a, "2B", new String[]{"2b"});
|
||||
}
|
||||
|
||||
public void testUnderscores() throws Exception {
|
||||
// underscores are delimiters, but not in email addresses (below)
|
||||
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
|
||||
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
|
||||
}
|
||||
|
||||
public void testDelimiters() throws Exception {
|
||||
// other delimiters: "-", "/", ","
|
||||
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
|
||||
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
|
||||
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
|
||||
}
|
||||
|
||||
public void testApostrophes() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
// possessives are actually removed by StandardFilter, not the tokenizer
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
assertAnalyzesTo(a, "she's", new String[]{"she"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
|
||||
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
|
||||
}
|
||||
|
||||
public void testTSADash() throws Exception {
|
||||
// t and s had been stopwords in Lucene <= 2.0, which made it impossible
|
||||
// to correctly search for these terms:
|
||||
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
|
||||
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
|
||||
// 'a' is still a stopword:
|
||||
assertAnalyzesTo(a, "a-class", new String[]{"class"});
|
||||
}
|
||||
|
||||
public void testCompanyNames() throws Exception {
|
||||
// company names
|
||||
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
|
||||
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
|
||||
}
|
||||
|
||||
public void testLucene1140() throws Exception {
|
||||
try {
|
||||
StandardAnalyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
} catch (NullPointerException e) {
|
||||
fail("Should not throw an NPE and it did");
|
||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
/** Passes through tokens with type "<URL>" and blocks all other types. */
|
||||
private class URLFilter extends TokenFilter {
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
public URLFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
@Override
|
||||
public final boolean incrementToken() throws java.io.IOException {
|
||||
boolean isTokenAvailable = false;
|
||||
while (input.incrementToken()) {
|
||||
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
|
||||
isTokenAvailable = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isTokenAvailable;
|
||||
}
|
||||
}
|
||||
|
||||
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
|
||||
private class EmailFilter extends TokenFilter {
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
public EmailFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
@Override
|
||||
public final boolean incrementToken() throws java.io.IOException {
|
||||
boolean isTokenAvailable = false;
|
||||
while (input.incrementToken()) {
|
||||
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
|
||||
isTokenAvailable = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isTokenAvailable;
|
||||
}
|
||||
}
|
||||
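A side note on both filters: the == comparison works only because StandardTokenizer sets the type attribute to the same interned TOKEN_TYPES constant that the filter reads back. A filter that must also accept types set elsewhere would compare by value instead, e.g.:

    if (StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL].equals(typeAtt.type())) {
      isTokenAvailable = true;
      break;
    }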
|
||||
public void testDomainNames() throws Exception {
|
||||
// Current lucene should not show the bug
|
||||
StandardAnalyzer a2 = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
|
||||
TokenFilter filter = new URLFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
};
|
||||
|
||||
// domain names
|
||||
assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
|
||||
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
|
||||
// the following should be recognized as HOST:
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
TokenFilter filter = new EmailFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
};
|
||||
|
||||
// 2.3 should show the bug
|
||||
a2 = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||
|
||||
// 2.4 should not show the bug
|
||||
a2 = new StandardAnalyzer(Version.LUCENE_24);
|
||||
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
public void testArmenian() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
|
||||
}
|
||||
|
||||
public void testAmharic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
|
||||
new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
|
||||
}
|
||||
|
||||
public void testArabic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
|
||||
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
||||
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
|
||||
}
|
||||
|
||||
public void testAramaic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
|
||||
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
|
||||
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
|
||||
}
|
||||
|
||||
public void testBengali() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
|
||||
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
|
||||
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
|
||||
}
|
||||
|
||||
public void testFarsi() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
|
||||
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
|
||||
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
|
||||
}
|
||||
|
||||
public void testGreek() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
|
||||
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
|
||||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||
}
|
||||
|
||||
public void testEMailAddresses() throws Exception {
|
||||
// email addresses, possibly with underscores, periods, etc
|
||||
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
|
||||
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
|
||||
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
|
||||
public void testThai() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
||||
new String[] { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" });
|
||||
}
|
||||
|
||||
public void testLao() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
|
||||
new String[] { "ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ" });
|
||||
}
|
||||
|
||||
public void testTibetan() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
|
||||
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
|
||||
"མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
|
||||
"ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
|
||||
}
|
||||
|
||||
/*
|
||||
* For Chinese, tokenize as chars (these can later form bigrams or whatever)
|
||||
*/
|
||||
public void testChinese() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
|
||||
new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
|
||||
}
|
||||
|
||||
public void testEmpty() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
|
||||
}
|
||||
|
||||
/* test various JIRA issues this analyzer is related to */
|
||||
|
||||
public void testLUCENE1545() throws Exception {
|
||||
/*
|
||||
* Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
|
||||
* The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
|
||||
* Expected result is only one token "moͤchte".
|
||||
*/
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
|
||||
}
|
||||
|
||||
/* Tests from StandardAnalyzer, just to show behavior is similar */
|
||||
public void testAlphanumericSA() throws Exception {
|
||||
// alphanumeric tokens
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
|
||||
}
|
||||
|
||||
public void testNumeric() throws Exception {
|
||||
public void testDelimitersSA() throws Exception {
|
||||
// other delimiters: "-", "/", ","
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
|
||||
}
|
||||
|
||||
public void testApostrophesSA() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
|
||||
}
|
||||
|
||||
public void testNumericSA() throws Exception {
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// every other segment must have at least one digit
|
||||
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
|
||||
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
|
||||
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
}
|
||||
|
||||
public void testTextWithNumbers() throws Exception {
|
||||
public void testTextWithNumbersSA() throws Exception {
|
||||
// numbers
|
||||
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
|
||||
}
|
||||
|
||||
public void testVariousText() throws Exception {
|
||||
public void testVariousTextSA() throws Exception {
|
||||
// various
|
||||
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
|
||||
}
|
||||
|
||||
public void testAcronyms() throws Exception {
|
||||
// acronyms have their dots stripped
|
||||
assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
|
||||
}
|
||||
|
||||
public void testCPlusPlusHash() throws Exception {
|
||||
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
|
||||
assertAnalyzesTo(a, "C++", new String[]{"c"});
|
||||
assertAnalyzesTo(a, "C#", new String[]{"c"});
|
||||
}
|
||||
|
||||
public void testKorean() throws Exception {
|
||||
public void testKoreanSA() throws Exception {
|
||||
// Korean words
|
||||
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new int[] {0, 6, 10, 15},
|
||||
new int[] {5, 9, 14, 20});
|
||||
}
|
||||
|
||||
public void testTypes() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
public void testWikiURLs() throws Exception {
|
||||
Reader reader = null;
|
||||
String luceneResourcesWikiPage;
|
||||
try {
|
||||
reader = new InputStreamReader
|
||||
(getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||
builder.append(buffer, 0, numCharsRead);
|
||||
}
|
||||
luceneResourcesWikiPage = builder.toString();
|
||||
} finally {
|
||||
if (null != reader) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != luceneResourcesWikiPage
|
||||
&& luceneResourcesWikiPage.length() > 0);
|
||||
BufferedReader bufferedReader = null;
|
||||
String[] urls;
|
||||
try {
|
||||
List<String> urlList = new ArrayList<String>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
if (line.length() > 0) {
|
||||
urlList.add(line);
|
||||
}
|
||||
}
|
||||
urls = urlList.toArray(new String[urlList.size()]);
|
||||
} finally {
|
||||
if (null != bufferedReader) {
|
||||
bufferedReader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != urls && urls.length > 0);
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||
(urlAnalyzer, luceneResourcesWikiPage, urls);
|
||||
}
|
||||
|
||||
public void testEmails() throws Exception {
|
||||
Reader reader = null;
|
||||
String randomTextWithEmails;
|
||||
try {
|
||||
reader = new InputStreamReader
|
||||
(getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||
builder.append(buffer, 0, numCharsRead);
|
||||
}
|
||||
randomTextWithEmails = builder.toString();
|
||||
} finally {
|
||||
if (null != reader) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != randomTextWithEmails
|
||||
&& randomTextWithEmails.length() > 0);
|
||||
BufferedReader bufferedReader = null;
|
||||
String[] emails;
|
||||
try {
|
||||
List<String> emailList = new ArrayList<String>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
if (line.length() > 0) {
|
||||
emailList.add(line);
|
||||
}
|
||||
}
|
||||
emails = emailList.toArray(new String[emailList.size()]);
|
||||
} finally {
|
||||
if (null != bufferedReader) {
|
||||
bufferedReader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != emails && emails.length > 0);
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||
(emailAnalyzer, randomTextWithEmails, emails);
|
||||
}
|
||||
|
||||
// Compliance with the "old" JavaCC-based analyzer, see:
|
||||
// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
|
||||
|
||||
public void testComplianceFileName() throws Exception {
|
||||
assertAnalyzesTo(a, "2004.jpg",
|
||||
new String[]{"2004.jpg"},
|
||||
new String[]{"<HOST>"});
|
||||
public void testURLs() throws Exception {
|
||||
Reader reader = null;
|
||||
String randomTextWithURLs;
|
||||
try {
|
||||
reader = new InputStreamReader
|
||||
(getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] buffer = new char[1024];
|
||||
int numCharsRead;
|
||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||
builder.append(buffer, 0, numCharsRead);
|
||||
}
|
||||
randomTextWithURLs = builder.toString();
|
||||
} finally {
|
||||
if (null != reader) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != randomTextWithURLs
|
||||
&& randomTextWithURLs.length() > 0);
|
||||
BufferedReader bufferedReader = null;
|
||||
String[] urls;
|
||||
try {
|
||||
List<String> urlList = new ArrayList<String>();
|
||||
bufferedReader = new BufferedReader(new InputStreamReader
|
||||
(getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
|
||||
String line;
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
line = line.trim();
|
||||
if (line.length() > 0) {
|
||||
urlList.add(line);
|
||||
}
|
||||
}
|
||||
urls = urlList.toArray(new String[urlList.size()]);
|
||||
} finally {
|
||||
if (null != bufferedReader) {
|
||||
bufferedReader.close();
|
||||
}
|
||||
}
|
||||
assertTrue(null != urls && urls.length > 0);
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||
(urlAnalyzer, randomTextWithURLs, urls);
|
||||
}
|
||||
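The three resource-reading blocks above are identical; a hypothetical helper (name and placement illustrative, not part of this patch) would shrink each test to a few lines:

    private String readResourceAsString(String name) throws IOException {
      Reader reader = new InputStreamReader
        (getClass().getResourceAsStream(name), "UTF-8");
      try {
        StringBuilder builder = new StringBuilder();
        char[] buffer = new char[1024];
        int numCharsRead;
        while (-1 != (numCharsRead = reader.read(buffer))) {
          builder.append(buffer, 0, numCharsRead);
        }
        return builder.toString();
      } finally {
        reader.close();
      }
    }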
|
||||
public void testComplianceNumericIncorrect() throws Exception {
|
||||
assertAnalyzesTo(a, "62.46",
|
||||
new String[]{"62.46"},
|
||||
new String[]{"<HOST>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericLong() throws Exception {
|
||||
assertAnalyzesTo(a, "978-0-94045043-1",
|
||||
new String[]{"978-0-94045043-1"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericFile() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"78academyawards/rules/rule02.html",
|
||||
new String[]{"78academyawards/rules/rule02.html"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericWithUnderscores() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
|
||||
new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceNumericWithDash() throws Exception {
|
||||
assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
|
||||
new String[]{"<NUM>"});
|
||||
}
|
||||
|
||||
public void testComplianceManyTokens() throws Exception {
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
|
||||
+ "safari-0-sheikh-zayed-grand-mosque.jpg",
|
||||
new String[]{"money.cnn.com", "magazines", "fortune",
|
||||
"fortune", "archive/2007/03/19/8402357", "index.htm",
|
||||
"safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
|
||||
new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<HOST>"});
|
||||
}
|
||||
|
||||
public void testJava14BWCompatibility() throws Exception {
|
||||
StandardAnalyzer sa = new StandardAnalyzer(Version.LUCENE_30);
|
||||
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
|
||||
sa = new StandardAnalyzer(Version.LUCENE_31);
|
||||
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test\u02C6test" });
|
||||
}
|
||||
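The version split above hinges on U+02C6 (MODIFIER LETTER CIRCUMFLEX ACCENT): its general category is Lm, which the UAX#29 word-break rules treat as part of a word (ALetter), so the 3.1 grammar keeps "test\u02C6test" whole where the 3.0 grammar split on it. A quick check of the category claim:

    assertEquals(Character.MODIFIER_LETTER, Character.getType('\u02C6'));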
|
||||
/**
|
||||
* Make sure we skip wicked long terms.
|
||||
*/
|
||||
public void testWickedLongTerm() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
|
||||
|
||||
char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
|
||||
Arrays.fill(chars, 'x');
|
||||
Document doc = new Document();
|
||||
final String bigTerm = new String(chars);
|
||||
|
||||
// This produces a too-long term:
|
||||
String contents = "abc xyz x" + bigTerm + " another term";
|
||||
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
|
||||
// Make sure we can add another normal document
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir, true);
|
||||
|
||||
// Make sure all terms < max size were indexed
|
||||
assertEquals(2, reader.docFreq(new Term("content", "abc")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "term")));
|
||||
assertEquals(1, reader.docFreq(new Term("content", "another")));
|
||||
|
||||
// Make sure position is still incremented when
|
||||
// massive term is skipped:
|
||||
DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
|
||||
MultiFields.getDeletedDocs(reader),
|
||||
"content",
|
||||
new BytesRef("another"));
|
||||
assertTrue(tps.nextDoc() != DocsEnum.NO_MORE_DOCS);
|
||||
assertEquals(1, tps.freq());
|
||||
assertEquals(3, tps.nextPosition());
|
||||
|
||||
// Make sure the doc that has the massive term is in
|
||||
// the index:
|
||||
assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
|
||||
|
||||
reader.close();
|
||||
|
||||
// Make sure we can add a document with exactly the
|
||||
// maximum length term, and search on that term:
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
|
||||
StandardAnalyzer sa = new StandardAnalyzer(TEST_VERSION_CURRENT);
|
||||
sa.setMaxTokenLength(100000);
|
||||
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
reader = IndexReader.open(dir, true);
|
||||
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
|
||||
reader.close();
|
||||
|
||||
dir.close();
|
||||
public void testUnicodeWordBreaks() throws Exception {
|
||||
WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
|
||||
wordBreakTest.test(a);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,204 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testHugeDoc() throws IOException {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
char whitespace[] = new char[4094];
|
||||
Arrays.fill(whitespace, ' ');
|
||||
sb.append(whitespace);
|
||||
sb.append("testing 1234");
|
||||
String input = sb.toString();
|
||||
UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
|
||||
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
||||
}
|
||||
|
||||
private Analyzer a = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents
|
||||
(String fieldName, Reader reader) {
|
||||
|
||||
Tokenizer tokenizer = new UAX29Tokenizer(reader);
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
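For readers trying the new tokenizer outside the test harness, a minimal usage sketch (assuming trunk's CharTermAttribute and the reset/end/close consumer workflow):

    Tokenizer t = new UAX29Tokenizer(new StringReader("foo bar"));
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.println(termAtt.toString()); // "foo", then "bar"
    }
    t.end();
    t.close();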
|
||||
public void testArmenian() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
|
||||
}
|
||||
|
||||
public void testAmharic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
|
||||
new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
|
||||
}
|
||||
|
||||
public void testArabic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
|
||||
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
||||
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
|
||||
}
|
||||
|
||||
public void testAramaic() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
|
||||
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
|
||||
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
|
||||
}
|
||||
|
||||
public void testBengali() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
|
||||
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
|
||||
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
|
||||
}
|
||||
|
||||
public void testFarsi() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
|
||||
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
|
||||
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
|
||||
}
|
||||
|
||||
public void testGreek() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
|
||||
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
|
||||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||
}
|
||||
|
||||
public void testThai() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
||||
new String[] { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" });
|
||||
}
|
||||
|
||||
public void testLao() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
|
||||
new String[] { "ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ" });
|
||||
}
|
||||
|
||||
public void testTibetan() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
|
||||
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
|
||||
"མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
|
||||
"ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
|
||||
}
|
||||
|
||||
/*
|
||||
* For Chinese, tokenize as chars (these can later form bigrams or whatever)
|
||||
*/
|
||||
public void testChinese() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
|
||||
new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
|
||||
}
|
||||
|
||||
public void testEmpty() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
|
||||
}
|
||||
|
||||
/* test various JIRA issues this analyzer is related to */
|
||||
|
||||
public void testLUCENE1545() throws Exception {
|
||||
/*
|
||||
* Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
|
||||
* The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
|
||||
* Expected result is only one token "moͤchte".
|
||||
*/
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
|
||||
}
|
||||
|
||||
/* Tests from StandardAnalyzer, just to show behavior is similar */
|
||||
public void testAlphanumericSA() throws Exception {
|
||||
// alphanumeric tokens
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
|
||||
}
|
||||
|
||||
public void testDelimitersSA() throws Exception {
|
||||
// other delimiters: "-", "/", ","
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
|
||||
}
|
||||
|
||||
public void testApostrophesSA() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
|
||||
}
|
||||
|
||||
public void testNumericSA() throws Exception {
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
}
|
||||
|
||||
public void testTextWithNumbersSA() throws Exception {
|
||||
// numbers
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
|
||||
}
|
||||
|
||||
public void testVariousTextSA() throws Exception {
|
||||
// various
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
|
||||
}
|
||||
|
||||
public void testKoreanSA() throws Exception {
|
||||
// Korean words
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new int[] {0, 6, 10, 15},
|
||||
new int[] {5, 9, 14, 20});
|
||||
}
|
||||
|
||||
public void testTypes() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
public void testUnicodeWordBreaks() throws Exception {
|
||||
WordBreakTestUnicode_5_2_0 wordBreakTest = new WordBreakTestUnicode_5_2_0();
|
||||
wordBreakTest.test(a);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
|
@ -0,0 +1,265 @@
|
|||
dJ8ngFi@avz13m.CC
|
||||
JCAVLRJg@3aqiq2yui.gm
|
||||
kU-l6DS@[082.015.228.189]
|
||||
37layCJS@j5NVP7NWAY.VG
|
||||
"%U@?\B"@Fl2d.md
|
||||
aH3QW@tw8uo2.eu
|
||||
Bvd#@tupjv.sn
|
||||
SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt
|
||||
DvdUJk@61zwkit7dkd3rcq4v.BD
|
||||
~+Kdz@3mousnl.SE
|
||||
C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY
|
||||
}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM
|
||||
lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae
|
||||
V85E9Hx7@vpf0bs.bz
|
||||
MGBg2@7F3MJTCCPROS8YETM0B4-C9P7WXKGFB0.RU
|
||||
rsBWOCJ@lYX0SILY4L53Z3VJPSF6.pwrawr.vdpoq.nz
|
||||
dIyLrU@9A40T2ZIG7H8R.t63.tv
|
||||
6dAsZKz@d33XR.IR
|
||||
EnqCC@2bk6da6y08.LI
|
||||
AQ9yV@Mfqq32nexufgxzl4o7q5jv3kd.lb
|
||||
lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H
|
||||
b6/zomNkV@8jwm-he.IN
|
||||
5FLuakz.hXVkuqDt@iBFP83V6MNI3N0FRWJ9302DS-0KHRV6O.1bf59kj64uj5b6e2zfn.cm
|
||||
RhIwkU@58vmet9yfddpg.3adkmhrv1px.AO
|
||||
nEBk6w2Q@Bb5ib.2pay.so
|
||||
AlW5CMAn@qos-53u.j91qq96d4en129szf7099kxv5lo6yo.gm
|
||||
QPYBDV3.Ah/h8U@x3v444pzi.1cvgokam.PW
|
||||
5Iwbiq7@p9s-2pixps9jwzyhfroxqivw8sv90r.xn--wgbh1c
|
||||
AaFU9L@3yj1xqf1.cz9.ac
|
||||
|iCmQ1@rum6w0a7wt.3QLD.ht71.cx
|
||||
EhLTUjo@rEK.sJ44H0.GR
|
||||
bHEbq3Rp@33.lKSSMY.9xaurtfle9xe.iu4810l.fj
|
||||
eFcup.cPPEW@[1ae]
|
||||
p907@bk3o.fvtmw2m2.Uutr83x2yt4.2nuin.EU
|
||||
PpW2L5.QgP2n@9rz7.a5qi.oRH1Z.8ov.UZ
|
||||
o8UgG5fewm4vr9Ai5wPS@sgh.2F-OLKLZ81DIUET.xpya0vtx.fj
|
||||
aixQH@z-y.AR
|
||||
jVTeWQfL."M#~t Q"@1e.oglq.ubk.SZ
|
||||
6e5QQuy@N7.2cuw3x2wpddf.paycp1pc.AI
|
||||
IqG6Fl@[220.112.120.54]
|
||||
lWHH4eWSn@tbxyb7.jhzqxrk.lv
|
||||
P1zO*RaAr@[111.99.108.22]
|
||||
d00gy@[4TC]
|
||||
1yNINoBU@[136.003.010.238]
|
||||
Ms8ox@[_3Tuehr]
|
||||
wtWDNo@1sjmcbbli196-765mt7m8o8hywft.7-ga6rsnum8v.np
|
||||
"x)yO"@7le5o2rcud5ngs.Qmfmq.Jfxv8.Zznv6t6il.MIL
|
||||
1hXd@f8.1kxqd3yw4j6zmb7l7.US
|
||||
"8}(\$"@mu2viak0nh4sj5ivgpy1wqie.HK
|
||||
Th7XoAs5@ggdb.BI
|
||||
5iDbhah.xdtF1x@[59.55.12.243]
|
||||
j2ovALlgm2Wcwx@5jphzt.TN
|
||||
ZlaP~E.4Yk1K0F@lF6VN.M5.Nj.PRO
|
||||
cFCvIJAw@l93H0R1W6V4RI0AY7RLRQR4KOEVQPEG-PDTF03V4D9A0.xZZK5.lu
|
||||
8Ju2AW@1n.h7.vu
|
||||
"\nkP]{"@[Vej\yo\HD]
|
||||
fKWC?@qgcb.xn--mgbaam7a8h
|
||||
L4BbaB@hv1.BIZ
|
||||
WvSmV@qpx15vzmbtxzvi-syndl1.ML
|
||||
"3|PX~Cbdq"@U3vp-7k.8c4q3sgpwt6sochundzhx.museum
|
||||
LjH9rJTu@tkm.gy
|
||||
vQgXEFb@maxmrbk-5a5s6o.6MZZ6IK.awjbtiva7.IL
|
||||
6TVbIA@r50eh-a.la
|
||||
AaASl@Bsteea.qHXE3Q5CUJ3DBG.S2hvnld.4WJWL.fk
|
||||
"CN;\-z 6M"@86.qc7s.23p.ET
|
||||
zX3=O3o@Yjov.7g660.8M88OJGTDC5.np
|
||||
QFZlK1A@4W47EIXE.KY
|
||||
1guLnQb07k@ab.ccemuif2s.lb
|
||||
Jddxj@[111.079.109.147]
|
||||
Hj06gcE@[105.233.192.168]
|
||||
u8?xicQ@[i\21I]
|
||||
CczYer}W@bezu6wtys9s.lft3z.mobi
|
||||
OmpYhIL@6GJ7P29EIE-G63RDW7GLFLFC0M1.AERO
|
||||
2RRPLqO@8lh0i.vm7xmvvo-r5nf0x.CY
|
||||
TOc!BhbKz@F-myy7.kQWSUI7S3.net
|
||||
"0\!P?".shQVdSerA@2qmqj8ul.hm
|
||||
LTLNFsgB@[191.56.104.113]
|
||||
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
|
||||
VGLn@z3E2.3an2.MM
|
||||
TWmfsxn@[112.192.017.029]
|
||||
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
|
||||
CjaPC63@['\RDrwk]
|
||||
Ayydpdoa@tdgypppmen.wf
|
||||
"gfKP9"@jo3-r0.mz
|
||||
aTMgDW4@t5gax.XN--0ZWM56D
|
||||
mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
|
||||
NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
|
||||
XtAhFnq@[218.214.251.103]
|
||||
x0S8uos@[109.82.126.233]
|
||||
ALB4KFavj16pODdd@i206d6s.MM
|
||||
grxIt96.46nCf@nokjogh2l4.nCMWXG.yt
|
||||
Fgbh7@2rxkk0bvkk-v3evd-sh56gvhxlh.hhjcsg36j8qt98okjbdj9z574xdpix59zf6h80r.Gyb4rrxu.ve
|
||||
uo0AX41@Fhlegm1z57j-qvf5.p8jo6zvm.sc
|
||||
sjn4cz@9ktlwkqte.bv
|
||||
b04v0Ct@[243.230.224.190]
|
||||
F!FUbQHU@uvz7cu1l.ciz4h2.93U4V.gb
|
||||
6CHec@nONUKT.nl
|
||||
zbmZiXw@yb.bxxp.3fm457.va
|
||||
"/GdiZ7f"@[221.229.46.3]
|
||||
NJde8Li@f7a.g51VICBH.cy
|
||||
6IeAft@e-3fp.Nkh7nm8.v8i47xvrv27r.pf
|
||||
TC*Qopzb@xIOB3.6egz4.m-24t5wmxtmco4iy8g91o66mjgha1vjlepyffott.E5ta.p9.CF
|
||||
"_3Sc_"@[193.165.124.143]
|
||||
W0dwHf@[25.174.65.80]
|
||||
qPkkP0@4k0vs.oaak2z.3JMTI.PK
|
||||
XzZh7@[\\JmD%U]
|
||||
66SGHzw@Oqnr82oml7jct0b8crwbstdhcgc3khxj7dj-t898mzro0p3-rvp-dythh.TN
|
||||
ot4tPF@[AY\j]
|
||||
e4seIFbl@cib.cg
|
||||
B2w025e@r2H7BW16B24DG1S5DED.bg
|
||||
atweEde@blk-3y.mgvoh6l9my.F6.FI
|
||||
uDoPcRGW@rEBD5LUT.ly
|
||||
2KQhx@Bba.u--9b5bc0.NF
|
||||
tKWc2VjVRYD@[254.190.162.128]
|
||||
wc3W16^@D3v2uxqqeclz.w1fd529m.DM
|
||||
Njg@6S8MA.HK
|
||||
"L\^4z]92"@0qp--walx.MIL
|
||||
X08sWFD@62GNK.tN4.f1YXX.ug
|
||||
eK6Bz1Bu@[rX;J&036]
|
||||
"~`o\:"@hO4UKF.oZBWV56B.cmn.DJ
|
||||
lcgUakx@[pjGd&i2]
|
||||
BqdBTnv3c@wf35nwaza.ME
|
||||
"a#Um{:\'\bX:"@in7tjo.uw8wil.gp
|
||||
ApIbER8'@[&Y]
|
||||
JTsM0c!s9CzEH@Sd.mh
|
||||
hy2AOUc@uqxzl7v0hl2nchokqit9lyscxaa0jaqya1wek5gkd.NC
|
||||
pY7bAVD4r@[,>T*R T]
|
||||
!0axBT@03-gdh1xmk3x9.GH
|
||||
vbtyQBZI@20al5g.ro6ds4.Bsg15f5.NU
|
||||
2^ZhSK-FFYOh@Z2iku.rg.Z0ca1.gs
|
||||
G1RLpOn."yfJpg["@mXEV8.mu
|
||||
yrBKNkq@a2a1.Aifn.Ta2.dj
|
||||
Wok5G@b5aqobvi5.ni
|
||||
nXz9i.=EL9Yj@93r8do3ntizibg1-5-a0ziw9ugyn4bo9oaw3ygrxq-eczzv1da6gj58whvmo2.rs
|
||||
Dp63hd@B1kbahyq.PL
|
||||
y01rn27SFq@o0HNP8.C5.i4rvj8j338zgter7er5rkwyo5g.atnc0iuj2ke.8or6ekq0x.IO
|
||||
0RiEo@08mnvbu.p661ernzjz5p7nbyix5iuj.cig5hgvcc.SO
|
||||
Dwxab5@1sx5y3-umsy72nl.74lwye5.DJ
|
||||
IvdZVE4xRk@0vw7ajl.AR
|
||||
CvQxhXJ@d5a7qnx.ke
|
||||
n7MxA4~@[4(R]
|
||||
RFGzu3hD0@wbh4.sm
|
||||
eOADW}BcNG@2568p3b4v.Xq3eksr.GP
|
||||
AsAMWriW7.zSDQSAR6@Gg2q4rtgr.GG
|
||||
cDCVlA0t@[20.116.229.216]
|
||||
c=yJU+3L5@n2x3xhksf.gvreani.MZ
|
||||
wfYnaA4@lzojy.4oii6w6sn-p9.kh
|
||||
kdeOQ5F@vD5Y.wmmv.7rswz.1zelobcp5qxxwzjn.fOEJZ.KM
|
||||
ppULqb2Z@Hv9o2ui.AO
|
||||
tOHw@[IPv6:3500:8B6C::CB5E:1.124.160.137]
|
||||
MWLVsL@7nhliy.O8mjon3rj-kb.t8d6bcpa5i.au
|
||||
BN0EY@hh9v.p9bwgs.TN
|
||||
RgiAp@d9ln.bf
|
||||
PBugBo@97gcz.DJ
|
||||
Fh#dKzbI@[+_]
|
||||
wyqU-C9hXE@wPRBUI-WS9HXE19.LV
|
||||
muC?Js@[IPv6:47FB:5786:4b5e::5675]
|
||||
yLTT2xV@wdoszw9k1ork-z-t.kq.l3SEO.Lb4jx0.NA
|
||||
6zqw.yPV4LkL@dA3XKC.eg
|
||||
S5z9i7i3s@Vzt6.fr
|
||||
L|Sit6s@9cklii1.tf
|
||||
yWYqz@mw-9k.FJ
|
||||
Knhj419mAfftf@R26hxll64.3qtdx6g.AL
|
||||
aZYHUr6@Shyn76c67.65grky.am
|
||||
ZYxn6Px@di0cqhtg.hu
|
||||
"#mLl"@w1sc0g3vm.j1o4o9g.GW
|
||||
WYJcFp@653xk-89oprk2im.iemhx9.CC
|
||||
y5AXi@[Oa #]
|
||||
nZErAGj@6sq3-p.r8KQ.aero
|
||||
OMq5sBK@udg-5zp1.Dory85.SG
|
||||
2bymd@Ojla1hvfpw8rrihrx.cy
|
||||
5OMbw0@r2d8cn75.1VR2BJ0J3A8PY.gc0mljc-h.COOP
|
||||
al6X^pQkx@pyj--2hp.lbet.TN
|
||||
NkzPW4f@2-0.aaoqccwrgi4olytac0imp6vvphsuobrr115eygh2xwkvzeuj.tl
|
||||
"4-b9|/,\e]h]2"@9-iiahsdlzv-v65j.FK
|
||||
g8Pv2hb9@[166.176.68.63]
|
||||
"IA~".Tn03w7@[\>J?]
|
||||
E6aK9TaJ@j0hydmxhkq2q.Svku4saky.MU
|
||||
rdF2Zl1@9fsic.C17pw9o0.vn
|
||||
pCKjPa88DG&x5a@4ha07ia2jk.xk7xe8.PM
|
||||
qgLb5m@nynqp.DE
|
||||
qC731@["\S]
|
||||
vIch1nT@[IPv6:4c2f:A840:1788:ad5:C2C6:dfae:1b1f::]
|
||||
GVSMpg@2YGZ1R19XTW1TIH.Re3vg30u1xq6v7cj1wf-6m14939wvgqbl.93mztd.SG
|
||||
0jq4v7PMxm@eq6teog.kO6LR3.x2p.53yltrsvgpd3.RO
|
||||
zdGLZD0P@i2JQNM8.816oja8pkk5zkvyx.KM
|
||||
Jp#hSH@74zkerax4.31kr.7c9-yuk.mp
|
||||
Kx^0oZn@oFFA-URZ13B34J.DK
|
||||
sub52@aoq7.iHF.CH
|
||||
jfVSq9oAR2D@iGU0.7bp3x.4cr.sz
|
||||
nalgU@Yfpbdcv8a5.n9kwz6kyi2u.thic-rws.af.TG
|
||||
=uC5qVT@56g530cltpekrw.pt
|
||||
QR5&kx@7qhi3bhav5ga0eva.b0sdom.bb
|
||||
8DZQ7@dtr16r89fdw59q.cf
|
||||
Q4pNw@6o-9weojl3r7.LS
|
||||
*mfOc_CN@[G\3]
|
||||
2p`tbG@c767inolrav0hg6a-ucs.y0.tw
|
||||
Rop{cgBy@Wekdh0xns2um.UK
|
||||
t*p05lV@017y.MR
|
||||
7ZxO80@Dovepwr4l.qxfzchrn1.es8ul0vavi6gqy82.K1hc7.INT
|
||||
C_Iphp@5t4rtc.id
|
||||
q+m2x@Cfw.1tm52-kr.BO
|
||||
47NIL@Hl68os0.66l9bsf2q.SC
|
||||
vi0LyF9O@p74jz6mxby.it
|
||||
xQ4jU@rQVWLWAD3T8.4-lnu.AZ
|
||||
zea_0Kr@[97.59.144.249]
|
||||
5HP1k|s@[068.150.236.123]
|
||||
5XJZlmYk.3Du5qee@[072.023.197.244]
|
||||
AvNrIHB0@[+n}oV]
|
||||
"!N7/I\zhh"@[204.037.067.146]
|
||||
vlJODxFF@xFO6V.i1.fgad6bjy.NO
|
||||
qDe0FA@xpp1le82ndircjgyrxyzkrqu3il.oUKHVV6829P-16JILWG62KN.cr
|
||||
pMF64@wssq6kh9uhxk.cA2YZVBV4JW.xX585A.ru
|
||||
G3meE@[^!'OO]
|
||||
"1@0UYJl"@vplkx.d2n.i3tcx3aaxut.lbb3v9.ldq.me
|
||||
iTH0QND@wg9sizy.lr
|
||||
9kF?opSTo9rSDWLo&W&6@xrh32ibf.F0zb6kb.BJ
|
||||
a0FI1m@1olkdpz.W70a3w8qmk3.NA
|
||||
"0H}r}X(p\M`/x"@rY48LPH.Axy.Ue624.TV
|
||||
AQL6YBFb@Hxawb15okz.y4.y5c0e.bt
|
||||
PEaNVR@m8NH9BVX5L096DRM7YTR.er
|
||||
diI`Q@i5fpkuc.7zg2av.D6tzqq.CK
|
||||
TCN0-Z@Tezeq9ejv.ekeab8hz14hui.il
|
||||
05SnFh@jZ85JXZ.1RO99W5FYK3.uyv7g15.MP
|
||||
B2Z76Rn@9yce0shfsydxetu1v4-y.rBU2M0.6ik8oapv0zho6n653il25gu4rd216uw03.MG
|
||||
vGZ2K@C2osgjtel5uerwn.riihbabhh41ve84.r3l.vH6S64.vn
|
||||
Nv2ZgL@[037.054.177.155]
|
||||
WsdI2W@i1ULFQ1.79qfph2.eg
|
||||
vJfpTf3@Hh4x2h.25m0idq3.fr
|
||||
oRqbgftr@l6jg0.TV
|
||||
NiynsKb@k9BTX4-FV.hc0skm-o.lv
|
||||
w9uGwf@4hop8.Jb9655is.nr
|
||||
"NVUW+"@6jbe.KM
|
||||
QusHU6JMR@0RXKIZNH76C3.Oqwcfr779e.MH
|
||||
}C5IwKv1S45vlmPaaVHhF@[IPv6:EBF6::]
|
||||
T7rXlYc@4AI1LM.2o.uk
|
||||
uuCiDC6c@Maar3.65hlg-wf.t3pt9.FJ
|
||||
w2mNOvIUh@dx3ep7ew.ru
|
||||
b#Add@9hpopo.Xg3tbjchdpt.TT
|
||||
NtrgJjfj."NBwi"@[142.085.096.018]
|
||||
00lF9UB@2NR2.rs
|
||||
MPr42ye9@p08lcrzs.4bzxfznsh2bhgsa.CX
|
||||
awwLoYLn~c2LfTEVT@fwksx.qoj94r11kw19k50k3.gd
|
||||
gRZ5w9epm@p6adico3auugj5qklec.Sm4bx5.li
|
||||
zfdZ67Y@1azhq.dl3xxzni2.rrj.lpclc6g4d.sl
|
||||
vTWwSD4fb@uBSOHD.3g.u3mb.gf
|
||||
cYFVxcC6E@F9g0b.n1339r.AU
|
||||
pnuXl@s1alo2.tc
|
||||
lKy64zp.Cbg8BM@y0S.6uiux8h8.0udipt.ma
|
||||
|9FDgc@vbrz.3L.av4kmt.rs
|
||||
skcHAu7@xD715N1.DZ
|
||||
BfcgHK3@[220.136.9.224]
|
||||
LCOEag@Gwm.drsa0.GL
|
||||
qrNZtp3vO@a0gr.8j9cvcgy0p-3.HN
|
||||
lfW2rei20XWSmpQoPY1Dl@[(N&c]
|
||||
WFBBEv|@q7R2J.oy48740.pm
|
||||
6H6rPx@zVJ40.xgyat.cLUX6SVFJWMLF9EZ2PL8QQEU7U1WT0JW3QR8898ALFGKO18CF1DOX89DR.1tfu30mp.CA
|
||||
ytG@J4auwv4has.PS
|
||||
"X;+N1A\A "@rc9cln0xyy8wa6axedojj9r0slj0v.Luy9i6ipqrz74lm5-n6f1-2srq5vdo-opef747ubdykv5hc.2lztpe.er
|
||||
DQTmqL4LVRUvuvoNb8=TT@2up3.PY
|
||||
NC0OPLz@kcru1s0mu.name
|
||||
kBoJf{XaGl@[248.166.223.221]
|
||||
pEjZPm8A@v956Y7GQV.5uu6.Ribgf20u.6e.0do1nki1t.ahy.6iy.sm
|
||||
pIFWkl2@w9N0Q.MC
|
||||
p=VTtlpC@w3ttqb.FO
|
|
@ -0,0 +1,206 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use File::Spec;
|
||||
use Getopt::Long;
|
||||
use LWP::UserAgent;
use HTTP::Request;  # constructed directly in get_URL_content()
|
||||
|
||||
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
|
||||
|
||||
my $version = '';
|
||||
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
|
||||
print STDERR "Usage: $script_name -v <version>\n";
|
||||
print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
|
||||
if ($version);
|
||||
exit 1;
|
||||
}
|
||||
my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
|
||||
my $scripts_url = "${url_prefix}/Scripts.txt";
|
||||
my $line_break_url = "${url_prefix}/LineBreak.txt";
|
||||
my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
|
||||
my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
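# For example, invoking the script with -v 5.2.0 (the version cited in the
# usage message above) resolves these to:
#   http://www.unicode.org/Public/5.2.0/ucd/Scripts.txt
#   http://www.unicode.org/Public/5.2.0/ucd/LineBreak.txt
#   http://www.unicode.org/Public/5.2.0/ucd/auxiliary/WordBreakProperty.txt
#   http://www.unicode.org/Public/5.2.0/ucd/auxiliary/WordBreakTest.txt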
|
||||
my $underscore_version = $version;
|
||||
$underscore_version =~ s/\./_/g;
|
||||
my $class_name = "WordBreakTestUnicode_${underscore_version}";
|
||||
my $output_filename = "${class_name}.java";
|
||||
my $header =<<"__HEADER__";
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
/**
|
||||
* This class was automatically generated by ${script_name}
|
||||
* from: ${url_prefix}/auxiliary/WordBreakTest.txt
|
||||
*
|
||||
* WordBreakTest.txt indicates the points in the provided character sequences
|
||||
* at which conforming implementations must and must not break words. This
|
||||
* class tests for expected token extraction from each of the test sequences
|
||||
* in WordBreakTest.txt, where the expected tokens are those character
|
||||
* sequences bounded by word breaks and containing at least one character
|
||||
* from one of the following character sets:
|
||||
*
|
||||
* \\p{Script = Han} (From $scripts_url)
|
||||
* \\p{Script = Hiragana}
|
||||
* \\p{LineBreak = Complex_Context} (From $line_break_url)
|
||||
* \\p{WordBreak = ALetter} (From $word_break_url)
|
||||
* \\p{WordBreak = Katakana}
|
||||
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
|
||||
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
|
||||
*/
|
||||
public class ${class_name} extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test(Analyzer analyzer) throws Exception {
|
||||
__HEADER__
|
||||
|
||||
my $codepoints = [];
|
||||
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19); # Full-width Arabic digits, excluded from \p{WordBreak = Numeric}
|
||||
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
|
||||
# Using lowercase versions of property value names to allow for case-
|
||||
# insensitive comparison with the names in the Unicode data files.
|
||||
parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
|
||||
parse_Unicode_data_file($scripts_url, $codepoints,
|
||||
{'han' => 1, 'hiragana' => 1});
|
||||
parse_Unicode_data_file($word_break_url, $codepoints,
|
||||
{'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
|
||||
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
|
||||
|
||||
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
|
||||
open OUT, ">$output_path"
|
||||
|| die "Error opening '$output_path' for writing: $!";
|
||||
|
||||
print STDERR "Writing '$output_path'...";
|
||||
|
||||
print OUT $header;
|
||||
|
||||
for my $line (@tests) {
|
||||
next if ($line =~ /^\s*\#/);
|
||||
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
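# Hand-tracing the loop below on that sample line (whose two code points
# fall outside all of the wanted sets, so no tokens are expected), the
# generated Java test case would read:
#
#   // ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
#   assertAnalyzesTo(analyzer, "\u0001\u0300",
#     new String[] {  });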
|
||||
my ($sequence) = $line =~ /^(.*?)\s*\#/;
|
||||
print OUT " // $line\n";
|
||||
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
|
||||
my $test_string = $sequence;
|
||||
$test_string =~ s/\s*÷\s*/\\u/g;
|
||||
$test_string =~ s/\s*×\s*/\\u/g;
|
||||
$test_string =~ s/\\u000A/\\n/g;
|
||||
$test_string =~ s/\\u000D/\\r/g;
|
||||
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
|
||||
my @tokens = ();
|
||||
for my $candidate (split /\s*÷\s*/, $sequence) {
|
||||
my @chars = ();
|
||||
my $has_wanted_char = 0;
|
||||
while ($candidate =~ /([0-9A-F]+)/gi) {
|
||||
push @chars, $1;
|
||||
unless ($has_wanted_char) {
|
||||
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
|
||||
}
|
||||
}
|
||||
if ($has_wanted_char) {
|
||||
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
|
||||
}
|
||||
}
|
||||
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
|
||||
print OUT " new String[] { ";
|
||||
print OUT join(", ", @tokens), " });\n\n";
|
||||
}
|
||||
|
||||
print OUT " }\n}\n";
|
||||
close OUT;
|
||||
print STDERR "done.\n";
|
||||
|
||||
|
||||
# sub parse_Unicode_data_file
|
||||
#
|
||||
# Downloads the specified Unicode data file, parses it, and
|
||||
# extracts code points assigned any of the given property values, defining
|
||||
# the corresponding array position in the passed-in target array.
|
||||
#
|
||||
# Takes in the following parameters:
|
||||
#
|
||||
# - URL of the Unicode data file to download and parse
|
||||
# - Reference to target array
|
||||
# - Reference to hash of property values to get code points for
|
||||
#
|
||||
sub parse_Unicode_data_file {
|
||||
my $url = shift;
|
||||
my $target = shift;
|
||||
my $wanted_property_values = shift;
|
||||
my $content = get_URL_content($url);
|
||||
print STDERR "Parsing '$url'...";
|
||||
my @lines = split /\r?\n/, $content;
|
||||
for (@lines) {
|
||||
s/\s*#.*//; # Strip trailing comments
|
||||
s/\s+$//; # Strip trailing space
|
||||
next unless (/\S/); # Skip empty lines
|
||||
my ($start, $end, $property_value);
|
||||
if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
|
||||
# 00AA ; LATIN
|
||||
$start = $end = hex $1;
|
||||
$property_value = lc $2; # Property value names are case-insensitive
|
||||
} elsif (/^([0-9A-F]{4,5})\.\.([0-9A-F]{4,5})\s*;\s*(.+)/i) {
|
||||
# 0AE6..0AEF ; Gujarati
|
||||
$start = hex $1;
|
||||
$end = hex $2;
|
||||
$property_value = lc $3; # Property value names are case-insensitive
|
||||
} else {
|
||||
next;
|
||||
}
|
||||
if (defined($wanted_property_values->{$property_value})) {
|
||||
for my $code_point ($start..$end) {
|
||||
$target->[$code_point] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
print STDERR "done.\n";
|
||||
}
|
||||
|
||||
# sub get_URL_content
|
||||
#
|
||||
# Retrieves and returns the content of the given URL.
|
||||
#
|
||||
sub get_URL_content {
|
||||
my $url = shift;
|
||||
print STDERR "Retrieving '$url'...";
|
||||
my $user_agent = LWP::UserAgent->new;
|
||||
my $request = HTTP::Request->new(GET => $url);
|
||||
my $response = $user_agent->request($request);
|
||||
unless ($response->is_success) {
|
||||
print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
|
||||
exit 1;
|
||||
}
|
||||
print STDERR "done.\n";
|
||||
return $response->content;
|
||||
}
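
# Example invocation (a sketch: this diff does not show the script's
# filename, so the name below is an assumption based on the class it
# generates):
#
#   perl generateJavaUnicodeWordBreakTest.pl -v 5.2.0
#
# This downloads the Unicode 5.2.0 data files listed at the top of the
# script and writes WordBreakTestUnicode_5_2_0.java into the script's
# own directory.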
|
|
@ -0,0 +1,427 @@
|
|||
=========
|
||||
This file was generated in part (i.e. without the email addresses)
|
||||
by the random text generator at:
|
||||
<http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-rosalixion-word-2gram¶graphs=20&length=200&suppress-quotes=on&no-ads=on>
|
||||
=========
|
||||
waist and Wintja are relearning how dJ8ngFi@avz13m.CC we spread out, but it
|
||||
here before, our dimension of story. In Bed and Marys opus in the last thing
|
||||
actually having difficulties moving, Spiros rises to our hidden on your
|
||||
<JCAVLRJg@3aqiq2yui.gm> orders, my love: Im seven doors and with gentle
|
||||
fingers, then disappears? Whats the idea <kU-l6DS@[082.015.228.189]> of
|
||||
<37layCJS@j5NVP7NWAY.VG> the "%U@?\B"@Fl2d.md pages blowing to appear on Earth
|
||||
in motion (what rules did we can take a radio changes. A VOICE: Hes a
|
||||
scoundrel. VOICES: Burn him! Burn him! SPIROS: Want to team of the couple is
|
||||
the sweetest love aH3QW@tw8uo2.eu of the teaching teaches members to
|
||||
communicate with time interplaying and linked and you marry it. It will leave
|
||||
Bvd#@tupjv.sn the logic of it from hereing those people were all
|
||||
SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt the
|
||||
artist stray? Does a few rose doom the UFO with my dear Sissy says Sissy,
|
||||
holding hands up a bit of DvdUJk@61zwkit7dkd3rcq4v.BD fate falls asleep. When
|
||||
an internet age is ~+Kdz@3mousnl.SE currently working with his bedside table,
|
||||
and brings in a shimmering timeshifty verse vortex, the dream. Victory is
|
||||
hallucination, my hand for more. Mmm my head,
|
||||
C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY in five. (Spiros waves goodbye to tell
|
||||
you, honeybuns: The poisoning is, but no addresses. A message identical reach
|
||||
across the script. }0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM I grasp hold their
|
||||
flapping wings and when theyre seemingly infallible information? Bookshrine of
|
||||
a sip of defined the Great Horned Goddess of no feeling.) Meaw. FFIANA: So,
|
||||
darling. Dont be dry white and teases him back
|
||||
lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae in society not speaking, giggling
|
||||
V85E9Hx7@vpf0bs.bz in MGBg2@7F3MJTCCPROS8YETM0B4-C9P7WXKGFB0.RU the boring
|
||||
f***s! (She leaves and Him Lover, Outlanders. Plus Universe where better than
|
||||
they just the land any letters in the gods. Expected, this at the threesome get
|
||||
even touching myself. rsBWOCJ@lYX0SILY4L53Z3VJPSF6.pwrawr.vdpoq.nz He picks
|
||||
dIyLrU@9A40T2ZIG7H8R.t63.tv up at our harem world 6dAsZKz@d33XR.IR so pop up
|
||||
you will be gathered, then Wintjas hair; smells of the manuscript: Contains a
|
||||
EnqCC@2bk6da6y08.LI common AQ9yV@Mfqq32nexufgxzl4o7q5jv3kd.lb universal within
|
||||
this lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H web.
|
||||
b6/zomNkV@8jwm-he.IN The
|
||||
5FLuakz.hXVkuqDt@iBFP83V6MNI3N0FRWJ9302DS-0KHRV6O.1bf59kj64uj5b6e2zfn.cm cosmos
|
||||
is filled with soap bubbles. <RhIwkU@58vmet9yfddpg.3adkmhrv1px.AO> I cant
|
||||
concentrate with a nearby and he nEBk6w2Q@Bb5ib.2pay.so pours.
|
||||
<AlW5CMAn@qos-53u.j91qq96d4en129szf7099kxv5lo6yo.gm> Its a wine with the joke
|
||||
in the only good enough! It hit again the house. He thinks of terrorist, this
|
||||
water. They were in verbatim rewritable. World by a quick eye shadow beneath
|
||||
the stairway; we not easily counter weight, is filled with your own perceptions
|
||||
about it. (Eve, how to talk to you really turns on its physics. The lover on
|
||||
the sunflower in worship of the? (She smiles.) Greet
|
||||
<QPYBDV3.Ah/h8U@x3v444pzi.1cvgokam.PW> it makes sense$A!-(B Not really,
|
||||
5Iwbiq7@p9s-2pixps9jwzyhfroxqivw8sv90r.xn--wgbh1c from up in the candlelight,
|
||||
denser <AaFU9L@3yj1xqf1.cz9.ac> medium to say something. Shifting of that
|
||||
|iCmQ1@rum6w0a7wt.3QLD.ht71.cx the eyes and there came. And now, approaching.
|
||||
When the thing. What did I woke up the printers! We EhLTUjo@rEK.sJ44H0.GR shall
|
||||
we are heard like a glimpse of hyperspace. It travels further and kneeled down
|
||||
bHEbq3Rp@33.lKSSMY.9xaurtfle9xe.iu4810l.fj to you can walk away? FFIANA: I want
|
||||
to eFcup.cPPEW@[1ae] speak. The Fountain of the background when I extract of
|
||||
hers, so strange book and a royal destruction of songs of this pearl. Not often
|
||||
by an incinerator vessel. Spiros, the delivery of alien exists now. Forward.
|
||||
The rosy guidance of wine. Notices that is partly the pipe
|
||||
p907@bk3o.fvtmw2m2.Uutr83x2yt4.2nuin.EU of the chance in Old Town. D Strange
|
||||
music keeps one of the top of myth and smiles.) SPIROS: Nope, cant even
|
||||
PpW2L5.QgP2n@9rz7.a5qi.oRH1Z.8ov.UZ more! says it doesnt exist! The world in
|
||||
the cosmos loves us. (Spiros soon
|
||||
o8UgG5fewm4vr9Ai5wPS@sgh.2F-OLKLZ81DIUET.xpya0vtx.fj here again aixQH@z-y.AR
|
||||
and again he turns and blinks with you want? says Sissy looks over Wintja and
|
||||
the fashions of Fit to Spiros continues. Its a situation of the barman says
|
||||
Spiros. I read the river. SPIROS: Damn I said. 69
|
||||
<jVTeWQfL."M#~t Q"@1e.oglq.ubk.SZ> he kept locked up into a suitcase along
|
||||
her body, points a female voice of 6e5QQuy@N7.2cuw3x2wpddf.paycp1pc.AI their
|
||||
part of flowers, and Marys opus IqG6Fl@[220.112.120.54] in my PROSECUTOR: Hes
|
||||
<lWHH4eWSn@tbxyb7.jhzqxrk.lv> one is <P1zO*RaAr@[111.99.108.22]> unsafe at a
|
||||
little <d00gy@[4TC]> secrets, we made to write: And a drink of Eternity,
|
||||
Speros, <1yNINoBU@[136.003.010.238]> Mr Boore, back to me! Lovers break
|
||||
Ms8ox@[_3Tuehr] the code so
|
||||
<8'Hk8a@ksf7qqaa7616xw8dq80h.K6fy89c.3k-8c.g58m48v-18zh8v> recap.29 28 So,
|
||||
darling. Dont leave each itself, on and devotion to all about time
|
||||
<wtWDNo@1sjmcbbli196-765mt7m8o8hywft.7-ga6rsnum8v.np> has happened? ANON 4593:
|
||||
What the tongue Such as she did you back and the whole moment in
|
||||
<"x)yO"@7le5o2rcud5ngs.Qmfmq.Jfxv8.Zznv6t6il.MIL> your own lens, thank you
|
||||
1hXd@f8.1kxqd3yw4j6zmb7l7.US arent already. It tastes them have ever come come!
|
||||
The tomb. Blink to him and flips to it, but the palace. No
|
||||
"8}(\$"@mu2viak0nh4sj5ivgpy1wqie.HK way$A!-(B Happily: You smell of it
|
||||
all and yet sure this pool Th7XoAs5@ggdb.BI of the first of his
|
||||
5iDbhah.xdtF1x@[59.55.12.243] heart j2ovALlgm2Wcwx@5jphzt.TN can take to the
|
||||
wind, speak to apply perfectly, you say turn toward sexual nature and lays his
|
||||
ZlaP~E.4Yk1K0F@lF6VN.M5.Nj.PRO pipe. No, landing from
|
||||
cFCvIJAw@l93H0R1W6V4RI0AY7RLRQR4KOEVQPEG-PDTF03V4D9A0.xZZK5.lu the fruit will
|
||||
say. -F<>Dont talk like the west 8Ju2AW@1n.h7.vu wing of the letter in every
|
||||
second, <"\nkP]{"@[Vej\yo\HD]> but he slipped in. Yours Spiros and there
|
||||
when I imagined anything can take returning? <fKWC?@qgcb.xn--mgbaam7a8h> Where?
|
||||
With? Who? Going toward his body and kisses the notion that has joined odds. A
|
||||
scattered around <L4BbaB@hv1.BIZ> slowly, moving eyes on and
|
||||
WvSmV@qpx15vzmbtxzvi-syndl1.ML turns toward her. She sips some way everything
|
||||
began was finished my wet Earth. Warning
|
||||
"3|PX~Cbdq"@U3vp-7k.8c4q3sgpwt6sochundzhx.museum for me.-A City Different.
|
||||
Let your myth LjH9rJTu@tkm.gy settles over it
|
||||
<8myMO4@hOV209VZ-SHGBIH5FBYLTCQZSBW-U5-1.dv9> means to Our of a book he has
|
||||
only but <vQgXEFb@maxmrbk-5a5s6o.6MZZ6IK.awjbtiva7.IL> the imagination, master
|
||||
phreaker, <5ohpA3ww@dcpcotwccy> main railway station. Loses the dreamadoory in
|
||||
the surprising success.) A note from round is her splendour in them? Mmm my
|
||||
dear, were 6TVbIA@r50eh-a.la from them keywords. Boy,
|
||||
AaASl@Bsteea.qHXE3Q5CUJ3DBG.S2hvnld.4WJWL.fk my own imagination, master
|
||||
"CN;\-z 6M"@86.qc7s.23p.ET is the usual fashion, says to stream and appointed
|
||||
space-time continuum. Dilutes your zX3=O3o@Yjov.7g660.8M88OJGTDC5.np sleep. Ive
|
||||
been seen, he says the ringnot we proved? (On the pact. Thanateros is an
|
||||
internet caf<61> where the Queen. Now cmon, lets take to raise the apartment. Like
|
||||
a limousine and I kiss timelord slides his hand QFZlK1A@4W47EIXE.KY in words
|
||||
now. Get us in the same time conceptualisation is to bed. STEFANDIS: Dont do
|
||||
you think Ive put down the green lush. She often by God of a 15 minutes. The
|
||||
others knew into the 1guLnQb07k@ab.ccemuif2s.lb you-know-what. Youre the luxury
|
||||
hotel. Diamonds and receive the process of action. We wanted in the nominated
|
||||
bird. The <Jddxj@[111.079.109.147]> woman undressing. He has him just get at
|
||||
Hotel California. Its <Hj06gcE@[105.233.192.168]> about all devices. Playlist?
|
||||
Initiating playlist. Timelock? Timelock on. We have a u8?xicQ@[i\21I] lock of
|
||||
the apartment. Like a kto, part of Our superhallugram to hook up and
|
||||
CczYer}W@bezu6wtys9s.lft3z.mobi outs. polish
|
||||
OmpYhIL@6GJ7P29EIE-G63RDW7GLFLFC0M1.AERO fills the crowd, comes from the music
|
||||
is impossible. SPIROS: F***. You are your voo goo.
|
||||
<2RRPLqO@8lh0i.vm7xmvvo-r5nf0x.CY> Daysends burn deeply and will take
|
||||
TOc!BhbKz@F-myy7.kQWSUI7S3.net this he thinks. For UFO from elsewhere. Bzzz!
|
||||
Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "0\!P?".shQVdSerA@2qmqj8ul.hm the leg
|
||||
of LTLNFsgB@[191.56.104.113] all, until it has read it is
|
||||
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VGLn@z3E2.3an2.MM> Once
|
||||
TWmfsxn@[112.192.017.029] Spiros under the place
|
||||
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
|
||||
rosebushes and the whateverend, feel her waist. She changes everything. We had
|
||||
decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
|
||||
come to, <Ayydpdoa@tdgypppmen.wf> what history as died. Strange, Spiros with
|
||||
delight: That night "gfKP9"@jo3-r0.mz and gold case
|
||||
<aTMgDW4@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
|
||||
retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
|
||||
<NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
|
||||
part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
|
||||
Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
|
||||
reality. The hidden set up to come. ROSE WAKINS: No answer. The
|
||||
ALB4KFavj16pODdd@i206d6s.MM rosy pink cigarette.) Visit the supreme chest and
|
||||
express in orgasm, my version of clouds contemplating existence, the horizon.
|
||||
Best grxIt96.46nCf@nokjogh2l4.nCMWXG.yt of sheer emotion. Spiros laughs. Why
|
||||
did he says Spiros. Ban him, he called for it, sir, says Spiros
|
||||
Fgbh7@2rxkk0bvkk-v3evd-sh56gvhxlh.hhjcsg36j8qt98okjbdj9z574xdpix59zf6h80r.Gyb4rrxu.ve
|
||||
laughs. uo0AX41@Fhlegm1z57j-qvf5.p8jo6zvm.sc Can we determined that when I am
|
||||
Spiros, quoting Jim Morrison. Death. Design patterns, youll hear Spiros says.
|
||||
They cant G decide if he was your key that we playing? SPIROS: Why wont xxx
|
||||
would be imagined. Technology so beautiful to fill his diary; I like a match.
|
||||
Puffs. The Star Eagle. And a person with a play with. sjn4cz@9ktlwkqte.bv
|
||||
Faberge can change overcome your work, a large-scale coordination, Goddess say
|
||||
is blasting away to end is <b04v0Ct@[243.230.224.190]> very tricky to stab it
|
||||
as a turn me to the champagne on your obsession about his nose and
|
||||
F!FUbQHU@uvz7cu1l.ciz4h2.93U4V.gb somewhere <6CHec@nONUKT.nl> else, then far
|
||||
stretch. The great outdoors), puffing dried cum on the manuscript I… O
|
||||
one knee, feeling and sex in igniting <zbmZiXw@yb.bxxp.3fm457.va> bomb. (A
|
||||
housefly, Musca domestica, lands on into the device. Let me met. Wintja and
|
||||
victory. <"/GdiZ7f"@[221.229.46.3]> For years in tipsy bliss. SISSY: (Nods.)
|
||||
Yes. Now you witch. And we must remember, will tell you move but her
|
||||
NJde8Li@f7a.g51VICBH.cy creation with gentle feet, naked on strange hovering
|
||||
futuristic vehicles that when retrieved upon a thought, or reflected. The Crew
|
||||
coming on our gratitude for you address then ventured into a dream, has begun,
|
||||
she sees a 6IeAft@e-3fp.Nkh7nm8.v8i47xvrv27r.pf golden ball and 4 If you that,
|
||||
Izz). Lapis, to the return all laugh. Applesfoods maybe, says
|
||||
TC*Qopzb@xIOB3.6egz4.m-24t5wmxtmco4iy8g91o66mjgha1vjlepyffott.E5ta.p9.CF She.
|
||||
Cmon I Stefandis.) Count me with a bed sheets, carrying gently away about time
|
||||
you rather dramatic, which reaches across this day. It brings forth between
|
||||
suns. How about the white sugar, leaves, sugardusty sugar, drinking of time.
|
||||
Believe. There "_3Sc_"@[193.165.124.143] is the soul, W0dwHf@[25.174.65.80]
|
||||
and only Spiros. Love you. Believe in the multi-leveledness of the 21st century
|
||||
and exchanges a book called Sphinx. Alien Star qPkkP0@4k0vs.oaak2z.3JMTI.PK
|
||||
initiated. NYKKEL HUMPHRY: Of Make ways over town.) SISSY: …and you can
|
||||
turn slowly but not yet audible, appears, XzZh7@[\\JmD%U] in the silver
|
||||
melt together. This way of vision sees through time). Brewing with a kiss?
|
||||
<66SGHzw@Oqnr82oml7jct0b8crwbstdhcgc3khxj7dj-t898mzro0p3-rvp-dythh.TN> Her
|
||||
feathers: streaming water of the wind. I started interacting in a boat, on
|
||||
ot4tPF@[AY\j] her e4seIFbl@cib.cg thigh as she blinks happily. Here is
|
||||
<B2w025e@r2H7BW16B24DG1S5DED.bg> what you around him, Magus says the list. Its
|
||||
about what that atweEde@blk-3y.mgvoh6l9my.F6.FI there is functional. We
|
||||
vanished into the computer. Up hills and enable entry using his long adventure.
|
||||
Do we are all detailed trip against decent behaviour and girls. And you
|
||||
alright? You evil laughter: Muah! Muah! Wont wate you all uDoPcRGW@rEBD5LUT.ly
|
||||
way that there <2KQhx@Bba.u--9b5bc0.NF> is either both night And our dimension
|
||||
of a bad joke, says nothing, just after time. It was indeed. Now that will make
|
||||
the streets. He instable? What shall do. tKWc2VjVRYD@[254.190.162.128] Who
|
||||
wc3W16^@D3v2uxqqeclz.w1fd529m.DM are heard like our love. Of the stairs too,
|
||||
usually through the note nearby and you go now. If I remember Njg@6S8MA.HK how
|
||||
it instead. (She chews the rosy petals, frosty and the land at first part of
|
||||
waking? That we "L\^4z]92"@0qp--walx.MIL like they meet you.
|
||||
<X08sWFD@62GNK.tN4.f1YXX.ug> And out into the bed. From the gods have loads of
|
||||
a dark winding stairs and laughs. Why doth Her devastatingly good eyesalve, to
|
||||
tell it says the Rosy Dawn. Rising, rosing, the story? (For all the UFO
|
||||
shimmers from around him, but we look before eK6Bz1Bu@[rX;J&036] the Eternity
|
||||
we shall never go now, look, he thinks, both go for the words said. 69 people
|
||||
who live in Thy honor. "~`o\:"@hO4UKF.oZBWV56B.cmn.DJ And
|
||||
lcgUakx@[pjGd&i2] here and his life has tasted of becoming more clearly. He
|
||||
is dead. Calculating possible meanings of it instead. BqdBTnv3c@wf35nwaza.ME
|
||||
(She whispers, smiling.) Theyll be able to help. ELLILIEILIA: You are created
|
||||
the visible "a#Um{:\'\bX:"@in7tjo.uw8wil.gp world, without it will see now,
|
||||
says Spiros ApIbER8'@[&Y] thinks. Every time and go to write fiction. Indeed,
|
||||
love something I pop, from the play? asks JTsM0c!s9CzEH@Sd.mh the taste of the
|
||||
outrageous wreck of dream, born and there
|
||||
hy2AOUc@uqxzl7v0hl2nchokqit9lyscxaa0jaqya1wek5gkd.NC was still result. Search
|
||||
taking <pY7bAVD4r@[,>T*R T]> out into !0axBT@03-gdh1xmk3x9.GH my dear, you
|
||||
know, of saint? What did come here from the Crowinshield Garden, amongst the
|
||||
warm kiss. Everything is white marble statue he is tunes faberge intricate.
|
||||
Spiros, a particular frequency, vbtyQBZI@20al5g.ro6ds4.Bsg15f5.NU spinning,
|
||||
trying to a trail of the narrative that it while the Queen, giggling: What are
|
||||
a letter with a web we could 2^ZhSK-FFYOh@Z2iku.rg.Z0ca1.gs not a
|
||||
G1RLpOn."yfJpg["@mXEV8.mu peculiar yrBKNkq@a2a1.Aifn.Ta2.dj stench of history,
|
||||
when appearing in the interface as well as follows the secret I am not
|
||||
teleframe the room, disguised <Wok5G@b5aqobvi5.ni> as the brilliance of the
|
||||
pressure of the modern world, but
|
||||
nXz9i.=EL9Yj@93r8do3ntizibg1-5-a0ziw9ugyn4bo9oaw3ygrxq-eczzv1da6gj58whvmo2.rs
|
||||
whatever. The solid concrete, Dp63hd@B1kbahyq.PL and put it stumbling or why
|
||||
wont the chalice with communicating with language only she says Spiros,
|
||||
whispers.) We left from the second birth? The young man is part of the teapot
|
||||
opens. A man in disbelief.
|
||||
y01rn27SFq@o0HNP8.C5.i4rvj8j338zgter7er5rkwyo5g.atnc0iuj2ke.8or6ekq0x.IO
|
||||
Outwords scratch skills against her in fairy gently
|
||||
<0RiEo@08mnvbu.p661ernzjz5p7nbyix5iuj.cig5hgvcc.SO> bite of death and Wintja,
|
||||
playing with the name by <Dwxab5@1sx5y3-umsy72nl.74lwye5.DJ> your dreams. He
|
||||
arrives <IvdZVE4xRk@0vw7ajl.AR> the information. He swallows all the f*** me
|
||||
tell her wineglass and tangles. Synchronising <CvQxhXJ@d5a7qnx.ke> weeks of a
|
||||
reason why everything seemed as wet dreamery, remember? Got a purple Ipomoea,
|
||||
crawls through the first stage has the riddled beginning to her in a butterfly.
|
||||
You landed smoothly. Preparing to n7MxA4~@[4(R] hit a world is man. How much
|
||||
in <hEhF@3TV5WQ.fbkx3f> mystery. And RFGzu3hD0@wbh4.sm furthermore, what the
|
||||
edge of physics, death and eOADW}BcNG@2568p3b4v.Xq3eksr.GP touched smoothly ah?
|
||||
Fashion feasible technical population resulted distinct produces
|
||||
AsAMWriW7.zSDQSAR6@Gg2q4rtgr.GG recognize instance the room at the garden.)
|
||||
PERNELLE FLAMEL: (To Mrs She is basically very drunk. I see you
|
||||
<cDCVlA0t@[20.116.229.216]> cant I walk down naked on it to bed bed into
|
||||
c=yJU+3L5@n2x3xhksf.gvreani.MZ the stairway wfYnaA4@lzojy.4oii6w6sn-p9.kh and a
|
||||
kiss as though the point we see the numbers, the phone set to be displayed,
|
||||
disincarnate entities can feel my wifey. Spiros empties the answering evening.
|
||||
That is kdeOQ5F@vD5Y.wmmv.7rswz.1zelobcp5qxxwzjn.fOEJZ.KM simply not but I
|
||||
could do to the ground, and the decanter ppULqb2Z@Hv9o2ui.AO is my friends and
|
||||
says: I <tOHw@[IPv6:3500:8B6C::CB5E:1.124.160.137]> see The elves of dream
|
||||
telepath posts, but makes a gentle people with a redirection is generally said
|
||||
Tadeja. Its over, or of ages, you excuse us walk off to Talk A never-ending
|
||||
one. I remember how cute she saw the neat fuse weds sexiness. A thick paperback
|
||||
book itself continuouslyposition, have heard in the noise We are presently at
|
||||
the first of the death MWLVsL@7nhliy.O8mjon3rj-kb.t8d6bcpa5i.au mask there is
|
||||
accurate to meet by to this important worse material in separate directions.
|
||||
Spiros stands, and arrows and orange from a witch and down the mix? he feels
|
||||
Wintjas 13th century. arling peach, cosmos loves playing with silver trays with
|
||||
the <BN0EY@hh9v.p9bwgs.TN> language as RgiAp@d9ln.bf I still result. Search
|
||||
taking time and time <PBugBo@97gcz.DJ> in time. Spiros, how else or
|
||||
Fh#dKzbI@[+_] nonexistence. Eros never guarded the horse stops. Move. Stop.
|
||||
Move. After earlier squads mysterious source. It inscribes in case you are
|
||||
applause. The world was a. With swiftly cover <wyqU-C9hXE@wPRBUI-WS9HXE19.LV>
|
||||
it as in yourself! 5 Yes, now comes from half walls of us, my love. I am your
|
||||
vast operation is all worked out? O how long ago. It glimmers, node of the
|
||||
voice, the middle of the introducing of utter hell on the car unlocked and mind
|
||||
around midsummer and not believing in <muC?Js@[IPv6:47FB:5786:4b5e::5675]> his
|
||||
lower lip. From the wind say I was inspired to live in a crime. I know, and
|
||||
find people have been reported found a digital electronics. Is the pillow,
|
||||
touched falls down their part of the computer and our world
|
||||
<yLTT2xV@wdoszw9k1ork-z-t.kq.l3SEO.Lb4jx0.NA> come walking in
|
||||
<6zqw.yPV4LkL@dA3XKC.eg> the stuff to help. Websight. Dedicated hosting
|
||||
wordpress blogger coined Sister <S5z9i7i3s@Vzt6.fr> short Sissy Cogan. She
|
||||
answers. It is finished his way that includes getawayways. Compiling focused is
|
||||
this case? Then turn on. ANON 4593: What are pretty kinky a story about the
|
||||
L|Sit6s@9cklii1.tf strangest child a Syntax of passage and Wintja and
|
||||
reportedly after demolition, decay, and twists up to tales endwhere. This way
|
||||
there to born from elsewhere. Bzzz! Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them that
|
||||
words from sleep but no poet yWYqz@mw-9k.FJ am I woke
|
||||
Knhj419mAfftf@R26hxll64.3qtdx6g.AL up in a kiss made it is heard on Midsummer
|
||||
our cards like big fane beneath the secret of the <aZYHUr6@Shyn76c67.65grky.am>
|
||||
criticising crowd of the gods and here to... TADEJA: (Suddenly appearing in
|
||||
ZYxn6Px@di0cqhtg.hu your "#mLl"@w1sc0g3vm.j1o4o9g.GW voo goo. Daysends burn
|
||||
deeply happy, for large bite of his artistic inspiration without feeling as the
|
||||
season. One within the dreary WYJcFp@653xk-89oprk2im.iemhx9.CC kingdom. (She
|
||||
steps up with Christine says. The Blooming of y5AXi@[Oa #] The time regularly
|
||||
we are, she nZErAGj@6sq3-p.r8KQ.aero kisses the gods? I am in his brother I met
|
||||
years ago. The word <OMq5sBK@udg-5zp1.Dory85.SG> is because we had. But yes
|
||||
just like a while. Were not matter; W it going? Im sad to
|
||||
<2bymd@Ojla1hvfpw8rrihrx.cy> where he arrives and information, and smiles
|
||||
victoriously. 5OMbw0@r2d8cn75.1VR2BJ0J3A8PY.gc0mljc-h.COOP Mmm, you Rudy. And
|
||||
there and day soon is phone and come <al6X^pQkx@pyj--2hp.lbet.TN> back?
|
||||
Rephrase that we are good, I leave the gifts of html or center of her right to
|
||||
him to where the room.) SPIROS: Okay, sure, Ill be a page is to
|
||||
NkzPW4f@2-0.aaoqccwrgi4olytac0imp6vvphsuobrr115eygh2xwkvzeuj.tl put in a novel.
|
||||
I want two. "4-b9|/,\e]h]2"@9-iiahsdlzv-v65j.FK Passing
|
||||
<1AhBt@od77y.s9ZZP531YKW> now. I go identify what we are always win. Anyway. I
|
||||
know. It is here reaching your script and toward the edge of shortcuts. We came
|
||||
the Saussiepan and <g8Pv2hb9@[166.176.68.63]> its mysterious ways. I remember
|
||||
"IA~".Tn03w7@[\>J?] how am waking to, that the secret about it will say the
|
||||
redpurple wine, Our plan all within this moment you can hear me, I heard on the
|
||||
clouds. A channel is hidden visible world, without ground turned real, their
|
||||
every E6aK9TaJ@j0hydmxhkq2q.Svku4saky.MU way to a radius of
|
||||
rdF2Zl1@9fsic.C17pw9o0.vn apple tree and says Spiros. Here I saw her. He walks
|
||||
by the landscape of secrets of paper. I love it! But I could call the
|
||||
<pCKjPa88DG&x5a@4ha07ia2jk.xk7xe8.PM> world with the manuscript I$A!-(B O
|
||||
nothing. Im proofreading the most dead branch in qgLb5m@nynqp.DE the screen,
|
||||
then I did you can remember. qC731@["\S] (If you can it completely insane and
|
||||
we had expected something our sacrament. We were back. Esc. (Shuffle.
|
||||
Hallucinate a sip of grandeur, said he suddenly a tree, and ground turned out
|
||||
the publisher. O about it all. Lets
|
||||
<vIch1nT@[IPv6:4c2f:A840:1788:ad5:C2C6:dfae:1b1f::]> stay with us. Mooneye
|
||||
today and thinks and check
|
||||
GVSMpg@2YGZ1R19XTW1TIH.Re3vg30u1xq6v7cj1wf-6m14939wvgqbl.93mztd.SG the modern
|
||||
world.) Sissy stands sipping redpurple wine) and you
|
||||
0jq4v7PMxm@eq6teog.kO6LR3.x2p.53yltrsvgpd3.RO up to be wilds. Spiros 99% dead.
|
||||
Calculating fastest and chewing she directions!
|
||||
zdGLZD0P@i2JQNM8.816oja8pkk5zkvyx.KM Take my body and executed with your own
|
||||
forehead, born from Egypt come back? Rephrase that what is the night. There is
|
||||
here. Cant you think. And shadows Jp#hSH@74zkerax4.31kr.7c9-yuk.mp keep
|
||||
dreaming of letting the elves of modern civilisation? Does that fly softly
|
||||
through the surface. Of the modern world we must Kx^0oZn@oFFA-URZ13B34J.DK find
|
||||
sub52@aoq7.iHF.CH them, baby. Rosy Dawn. jfVSq9oAR2D@iGU0.7bp3x.4cr.sz You have
|
||||
become clear edges. And why you told our skin and
|
||||
nalgU@Yfpbdcv8a5.n9kwz6kyi2u.thic-rws.af.TG places, spread on your air on her
|
||||
earlier. The effects will be the song by and his eyes are gods. Expected, this
|
||||
pool of illusions, that makes its golden geisha ball on Clocksmith Alley. Two
|
||||
female form orbits the two chords on a god, in correct dose to see a book.
|
||||
JOEL: Spiros thinks as he felt, came out out! We are switched in the matter. I
|
||||
shall I can imagine the Crowinshield Garden the aeon arising, wherein he once
|
||||
again. You suddenly changed. And the rose; Will you? Now listen. (She smiles.)
|
||||
Greet it comes everybody. And what the room, disguised noise We are you in 3D:
|
||||
you come. ROSE WAKINS: =uC5qVT@56g530cltpekrw.pt I used to read it: Barbapappa
|
||||
(a gay pirate captain) <QR5&kx@7qhi3bhav5ga0eva.b0sdom.bb> and walks up again,
|
||||
when you are here; working on to. 8DZQ7@dtr16r89fdw59q.cf Now join you? Im
|
||||
slowly in white <Q4pNw@6o-9weojl3r7.LS> bed and language whitespace
|
||||
sensitivity, readability, less punctuation, etcetera. Things had to the Dark
|
||||
signal has him with gentle blood on to the ages. Stops laughing. Sharpens eyes
|
||||
from the *mfOc_CN@[G\3] starway, Down the uniqueness of the bed
|
||||
2p`tbG@c767inolrav0hg6a-ucs.y0.tw and Rop{cgBy@Wekdh0xns2um.UK giggles. Spiros
|
||||
soon here for ignition of the thing Mr and fetches her t*p05lV@017y.MR you hold
|
||||
their own code. Your brain and Nora in longer. Stay tuned. We
|
||||
7ZxO80@Dovepwr4l.qxfzchrn1.es8ul0vavi6gqy82.K1hc7.INT must marry me? Eyeglance
|
||||
is is not hear. He takes a good marijuana. And I had very fluid. It cant G
|
||||
C_Iphp@5t4rtc.id decide long hair shaved like a while. I have telephones and
|
||||
waited. He sits there is humanity within its authors and snaps a touch
|
||||
q+m2x@Cfw.1tm52-kr.BO it candlelight tuning. Just a young man go to the
|
||||
ad-section.) 47NIL@Hl68os0.66l9bsf2q.SC THE F*** UP. Spiros slowly. Lets rock
|
||||
on his father and remember: the sea soothe his paternal grandfathers old days.
|
||||
In to the Honey Queen, xxx 14 hristytio (Ill catch us. Compliments always. Did
|
||||
you rather unnoticeably. Faster than we got this cosmos. The engineers of
|
||||
terribly intricate fantasy turned semitransparent, the people have done subtly.
|
||||
It is THIS bulls***? Count me Rudy… Sissy laughs. Can we are breadcrumbs
|
||||
vi0LyF9O@p74jz6mxby.it on Clocksmith xQ4jU@rQVWLWAD3T8.4-lnu.AZ Your usage
|
||||
<zea_0Kr@[97.59.144.249]> of <5HP1k|s@[068.150.236.123]> being a shimmering
|
||||
green. 5XJZlmYk.3Du5qee@[072.023.197.244] Her feathers: streaming
|
||||
<fzQlo2R.HSbkNYi@ay8a5so81x2fgkt2rv> rays Wanna take AvNrIHB0@[+n}oV] a marble
|
||||
from the letter the brink of wheat from the dull ghost of the article atomrss
|
||||
am I? (He hangs up "!N7/I\zhh"@[204.037.067.146] dreaming? A PEDESTRIAN: I
|
||||
already told you than the world now, as vlJODxFF@xFO6V.i1.fgad6bjy.NO though he
|
||||
walks off the flowers. He lifts
|
||||
<qDe0FA@xpp1le82ndircjgyrxyzkrqu3il.oUKHVV6829P-16JILWG62KN.cr> his head we
|
||||
passed on a hint of the worldmask of the people we dance, sweet boy, my dear,
|
||||
matter of bridging millennia, I was it works, and Adam says: And the fathers
|
||||
pMF64@wssq6kh9uhxk.cA2YZVBV4JW.xX585A.ru that we are in this G3meE@[^!'OO]
|
||||
stuff!? The wunderdome. I saw "1@0UYJl"@vplkx.d2n.i3tcx3aaxut.lbb3v9.ldq.me
|
||||
your prophethood of the ones too far! iTH0QND@wg9sizy.lr Further! Into the
|
||||
planet. He sits on the Other. We came from Egypt to save our dear Sissy slid
|
||||
her earlier. Ill tell me away with bright asterisms sparkling around
|
||||
9kF?opSTo9rSDWLo&W&6@xrh32ibf.F0zb6kb.BJ in this young woman in the whispering
|
||||
wind and hands to speak, but using his <a0FI1m@1olkdpz.W70a3w8qmk3.NA> nose.)
|
||||
Nevermind. WOMAN TWO: And furthermore, what about the script, says the sun.
|
||||
Large-scale thinking of a witch? Spiros hears music
|
||||
<"0H}r}X(p\M`/x"@rY48LPH.Axy.Ue624.TV> and a world as well as a poem
|
||||
AQL6YBFb@Hxawb15okz.y4.y5c0e.bt ever, indestructible. A newsboy hands
|
||||
<PEaNVR@m8NH9BVX5L096DRM7YTR.er> Spiros gives the drawing. Looks like to the
|
||||
<diI`Q@i5fpkuc.7zg2av.D6tzqq.CK> living out TCN0-Z@Tezeq9ejv.ekeab8hz14hui.il
|
||||
loud from the house. He is disappearance, as I know on the centre of your
|
||||
section gives rise from 05SnFh@jZ85JXZ.1RO99W5FYK3.uyv7g15.MP which it be close
|
||||
now, dream once: The stars
|
||||
<B2Z76Rn@9yce0shfsydxetu1v4-y.rBU2M0.6ik8oapv0zho6n653il25gu4rd216uw03.MG> are
|
||||
your vGZ2K@C2osgjtel5uerwn.riihbabhh41ve84.r3l.vH6S64.vn presence. UFO. You,
|
||||
Spiris, are born in Plomari. Steal back door, from his mother: Is it to live in
|
||||
their doors are like, Nv2ZgL@[037.054.177.155] two weeks with
|
||||
WsdI2W@i1ULFQ1.79qfph2.eg us across his way to crack matter projected by four
|
||||
<vJfpTf3@Hh4x2h.25m0idq3.fr> initiated. NYKKEL HUMPHRY: Of <oRqbgftr@l6jg0.TV>
|
||||
the woman casts a drop of your amulets NiynsKb@k9BTX4-FV.hc0skm-o.lv and the
|
||||
morning light. Plasticity of the sun bursts can feel it, rises from lands on
|
||||
w9uGwf@4hop8.Jb9655is.nr the realization of his field of the branded mania.
|
||||
Spiros says a dream? Something happened. And watching the Other, she says Fast
|
||||
Eddie. Bandaging the greeter info. The Eagles song by the fragrance of
|
||||
Timescity Express, is there, by zero. Your star alliance. SPIROS: (Quietly,
|
||||
smiling faces twitching in an envelope yellowed by It, producing open minds.
|
||||
This mighty Nile dynamic magnetic strip that sticks). To Ellileilia, two
|
||||
fingers with the moon undersea settling for "NVUW+"@6jbe.KM insanity! He
|
||||
rises from the QusHU6JMR@0RXKIZNH76C3.Oqwcfr779e.MH end of wine ride the Logos
|
||||
and the cosmos loves <}C5IwKv1S45vlmPaaVHhF@[IPv6:EBF6::]> playing with care of
|
||||
myself up pitch/volume of a violin. The rosy dawn, Adam says: The transforming
|
||||
magic touch the waist, working transparent, yet its not easily let us
|
||||
changelings who all across Fountain Square where no telephones ring? Spiros
|
||||
recently. MARY T7rXlYc@4AI1LM.2o.uk BRISCOLL: What if
|
||||
uuCiDC6c@Maar3.65hlg-wf.t3pt9.FJ I w2mNOvIUh@dx3ep7ew.ru dreamed of a new
|
||||
dimension of her in Wintjas direction. Word frequencies, underground river,
|
||||
announced on your location. Thought b#Add@9hpopo.Xg3tbjchdpt.TT magic. The
|
||||
violin kept talking to stab it was born from our own life as the dream I was
|
||||
practically there I want to smalltalk about the station, and so recap.29 28 So,
|
||||
darling. We are truly is. Its on Crete. On a curtain in a copy of the
|
||||
<NtrgJjfj."NBwi"@[142.085.096.018]> afterlife, the grass and the lovers pot!
|
||||
Transistoryness? Radiosyncromatics? Syntax of the modern world The mirror at
|
||||
<00lF9UB@2NR2.rs> the day soon <MPr42ye9@p08lcrzs.4bzxfznsh2bhgsa.CX> there,
|
||||
doing it will you will be disclosed, says Saussie. Become the future just
|
||||
happened? Spiros picks it at the time transfer was
|
||||
awwLoYLn~c2LfTEVT@fwksx.qoj94r11kw19k50k3.gd successful. Initiating first
|
||||
somewhere else. Its from gRZ5w9epm@p6adico3auugj5qklec.Sm4bx5.li the
|
||||
imagination, Spiros saw the words: They cant remember yet? I add to Any time
|
||||
here, she says. Butterfly as a dark zfdZ67Y@1azhq.dl3xxzni2.rrj.lpclc6g4d.sl
|
||||
soil run free What do you see, is the natural radiance of death reports,
|
||||
<vTWwSD4fb@uBSOHD.3g.u3mb.gf> is welcomed. Layer upon layer of Thy angels are
|
||||
crystal. Red <cYFVxcC6E@F9g0b.n1339r.AU> King and its my opinion. You were
|
||||
back. Hows it with liquid purple. She looks at pnuXl@s1alo2.tc a man
|
||||
lKy64zp.Cbg8BM@y0S.6uiux8h8.0udipt.ma on with me. Say the beginning from the
|
||||
manuscript and |9FDgc@vbrz.3L.av4kmt.rs bare plot. Queen told by the redpurple
|
||||
wine back where we all be rather dramatic, which they had skcHAu7@xD715N1.DZ
|
||||
always <BfcgHK3@[220.136.9.224]> include Sir Nykkel Humphry, master of the
|
||||
inverse confine survey the rosy guidance of her eyes on <LCOEag@Gwm.drsa0.GL> a
|
||||
river here, to the latest of Sissy. He again set the old Egypt. He returns to
|
||||
the looser you ready? Y Were ready. Spiros qrNZtp3vO@a0gr.8j9cvcgy0p-3.HN says
|
||||
Sissy. Wintja sing: Ive put ourselves in him, he has taken a
|
||||
lfW2rei20XWSmpQoPY1Dl@[(N&c] third <J761x@0IKGVUDNQ.3xpb> person. Whats it
|
||||
will bring the room on the book in trees and WFBBEv|@q7R2J.oy48740.pm smiles a
|
||||
pipe he enters the chat room (The church music in comic book aside
|
||||
<6H6rPx@zVJ40.xgyat.cLUX6SVFJWMLF9EZ2PL8QQEU7U1WT0JW3QR8898ALFGKO18CF1DOX89DR.1tfu30mp.CA>
|
||||
Rosalias Dawn, pray, Man through ytG@J4auwv4has.PS concrete. Could we? Were
|
||||
taking over a
|
||||
<"X;+N1A\A "@rc9cln0xyy8wa6axedojj9r0slj0v.Luy9i6ipqrz74lm5-n6f1-2srq5vdo-opef747ubdykv5hc.2lztpe.er>
|
||||
hippie up the detail. Rain begins to being married to the designing of love.).
|
||||
Made myself a funeral. Who are created DQTmqL4LVRUvuvoNb8=TT@2up3.PY (Is that
|
||||
hyperspace at the merriest of us for that. Christofle is heard
|
||||
NC0OPLz@kcru1s0mu.name him a huge and wraps if he find? He is or so much more
|
||||
complex than kBoJf{XaGl@[248.166.223.221] we are heard within the
|
||||
<pEjZPm8A@v956Y7GQV.5uu6.Ribgf20u.6e.0do1nki1t.ahy.6iy.sm> woman of The
|
||||
<pIFWkl2@w9N0Q.MC> mirror of p=VTtlpC@w3ttqb.FO dream, born from that we are. A
|
||||
VOICE:
|
||||
|
File diff suppressed because it is too large
|
@ -0,0 +1,643 @@
|
|||
http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on
|
||||
http://c5-3486.bisynxu.FR/aI.YnNms/
|
||||
ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R
|
||||
sJ5PY.b5t6.pn/
|
||||
http://Z%441S6SK7y%30K34@35j.np/RUpp%D1KnJH
|
||||
[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/
|
||||
file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7
|
||||
http://[a42:a7b6::]/qSmxSUU4z/%52qVl4
|
||||
http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3nWNXZ/P%17tp3gjATN/0ZRzs
|
||||
file:///2CdsP/U2GCLT
|
||||
Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
|
||||
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
|
||||
Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
|
||||
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
|
||||
ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
|
||||
ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj
|
||||
ftp://alv0e-s.88.nJ2B34.ps/s0TgnaY?yOQUt/18CY%16IzNSQu/LaT3dD?io%80LBw%cdXDHU3/ppMyv/DbLDzyceaC/Goa%f3gn/5ebODAP0NAOD/6NkL/uP7CW/gS5TnaS
|
||||
http://278phvcx21/QGOy%395L/yy5NurSi8S/gMr%553%C9q0S
|
||||
z156ky.MU/.b%daGKqc/jYZkXK1WE/Abx589H6tADH
|
||||
Ftp://x68qwf2j7k.nc/qyZfwo%8a/
|
||||
ftp://yd.ng:40759/L1XAGIuzdMsjUIUwQ%F5/oDjgDsU/&Ze0Wz/ZeWR6cu;type=a#yDMuky
|
||||
Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EKOiTmk%7c/API/0cdgpi;Type=a
|
||||
FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
|
||||
ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
|
||||
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
|
||||
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
|
||||
file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
|
||||
http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
|
||||
ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
|
||||
http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
|
||||
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
|
||||
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
|
||||
http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
|
||||
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
|
||||
http://ah-2d4.ASIA/qmp
|
||||
http://195.139.142.211/%53fk2%90Pj3/V75ySPv@K5ISv/eUiXDAYc#e0%59
|
||||
dFU69ED1EJ0MLT.G8ef3o.bn:53301/klFVsh/YInBJE/SEIzo5EIoe3
|
||||
http://[3349:5FBD::213.207.213.043]/k4PbSpylXc%92Qckx/aQfV7X0V/25RN%49ZzvavLgf/re9~I?OP=nXo&oi0mm=f0e5&KK8=9V%13&Wd0%1Ce'0qnS=CFlgRw&4%89V6AON8%53jQhwUvln=r%6edz&W=Pq+T&a%F4H%51p%d9ZIU8l=uyA8S5J%95+Wb&xi3KNa1P-Xwu=&8tCH=BwNWf+%37G16&rsyBG=MnU4S
|
||||
5pn1q8q0tg.JP/%74XuKtp%F3fqLuGO/CMeC2IRRl./
|
||||
http://bmm4qto-360l-pbemedo4.SA
|
||||
sll-9eg.W6pv.rs/WtYGg51Pt%68/R8fsX4a
|
||||
FTP://r13oym76cysnp77r5sidj8sqgxzpl3ls4xzj.JE/ta%e0PA/5Jwza65o%7D6Uno/RyO%b1B/v6C8yo5K
|
||||
http://2b4ne4.5ji.oubrfdx24.UZ/%69kMsLF
|
||||
tv2yy8dnp.tN8DIWG.gr/ladfwSflp/Zr3YKvt/l1QlvEc
|
||||
file:///eK9K3g%47VnPYStl/GKGHYM6b%23nc
|
||||
file:///LtZpL/%1CU8lVvcWrTR/
|
||||
File:///yCPVGaCm/hHqFToHKZw/%29zmDPSQ6183%C8RfpdKQqkCd%51X/lyJABDQymQDL
|
||||
igth-n.Mcw.ar/LjMApEho5gp825BK/afaST/HWKafQMBv/
|
||||
https://l89xkmwfh-hprhz.tcay299q.2zruch0/uv/iM/
|
||||
file:///6yT8LrgRZG%10HsZ/CP1zI%98gHFiT/zAx4%EB/tBv6V8kS
|
||||
file:///
|
||||
file:///iYHw2RpUc/9MPLbyq7gTVSx/pYnzm4E
|
||||
FTP://[9198:015F::]/pU7tr7Zhgt/~cLd7w7.Gb/4MvIKc6iy%58vN/AGZ08o/uT%1e7vtcZD;type=d
|
||||
ftp://0dfw3ob8y.Jri1p4f-8.NG/DpihVuu3RJ/kEKaPppvl
|
||||
http://pZRLI6.ma/wAex4MoQ/jUv6Vh%5C2
|
||||
file:///F8%A5Go9qV/UYzwol/#839W58%4D!
|
||||
ftp://zo.dz/BSI/enk1F/XjnYRqwHBAyIYdC/rTXmyPP@Smcp:/%E9r7n
|
||||
nhzbw2.qyevbi.gn/Oxbk%737lUb/OBx7/VX67/%C4fxQxvns/4fNNJ9FjR/7YeGTW/7VOLjOD4/P%89.1Forp&3/wLVBbhK/3GdjIWB
|
||||
Ftp://4ie4a.fl8g3c5.wjvan5m3j.4sawo3mof.TH/wfcrCzx8%B50W24/ZxqhiPCLDP/SZbReZ4h7
|
||||
Https://j3bhn0.elhqoer--c.BI/ijN66pIVKxXjOmg/xCHrfc%feFdJPd04IG
|
||||
ftp://[8F7F:9507:280A:3192:EA30:EBD2:87.9.102.149]:4954/AwLZnTre/8g3Vo%6doz/Uw=dU%70nxbo
|
||||
6u.vkhga15zezgvdc68uii7dh0svzopjpr3.NG/rXE/6T~KV%06Kq/iO5vG/G2S9YU
|
||||
HTTP://lZSO.fr/%baWLoH/rsdViX1jMX/jKQg/aWFY%eekWu%17DTY/ASpif739Hht/hHM/oXdG6y/Es2c2Q/UVz6TevIJa
|
||||
a1JQT907R.ou7o81.al/3Vp@VDZp%9c
|
||||
http://g746.mhi.xtzovtn01w87au9.tc/%8Dn1XEzK/FsoFQ/xuL0wOc/YNP%53OS3/w5sIf7ox/t%22S9TxaTtK3/K%74%4EabDPe
|
||||
http://92-uzyzm.pr/UwJkzP/
|
||||
http://46cda.e92kuq1029.Igb3rjaqtc.Xgpak.T50lamdm4sscw1i8mq1-8.wx6wzqxd92z68sbs43l6.JO/Q7RzRWFz2/
|
||||
[BD39::62:47.178.113.23]/U4woqa77Wyygc2/cltcO5Xw%EDWZT/%5Fd@GP5vV#wUMoflXqTOsj
|
||||
Tw95.XN--WGBH1C/CK%fb%EF9/s%F4W7je06JY%49r/Y2L9fzlfd#fprt97Y%72
|
||||
file:///xjYnAHV2/g%21ZmKfq
|
||||
file:///JDyfQk8%669N~2L%ecj1/6PySMx8z%19%36/HP5GhmnNinF0p/vavqKxyBLV0a
|
||||
ftp://v2WJ0E6EX.gw:46170/R1g73Yli4ts/K%09PIdRA/DntZ@
|
||||
pVRN-P.ky/2UMoA1sYRpmUyd0/fEShDdCyd69Nyh6f/6zP%cevC69rdf0#XaOTpyS%73TQ
|
||||
http://4u3o/BKdhwRyzG
|
||||
file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
|
||||
ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
|
||||
z3ymb.KM/DdnrqoBz=YtxSB
|
||||
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
|
||||
nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
|
||||
ftp://085.062.055.011/bopfVV/
|
||||
ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs
|
||||
file:///vNLDR/Q7QXgZ/6ApHTc6bN4/yihY9ZGy%3BlK
|
||||
ftp://p2SJ4CE1KFC8CSRL2OY2ALA5TJOCN0FEM-W.biz:51412/
|
||||
078.085.085.242/kqKkywur6Kv4Qn/-CJv6i1Nxc/
|
||||
qow6.7RF9YUV12HR9CCFTWUTQRONLAM4PN82GI8E.GQ/oxUj%a6Ch2/bjjphp%34IJ/%65NQDGFab%14B%51M/QtBe
|
||||
file:///pQ%8CkB8ipZ%2cyZGMf/8USgpQ%54%48e/jCflvdl%3Ec
|
||||
165.195.223.067/Q3DEaK/58Z29OKkyF/fk9Vl/dKLw%7FR3Fzo1YsTPxmm/XiABg5j23J%1avyv
|
||||
f1442jv.3w4cg5hy.EE/8hsz%802pLxgSlD%edIt/ESbwLYo/tdn9mrEynmJF~
|
||||
[dfb9:d316:677E::2B7C]/gsORr%b7gc/?ehIX5=GTM0co5(Dmn91JN&8J=8W7wFuQfZk7sM#vYfk~Km
|
||||
[11b2::35.78.41.76]/vVfZvUimVO/K9hfOd/4gZUL=j%09PGr#o%23LnBOkk9
|
||||
https://oL2UQ.yLN-U053DA.bf/CfFIFwe/ZbgHFvLfbEYrStIS2h3r/pqd%14rY/aR5a8hx/aKWFJechP8DT/ypmeBjL7rcbUr
|
||||
https://[3790:ad57:0B63::e5f7:f6ac:164C]/Obax;zcD/Y%48%9a/Z2xcdar
|
||||
bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
|
||||
ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
|
||||
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
|
||||
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
|
||||
ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
|
||||
HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
|
||||
ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
|
||||
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
|
||||
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
|
||||
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
|
||||
ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
|
||||
Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1
|
||||
FILE:///a7kRxh8/h43TYOY6J5%31B/ZfuF%9c3/
|
||||
[46C8:60FE:7ff2:79cd:69E1::221.191.034.036]/Q2MQ8mttjsMF/UqrKq0W%E6N1#YfB7A8CHYa
|
||||
https://hnk6fx.2uxg1e9o.pm/I=LKn%a2n4/J&RntX3mUxZ/B1Q.Ilpk3Icq%7fZ/ia:4DLuk8pvsD/mpED3egQJfH/O0es5zrzwWQIC%21K1
|
||||
ftp://133.195.101.060/U9x99/nrirgTvZnm/QLNzsm
|
||||
file:///RN%7EGq55Z%D1E/U0BQ1De/o8a@zHbAMS/GOA4KUcR/uaOR6C%f1Y/u5d7
|
||||
http://[f63f:096e:ee87:792d:CD31:A1B2:83FD:7322]/tnFLqVSRa5h1/%EDX1y4cxiv/GIo.OM0/M4lBr/xgHa=
|
||||
file:///Td=wh:cuTxKx/4B8%dc%616s&sE/snROY6GQc
|
||||
ftp://1fcu78n.COOP/eDRJd%82k8FEI/7fbDLiQncgOl
|
||||
http://obp6jiork.KP/pOedzk/Lo1uNQ796m/hjLXBOr%25AB1/
|
||||
file:///j3m%a5o5blRxq2/8aDBkHng/OR1ixi5h8kX/nCUz2aDz/
|
||||
file:///V1tX7rM/7zk
|
||||
file:///1qw4T%8BKBi3CKv/dxm6%7f8s78R/%83sF6J/K%33qfB
|
||||
ftp://tyt7r.u6ier1pxipif5.BW/vSq6akPyGUI/wVJ67VXTQeuKM/yB4zYqPh/0RuHq%58G/rBTgdr5F
|
||||
Ftp://4dx-s0az06e.Su7ir.SA:16277/HWkL7hR1SW/RzpkWipV/LCYQ6/gLpY%807L6/60H1z96%90xdQ/P9jx4DVu/oFa6c#gQo%57wv0vN
|
||||
FTP://o--B02WG9T7-BXW-RVAJCJN1IALU9EX65WSEXCRHM.Aeh-m.cat:34416/3q9yW%53m/FJ9&U84ik9&e/R.l/ji0sjWb%5edu12nbNSW5c/YMGfLcesN
|
||||
HTTP://lMxNbKW@tq1imryvi.P7g5o8np1.SK/um4Z2TESWBSrcN/fNehEdgh/sW%6fCP/b2fqBsG
|
||||
http://Lgwt071.sn/HPn4x/%46zCwYZzy/wzQVoL2sT%E3Yl?974Zu=X+JuSbGjrO&Xu3Fz%a8%19%5159f0r=afHdI3%F7FNrs&Mb0hjV7d=&I43eztc=1k:3+uSz+kdJP5c+bRkUBkF
|
||||
izojrse33.9WTVFAANL2Y.ly/i3ae/5%0Br%f5yL3/MsnfAk#T6,v%51Ev
|
||||
ftp://[8714:3F6E:aa8:c8fc:4F41:b8ee:44.74.99.35]/790Ug0mWq/7yBPb/pzh4dTX
|
||||
ftp://[ACC9::DD55:A45B:7a6b:177.179.158.116]/i1q3SzWTmO%09p%A3/FWDWq8u2Q/7
|
||||
Nw2m4j4.Br9kvjf-9.3wac-fh0uk.nysyu-emjwy.cat/PGDh:oW%5F/H34QSRwe
|
||||
6f9f3nny.mq/ai%cb2SZP/qfjOd2mpEH/LUZ.fxv/#3NaTgg
|
||||
ftp://R1x5yr2ij24e42wlojnp1i-b2bsacd01stfe5-10m0-3z6cwb3aflzrgoo.it:8665/oFbo12T%3Bng=x/%B2FcEUXPHAP/Ni0qL%0bPN4#yhp%5dO6
|
||||
http://[C794:4d71:ACD4:7AC2::30CE:B0E7]/T8igmbW%6C/DE1%1DyI457M#brpF
|
||||
HTTPS://rI7HAX2OS.bsajd56xb48.FO/fn9eA4%0A/G96ogw%69SGis/1V0hqVLN6zaQC1
|
||||
http://toncwiacr.0px.g7pud.MOBI/EdoW/qUMMnH
|
||||
file:///LkP1%5BcrQ/bnkvBi6F/Q3IRXB7Kt8mvDZ/ZKwDAp%a3/
|
||||
http://6DAK.8I6FGLS.t5YJHK9GCUVU4EB6NO513HBTWAU0XP5.GL/LDO%8CDB%82p9#
|
||||
file:///%46f%c5KRhPp/skp1X/OdoS-J1foeE/5H5RIWoip
|
||||
Http://180.036.254.028/VSiroQpjS
|
||||
d54n.Agqa6.7e4.JOBS
|
||||
https://5t33av.5u7.RU/SugrkGKg/FDf6cYm5QdHk%b3z
|
||||
file:///tGHsUEMaQS/VLn1%6Au#uGnrvY
|
||||
lm.27.jv4quihwsp.mw/mwCDm0cweP/A8wSZIQcZGV/uKBboAnqevGJEQT5d
|
||||
ftp://6g4.qe-s9txq3o8vvr5e.5YWZGPDM9Q.820d8wtribsgglbrnkafno126s8vflph9tfmt0mwew/qC0bInpp/fqxKQLzN/hAj/6PsngV;TYPE=I
|
||||
file:///aR3sSgC/GJu
|
||||
w26535-k.Ut2.MS/pQP1Rx/NUKUyRSr/21x/CcgOcN4U/Jzw%C6Ft/n5Mu9X
|
||||
ftp://75.22.51.21/wFDRPO/NLI1ZSecRAfFEAy/kZ4whP%C3A/
|
||||
ftp://1h3yyf3d8sffjx3rsf3k2y7c459c2gx/%2FfoFDEyWygHgKAuo/KhJZkBlC5r3%99/9I8SMy/25_&y0
|
||||
Ftp://215.239.176.156/tNfD%09mvdOM%28zx/fc3DTw2nf/#2kySKJ
|
||||
http://Vyt.4ferfwbkbm.owtk.me/LlUtIjj/BDovC/6vJ4Wbk/ihtBt4d%acVl/ywEBIdg%3dHb/
|
||||
ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
|
||||
file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
|
||||
file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
|
||||
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
|
||||
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
|
||||
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
|
||||
ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
|
||||
[62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
|
||||
Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
|
||||
FILE:///#F9Bgl
|
||||
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
|
||||
File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
|
||||
ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
|
||||
http://sisas.ua/4CU60ZLK4VgY8AR89
|
||||
FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
|
||||
Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
|
||||
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
|
||||
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
|
||||
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
|
||||
ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
|
||||
FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH
|
||||
eta0q7.2r79g.AC:34736/%abp87fVdPCY/PvO8Uk4WoLF#A*HP1A
|
||||
https://w9zhko2rttzndzivll92.sbzum.UZ/bgy8l68/Ix72mHu/zlA4CI/IQjc%CD9%255FxJ8A/Dbb%4eTCRu
|
||||
[2582::]/Mhm%55MWThR4Ne5mZ/xniX3IdG/
|
||||
ftp://224.3.121.112/G1w1g%1DdRi/T6Eb_NegqJs
|
||||
ftp://tn.z-o3vn3n4.5wg7.gs/loxilPpcLnsI/topa0Ez/Na%70Dcde
|
||||
syt7m.TD/2dxrQQvBXC78/Z754hngiYcM/eM%3CaeYeXX/nmUwguwk97VGL/
|
||||
http://isqogte5i.c-3oixcmy.SY/jlPVRlTs4v/enCZWc3Sl1dJ7/M5GTSZx/Ga%cce%63cLzTJvBodJ
|
||||
bYIAYQ.9mlnx.OM/t1KK3u/iyQFS4EGHN3uKogL3WGG/6wn5Q5ndq8kHO%734cxgEc
|
||||
Http://wvfftjk.do/a0%644z/?ATzWOxO1k=%85ulHR
|
||||
http://fnoY09@bm8xcfjyfiremhz9.sr/E4Rrq2/vQjQKj9fwV6r51/mn3x8he7/W4xCQs%FBvrzb
|
||||
ftp://vxfr4g5ka.kn/TZSPrYGzv/KzuB%731GA
|
||||
file:///vjS%f1/ktgHPAL/=v0cZ/WTpVo1/i6XlMCkNI/kukAwc8/thWUblm/c4ICXp/f8AHkj%1C4d%9107v%44hN/
|
||||
Ftp://t4qxt.hd9ok.aUQ7GIMBGXP.IS/%7ey71ndfLh/m%4A5P%75153tpU0hY73KfO6o/E%7aAkUlK3hX3Fg
|
||||
FTP://gJ8MRF8UYWFW.iq/cdX7RYOqS/6E6XUh%fcdHS1%dcoDwHgpFId
|
||||
http://01s0hfwz.TL/C9uEC/K9uWhknP3AxHW/%c56I1zL5Rfdd/sLJeP/2QkQNP/QcW%8aA0A/
|
||||
Http://gRWSMJ90XZNPAPHL90FB.zfyopzk/hMq%1fD/A5jQ%efiH4Csr/HTFm14uSXf/jW50yvQ6Mb/EJrahj19Y9Y
|
||||
http://i0.XN--MGBAAM7A8H/Uy6czi/rrAt8esL4/iL2xLka/B3j&7Inmt7g34
|
||||
file:///aZcnMM/Hnr1PCn/wlTztS7SpL
|
||||
http://2lv8030.fimc0v081i/cyEUoud6w/gfAlE/iQP:8/dZCue4cKVM3bs/JU%d5ZUA1t
|
||||
ftp://kF0NLTJGD.HM:44827/Y6CgKRiW/4r7G/Db%bb=7xD/tE/t4ooQHdBsrw/ZvgcX/qTCarGQWa~MKW5nn8NF/dcy%1caO%b8/Di%947%2cB
|
||||
ftp://4ufofbu/pmLZX%f2wJcQO/B%e0b%64oLObaEx&C/QViF1ohg/Rffvf
|
||||
dYC57.CI/=G0dg
|
||||
185.224.223.157/h8BdA%FEv/KLK2f%86LS/gwA4rKKHLarf/b.EyE
|
||||
FTP://uhw3qgl0bvfp568.e5wkz1l.Dug75a1j.US/R%AE5DNL%C4vMl-TXG/BDSu8PXNYU42aY/MR-hx1/mC2:SJqsCN%d7#smDUT
|
||||
File:///q3iMCFXfge/Bh%cdvWuy1w%E7Er/Jmmf7DkqSG%35a/VUvFz#8%510SIu
|
||||
file:///G%E7R44SI/L0Xsc/c15wyz?8Bs4rN7
|
||||
FTP://eQ23LB4U9CX.vcrnx.2fa.k6rjf8b.pe/8L163hbbt/J%26zcQf/lkieT5x/Efa/A2gUk/o%ef9PIBhPODaAn/p8%55Wsfap/BdTfZ4zm%2fbQt/SY7rMh
|
||||
file:///7RVk/qIRRZ0b/
|
||||
FILE:///Rq_/ec93s/HMB24%8esN/%4bO%cayWnOF
|
||||
File://Yk7ie7.xn--80akhbyknj4f/y4e4%2a0yHu
|
||||
ftp://4ps9b29prywnt6-1xt9t4cgi8sbwjj6obbw1x-2y-v2tft1eei67i.Hk0u4zwmd7o9z.jp/o4R1sdAnw/Hu408%CB/HdQ6cFhG
|
||||
ftp://7efqt.LB/EIX~:Q24/b0QhE%751s%F66R7A/IFxxOD2v/uOOPv5jARBJsf
|
||||
[A645:D622:eb6b:D59B::D48D:f334]/Ulld404y/IM~6P3
|
||||
FILE:///%16b72yhVw/2BPPCZg/KwHAJ0X3QT/I49wMwmls2j%15xkYc6qFZ
|
||||
FTP://octvv.2je8.oJRUDE.02y4htgs.es/zwVuzXoFKJ0k9
|
||||
http://[3A16::]/1rhxoXw9Cv/eWk5gHpYJ/v9gRo/un2Ygo91B%A1f2p/15hJ%A5o%A19TLjzzRrGUT
|
||||
iG4PTCCG.3zti905z3.ci/42j5.oKj/FZmOBY
|
||||
Http://pclly.36XVKSPBC/Nja5D
|
||||
148.020.113.014/ASuvNkg/Zcwt4/PjpwkEUVHbjkeKOgL/%f9hibk/NT9kSmJF%1A/5FaP@BkLf/jTre%balt
|
||||
tnjbgbiparss2x-xav2mitawqn9ema07kfk6kjck.xC1U6J.hm/scUu%E5D/qZ9K%1CX.d3mWJb/-SdvwN/nFS0ZdZDNQA
|
||||
http://[3173::]/YHDIJlMkv/oFpVHGs/7Dn%61pqA%23/ZnaIIPD%6cj/
|
||||
http://i4f8l.sc/WuJNKVuflVGa8/%85hi4B1G/mPs/1KfX%12/WswWA%B3i1OVsF/Z;wC5kkDQ/XIOtrdBl%D9%33
|
||||
https://v24gyfj.xfrc5dy6xuz3paev4rggl3xeg3vxzw7cz98pbcgum8xlczt-n.SU/Mb=PxgWX/J04ScMxk8u/oH%A08nv/3oXR85tM/
|
||||
Ftp://c82a3i5u.tf/v%D5/%05QNNYI&ssnoF.
|
||||
file:///MaIzEiaVY/ssIPwkItF%EBIUy
|
||||
Ukg.sb/Q24uLBUl
|
||||
HTTP://Aphi-iog2t.PE/SSwgnY7af/VabUxcEU2i/JI%434fkP%7cO#EWmOFU%5cy
|
||||
file:///FXYZhobB0jX%5BD7PIt8H8u
|
||||
Http://asn7b.LA/13Qp3t0dY/Mk0ldhZyJP/rRgIZlOu/hqt1qM9NT5tAGD07T
|
||||
Http://mb2.NI/eOXXAC0MNiEvJ/ul6ydqIPg/3JhlWx21r~sH/ZemaBb7j17X
|
||||
ftp://7i27:54542/B3rW/LSNLFJ%74J/%e4NHDP1svTU/Kkpr%C1%6cO/2wWp%f4MiYLhgWGSF/u0wNwK0B
|
||||
ftp://f8X.cat/L7Gj-OSdF/QBrO%f3okEZ/L%bdvAyxC5
|
||||
ftp://[6CA9:93a1::]/?y057O5/l9C:/XsBy2so5tX=D%71me/
|
||||
file:///%33P.AyK6nB/QkN%011K/iicc3HEIE%C0/v_7Wl%fdzMCBnfC
|
||||
HTTPS://zv21qs.ekofwyy.f1pd7snnae0n2nzfdclk1sf4hybx97u17piaj5-lul89bxrf775koowj.as/BAc33xOV7
|
||||
ftp://ko%5BM@183.207.071.131/tq~2QxL/d%D397GnaQgKtPMOsCp7fyVobgZ/Nhnp4LAKEvQ1V/1xFn%cbR%7BVU3
|
||||
https://fiuubt.bc-yrorta.kdn.M8mascygepb0csr.vpifk.G-p35wx.er/4wvko7/Wo9PsbrLI
|
||||
file:///LRVqPEfRevRI/nHtsA5k4iilQ/22vu%674y
|
||||
http://jX-U69Z4.3vuws.41h3q22bzs.o3hng9:6629/Qj=CQmh9/%9aCSTfa%0aXvFQ/u0zAICPSGUx/MqP32INW%00mp?ZmIZc=5o1okD&WEDMM6Qnm=0w5T&gajnp=GFwK+Ct8Pds+KRsnyPq+2UFmx+cwnDnvyn+Zf0VFXyk2+Aw67fL
|
||||
file:///XRDAcY5GGmj3/WoHYehPpF7/HS9LhdHOe%9fS#!SZge2
|
||||
file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1zklKhqx/HMhCSY2QcyxvL/
|
||||
http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
|
||||
Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
|
||||
file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
|
||||
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
|
||||
file:///enqvF%EFLOBsZhl8h2z
|
||||
ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
|
||||
ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9
|
||||
ftp://o6ou6n.N8.yyld.JM:24207/aS15Vk%0eg/M8jcXu%14d/%48odaw
|
||||
file:///7NToG6xM&SK=k8/wTdaPAFLzqBEJ/zHMDPj/L.fLv57c/z8QYrsKS/CEkA5FEhQXBQi
|
||||
file:///UWrC%9111nEhh/45FHiTx%98L
|
||||
http://35.iN13LEQV.z2d.in/%B2GBtdYtQjc4TTr/gLxjU%B3c?3m8B3t%24eK9%b8=kgc0f+ew+uux%7dOI+pbZ+H%9cS&%56mm6=rkQm+dHPh3gGj+1kC
|
||||
http://nEN5ZN.EG/%0efsf4v30L
|
||||
file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
|
||||
r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
|
||||
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
|
||||
6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
|
||||
file:///gVW/nnRNxPfMXKb%72Aq%4A
|
||||
file:///Fzza388TQ
|
||||
file:///
|
||||
File:///kpiE4WSatjDV/phvv7gyfb%78b
|
||||
ftp://240.154.225.198/I%39uutdECwM/PViD~qPa
|
||||
td.KM/0Dkyg/B%65DiABz/wtqGd/i7%cepV%86XkA
|
||||
077.102.005.039/p53%0bsPeiZaRy/nQHLsKEbNdaX/nT9H%521/Zb7H
|
||||
https://Pu5aweu-29knkj3k41tw25h7xzm9pck96ey4q0gqzig27u.vLPR1Q4.vg/QANLMxa/gccQ1ekkRDr/?bXRDWO=I%0ap7%f4PB8S&t%a0Uhe1I$j$=Mm
|
||||
https://J-5ytf.nmp5zuopbj1qbl1ik2c4ihjwu6-q5dhn.ng/GDtBeBZixtl/6sgw9/tmeJ7k3I1hHJfM/2JYRt7towpNjvDWsumYmhu/nBVPkzSo/cBXPb
|
||||
http://HSZDX$An@ukj35.ve/9dLg7XrzV8g/hXhzX;2/Zw3KKwTP1um2/qej3miaDjj8v
|
||||
http://sL333Q.Zci48xtb4g6.lu/sQw4ZHF/M%99%1DNl/s58%a2sCxGQ?EgPNZ=qaG'U2CO
|
||||
file:///W%64hVsq1u9rIuZy/qO8j6EEwj/d48q1%6D/ko0ec%72/pcJo/MZQohRx
|
||||
Ftp://afq57indwrb0sjhgyczyx.se/%6FKey7AOE/IPWZg3ggMIM6%D48h/XnAuzG
|
||||
file:///wDwlQVR8i:0/mzefF/D3Pnkoza7Zo5iQdc/ckieGQos4JM#9rqA%DAD4
|
||||
9gcwbh3vcmfa0xw-k2.MC/66TaJz%FE/SnDRWAknGcI
|
||||
Ftp://%cdaTNzNPNu@w6H.V9aps/87/w@rPBGa/he%FBu4vpT
|
||||
le1u.43cdu0n4.bn/Q0i6uNz/9%275%a3dAS/B%2fpPkCW
|
||||
ftp://131.173.229.062/1IYcY/mJJ894/%89F%45HHRdA/eGlhL2MXm6Q/heBdvWm%3cVs%04/x3JjEB#2%2cQsgeK
|
||||
rtubvdk3.PF/L4TR1g%5f6/Caov%FC3vK3ofrH/pz33aV%54
|
||||
urlyuqr.ar/tzJzKM/gutrfWqv/IC%24bbmSS%02P?%24JV=zrJilQ+tH%7bh&hbO7Puq8c=K1Qt&ULqdYq=
|
||||
Https://pFOROCZ9.dRDP.gq/08VkBBPja8cCXZKLa/rEF28NoX/
|
||||
https://[5319:CAA9:0242:86EA:8e36:7086:B3E2:ded6]/Jq%C0P@jZ/KoNj84B5AJ=3jGk/7wdasVgHFexe4M/zgEZvK3vh
|
||||
ftp://Bvc6nmpdhn21400.Vo53pvqm0/u7jz0O3bbFTTegZa
|
||||
l0q.0b82ck3a.SI/EQf%a6#mhJ%0dfWnfM
|
||||
http://hr58b8n.bL0/LppkKdZGYdxiHg/2VXeZWR/T4fCmyN579
|
||||
http://1x6.yc6g6uw6htmwcrb10t4kwc393g29cctmtdxxz1j.KZ/G9lcwKju/UiH4E
|
||||
7T6OSH.PF/zfYyqdxITCI0
|
||||
https://2diizsrbfh.PK/t1zBYiDPZG8Kx:/pEN4b8xKu
|
||||
HTTP://r53fl98bazbqhc19-h-r.qif.AW/8sH0%59j%FF7/QPnw69%17Og9V9l/JAn2c7i/%7Fta3x/P%08HRF/
|
||||
qvpqmoa.O-0.FI/TDl%E6x1oUoACe/4VUZdMKL8Axud/JEZEF/KOR7Q7?ifYXMx@=&iI'!tR=p&k2Tv=Behew+RFW2c+w8NOK7+?BGH&:TYW.6(=H%B0Jvo9LvAy61V+YjewIUBKHe+lT543+BIss6Rz%25KTjd7+fOp-r+/PvG%fbP9kd4K02Z+IUXHyh&Lb1kab=FDdwA3_Z%81e&iiG=CVrO+1AhtbU1JSvh+Q;ay+Jb8c+%c1L%D4&m?r%0en=8S$wF&5JOA9WI=&kGJ=WjzqGX&Bew@sXE=cl4a+2S8
|
||||
http://jykpqk6.sc/VBPT/xNRs7JVoZKE/
|
||||
FTP://2w-y60heg64rnrmpyv43tpfhftxolu-5u.lG0BKW.LY/g%7aPAj5j/qxyE/D79g5vu/
|
||||
http://Unp.IR/tN;/bCXe/fxSdK%00%CFB5N/D0L1/bjf
|
||||
[cf65:1F97:24b8:652a:FB12:D0F7:181.134.252.162]/1jXwBjjxpC/0zKR6N%0bhawVF
|
||||
ftp://090.247.102.174/YZgWR%A1NP/f6YUa8dEOoOk/a7%59Geq
|
||||
https://Zn.RE:31587/Vam%acYZniEPiY/lBfiLn%F1/dlHe@m0#
|
||||
FILE:///FojXlCuj/OQXGX/JUHCBAF/TUAe8k7O/fnh8rautFH/e6%C2xGbsfELFVW%df/JKQk/gEO%589e7uMuM/SM%7dz%0chqvt%67/dc4fnbs%F3%5e/4rLtAbS
|
||||
http://247e/qBmVNrd4AstGuk/JkV%50CBmmp%06/%a5E%34TAY%E7/5WL:W%CB%193Dr=cl9rn&/mA9%651nvah%63hV
|
||||
qkwlh9jp618.k-x.de/xiraBM/6zj@AcW3NA/%CBeI4RpP5nz/FiWXIm/fy6YJd/n%006lFEE/uT7%284Q;fXK/a52ToS/w6jn4ZU4r8/:B~XHaw?G.cE=osg8k3&iGJ=V4&w1vL=me4QRwj&YFgq=%22zCDTqgmKC
|
||||
fjrb5z774.SA/PVZsWyA3sMJrb14P%995vIm6/dC5=Hj7?cxCp=bZ(40%15pi
|
||||
ftp://pd5mz0sw.53t.sent7dh.ki/U%57Qz9g?6/6TOmiq%6F/
|
||||
Http://g3t2w4.2AB0B.3eq7q.RE/fvvJYyHjd/%34FK%98WeZ/G5Ux06F2BDF/
|
||||
http://7Z0-0PC.txi2srk55gs1venx.uy
|
||||
https://i6.kzdyaq-v3.9j78y.oq5r.gpm7oh.x1fnc78-tli.5yu2f.3hfnkcvwoms.hWRAX7TAJ.7ei.tt/Ysy-/sRl/LZa6nw8
|
||||
Iq7sp.vLK69LN.lr/hjB0EW3t5%36/lSVsKT%3CWsL-%ADA1p%0ffG/M1S;SyAVBO/EvzIxfZpicuo/dOst%DE%E1w
|
||||
1lg7.sz/X@ENk92CPk/vVYJGN%act
|
||||
ugk7-paad2cswwq3kd82lp9r7-i93galijy4x4.vatv4ag.va/Eww6Y1XABn/pC3%9BzjH1q:sB%89Mu/WdjiQ32H/LEaekIokSv1%E61s/Y~wQYu9v8yDqSatHO8F
|
||||
http://Jmury.vc-wuwj.rn0o.ug/EhXMKL%64/CwKXyRnpk
|
||||
HTTP://V7c6lvas-wtxspcp53z7o-v9dt13mpp7gc9ezt.MG/q986Xs3Fzpo5/6tQRek0/zkdJt%605DYH2j0aVfgcn
|
||||
[0CFC::]/0611uPvtHJ
|
||||
file:///viHNVlfm/4BICnFqFz3mXP/1%0dxeFn%AC
|
||||
file:///ceic16R0Ht/b%AFXzo7oKlnID/v84LSyw/wBfvq3QVf/vuytS9wORE/tYsyN9i/msSNDC4Jt8/nPWzs35yu%ED/zvTeOit/uSVe?PyD
|
||||
FTP://8GJ0QK.rQ8H0BIQZVFQQHPAWF7EVV12.LU/dLOis5Hvn/YEA%C5Z68E%50hS/Ie1Sx/
|
||||
FTP://bGCO.apov3z1nrv.ke/cM4fSVF?%ff/tWLPVByl0/ABCz7EZc3/R2b7U8o9JM6p76
|
||||
file:///2%f5tf%F7dSLdlRwws/qnKbcUOCCP72RTJ/WTc=Xn%B88/
|
||||
FILE:///n4riCnF
|
||||
ftp://mQEGW184G.Hv3zhea6.ST/iW6mhdm/G9mpZUib4loe
|
||||
file:///
|
||||
https://A0ea6aeynb4z3fsvnh4wg6h7.9bicz2zg2-695lf1uql14i2sjf6pqh1sae2j3k8iptes.57/jzHSQ%ebP5/%e3%9Chd/#VqMzFZrd%ddpe
|
||||
6wmlp3ipb.cqi.ikf9wdku.arpa/dMq4GciIqW/aL%10jc%d5d%c4v
|
||||
file:///lT?KC#nXl!iMB3hl
|
||||
FTP://P9yyxqsh1rz2q-r7gp.h0W9VBZWGP.tk/gvbKQnzs/q1Gb
|
||||
file:///7KTju7/x2t7Qen83hFitH
|
||||
iawuqq99.AX/;aTO9WOuOPwl/UAbRoxCcv4
|
||||
http://h-juvh.3gtf/spUbB%2aq/#%9C2/LWN&
|
||||
vj021lv-xpcrzcaibfgk0.ad/dVYoNrxc5/NVH90Y7CCv%4E/vITM8z%C4?P9Y6IZlhse=7w1CwndaDA%79PY+r4Wm+esuV
|
||||
http://%d3fV6o@knpyxaoxorjk0xthy4c56-idtz3.i91eof5.mt/MM0jI8/mviceY%E9KnCQrwqA/xTTC@R/bgzg%6CfrsDT/uN8jUqZIRPdu9a27A/aNc%f4l1h9UUax#t4W~aw
|
||||
qc6iz4vjp42.9IZ.l87y.4m79dnm6i.tqhva6e.dumzoy.GG/aNgCtk310/ltjBeHJh5uJx/XMIgU=CSzwD3D/
|
||||
http://p7E5E0.hhvqt56.ug/2p6%2Cb~bL/JIlK:TS/KKKGy
|
||||
file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2zQxki/QRji6gHpMGgBaM/d%71A2CTpZv-kF0tD/Ig6roS8m4/~aA64OxN2yNDZ/fLLcgp%d0/He%98%b6JWoLAm/_aKE52/bcn8%06hs~If/IV9oQt%A1K
|
||||
f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
|
||||
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
|
||||
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
|
||||
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
|
||||
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
|
||||
ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
|
||||
file:///XoCg%EDVf/A3ibJYjU
|
||||
i44X.a8H-WP.zgmnrjxq.NE/oL42aLwl/h1unIUx2m5mhir/ZjNqL;n
|
||||
file:///KSPSz0d%734OBRur/v2feKz%7aC/SfV1syp
|
||||
http://29SB.j6/ojVDhx/%A7e34T8%01L%41BNV?6uRxM%DFd=qg9jmHtW5R&EeR=%f9,mnV.cGVNclEM54f+efsLBpEc+3V7mIJi+Dng2-Qk9&t=VWC!+5gUmI&c4c0sX%51=%03?a3mDKm+4rHPsfb%dc
|
||||
96.79.198.95/8JJUovS/
|
||||
file:///.LxM7EsLzp%d2/sOKzUh/IVX5Mw-PVormR
|
||||
5r.uL9CQEBDLX.bn/?3z283zb=k&q%d8u%aeOKQs=s2Ixcyjmlg&%52=Fc68M+%F9JLUS+4XTt7ypy%881+knwx%3CF+CUc1ZNLx)K8Ht&Bks=*woVYK?GE&vv=P+b+W%134Flc6+%2e2w5%cfPu%5BXUS+PAAvb+@e/E
|
||||
http://ol7ctcj1x.Ugk.na/jnDQG9WhW/r1cIpcqfGNMDWto0/DfPQlP
|
||||
ftp://ico390kww0.it/g&kOEETBwQ0Xnfaz/pSA4oQJ/nU1WwWgH/u9TK%34Z/x5hXHtQAb
|
||||
HTTP://iEYF-043APHCKLC7PX.qB28RKI5NNRTNJJ41MVKDI53GHXIMLM.BV/QBykbXcYpFg/zgpKZ/pVe2L5cYl0X1%37bmI2D/NIdWj_%EC6VE56mu%64M1sh%bfvNe/
|
||||
ftp://vb5vs.P5f5jmxq.sn:10748/gx%54N7WDo@FP%a9/aFd0z2V/6OCUikUdhs/F89CFSH6XHi9Pgt/CzM6Y3s0UZ/u8xukwK;type=d
|
||||
File:///B5dOvjHOOe/oUJYD5/zgi4jw%54XPx=S4NV8R21Bo3u%d5/Mbd0rcFk/%5cPig5
|
||||
FTP://ebibm0spm7.cat/aalird/1v6GldpVgXA/9akBrbVRE/FbH97%67/YfhOfgG/gPiGQb%D6?AodiI#nTfAhiF1
|
||||
http://[9396:d59e:191::f7aa]/isqQk3jC/js7gnxrTJLFX/
|
||||
HTTP://k5ifny.sa:32595/8XvVVW6Tp37x/IF0IkevEa9jqkw/58g3p/MZB%94sVPjmF7/wZD0BUp?N6P1o=nH:%5840TZNN%37eJ+AJXoM5t7+UhR&%3FCC(O96dC=e2Zqj-YxOMwv
|
||||
2hr.p5v.6aqidmeffi.flfqfx2znf.cup605.v6ktei.mi6.AQ/ky~LSgBJ/3JZhLix/blFeDQRn
|
||||
gtf7abvdn9i7cr2e.YE/-1vj3Mw/P%CEXiCFd2a9/vm
|
||||
http://3rsqw6jt.cv/n5e9YJBevO5c%6e4rW%a8/iKy-raSDu/.j6BTI6/CZR%f7I=Qmfr%dd/#xTHGb9RTWP%c9H31p3
|
||||
file:///S0Vmb2/JccbhGwccE=w/sgSbbJh/2OjHXikwMAVk/V1l0~FYdw
|
||||
file:///5fXz1pJg/G%A6MIr2J/6gwHl%1C%55Xx/xHPZg7hEg5BzqAVzK.gM65L
|
||||
File:///SxZ0jN1/C7FaB/Q63Jxn/QGzG%CEcYzLq7sWLWF/tD%3c1aukYV
|
||||
file:///T8krlfICzWYr%e6/xGDI6sWJ/jCXF%87zmV6
|
||||
ftp://csanc.mz:27249/Q4ci9eH/uQLFb8ZVrjYbaCS8/sNzv%8DY1Xapc
|
||||
file:///P7Ub83hzju
|
||||
HTTP://q6-aoovoq.j-joev5ivayrom1t474xlqxrfro.xn--wgbh1c/WiS76Kh&O/IDDo916%22Vp4/iZYdp?%66lk%24ke=&OGXRBNTxne-Rc1i9b1=b2DcK&Lyuxv=&%5bF=
|
||||
file:///
|
||||
2cc16zv4u31wx-edyjiy.cz/voFy:f8~/9kCAM1/1i8r969t&%53/V;exvHAKlZm5g/J85xEKDBR4yY/@%8dUYyVS%4e%3B%B2m/W5AXsrDE0i/#ivl39=VdW
|
||||
https://73ll5al.MO:10068/5K%AAf0p/#5deD$x1
|
||||
FILE:///a0esBQEE/
|
||||
qnta8.f9284.5pvu.af/tHEFme/OOQl%E9GOt/xuKnPxLGVEf%D8#LfL
|
||||
File:///Vg9klGYqV%f0f9p
|
||||
[1112:D95A::f9fa:5258:6AD4:3c08]/tAHstaKl7bvDJ/Hm3zObt/qSQiJ1FD/ff6EP/YLR%71gk/Qm%98XlJqp/B5%31GicO
|
||||
http://[f34d:a4fc:b932::631B:2C2E]/F8CJ0o2L5/hNITi9
|
||||
http://fp8bh.zm/R5WFY9BBHOmi3/OyhE6XN/7tZGprtgW#hrKj
|
||||
mAIE.mXK.qq.3WVWRXC8BASM2NX8GRC-L7O.nz/l%E8SjQ/D8iYe/2Qi&C3RMJppB%88b
|
||||
https://smj0v/Z8B/%96%A4mzAT/eixQJ/v%D3HDtup
|
||||
ftp://J-b0a7i1grxbx.gt/MuPMg3Ly/r2iyJo4R4opO1Xj%C6
|
||||
vbhx1cl9dgl-asht.lDN0ESMI.RO/A474Sw/mcZtSSvta/ZvpyTJ/OFCSmNJ
|
||||
file:///pedpH/COpc9b/gtm%d0EBmRz
|
||||
[B91A:258f:095f:5755:86C9:7989:2DC3:B052]/%ecPvKuwpKpSQ9ANsta/%ac=jmcQsb48Rfo/bWIMfqk/dUQF5ms%d7/6Em91E&z78/uGC9e%53/Cleb%23zyGMVzOe/Rg4teS
|
||||
Http://[725A:9A3E:2F98::9109:5272]/ijhUpBG-1FS%73%D3
|
||||
gmamwxo2.0z8rwjft28enmc.p-5uyn.u6E6AXVBP.ph/gBkpM4WFysjoV/X591ak/tIRMD.t5y766HT%5EX/RSb0a/Nw
|
||||
https://mxfwd.gg/uwsX4/vnVUhsd/igwlpT%bahLI4;P0
|
||||
https://9g5pjef-db.Mq0tfjbmqomp84hi.rf97xmi3834.403gi.TC/sLVqu3UG4/OYh%98SQXVXf7Cp/j%deBNpZoEfAD60RV?wv%90PcN9VQR4g1=H9Q5pv&4C=aZ%a7l&B5hpDGtJ5E=%85NY
|
||||
Zg2x0pwfg3xo38fwn-5rriv520uccxjuyrxov9cig.fcr1xxh8.cat/hQOVnH-6u03Wc/pqtgVxVOnlza/6I7b3Cv/8L%20%820/2GVQbVTA/FoUjDrsNT
|
||||
file:///aQa%A8K1SpUF3R/DRHzEQarZC/WpL%4a~dPnH
|
||||
FILE:///7TVlhAH/kRBTpgn2/HbYFSHYnrazY5Pq
|
||||
FILE:///wC97%71cxvYq/%16?cNGP/
|
||||
file:///u%7BQA%909Et%edmf6X/J%44H591v4iAHpgc/qeuedAPm7Moi/dE5xiL8W/%52DLIO%B1vY4h/A%1DIi3
|
||||
Ftp://3ZBZ/YmeJ68Qq/%E8%74X5e%18/QNyU/
|
||||
https://R@lyd1.xtccruqswon.GR/oHPO%79jfl1/rFfct/TI4I5pfjn
|
||||
file://Rcpx7se8pzp4sj8ooxrlfyi.cpj--z.tl/ZQtA5b0%8F%665G/RTr%2BytU/4C.hmyu8/F1hcJ/PiHi4c%16VEN/66dIi
|
||||
ftp://wDIXDXTT.vg/eCSU%14/7My9QiLZjNwKRh1/pd16vIBrmG/sXqjHnSFyE%03HA65WCMRaJGunYbT
|
||||
http://[fcf7:4e45:3CD7:4B2B::]/ZbLeVZi/mjJ6/LMTBU/V4%e0nMMUsY#'aLkxlcFi5
|
||||
ftp://k2.jALPBG.XN--MGBERP4A5D4AR/NyVb%E0rdacdy/KQxWB%0DFc/Ruh62/qApiRp%fcc7NqG5P/FQd6Yw8Hi
|
||||
ftp://sjfzvidjcj.ae:55965/r7feW9uA/33qU0/BKlBWEwBw/w3nSd
|
||||
ftp://2k5.lfssxj9iatcd3056j-rq0/Bq8-ZY8byN/Skg1r%290%40%23/X51QAJ7U/H7Ir4nHaQ8?QOW
|
||||
http://ip0176.JM/LthE/E04n2pcGJV?P8=dCpb%e3q
|
||||
ftp://072.017.130.122:58513/6P9dqEIAxnvathxK/GHoR0X%5F%8fU/%ffANo7hT%dcKY%dc%B3%75pXy
|
||||
[3157:621E::]/CmIefnv.v91v/I%E6OmZLafDS/a7JoSqx80BC9/iSPk18UXH/g6xdyYNSlT8/o34wEX?MLP%993E=%1Fao&nRDo=6svN8+d%4Bq%30jky%75psOKb+h
|
||||
FTP://zbtd.0doxocs/sDrr5d5i/%6cJnyS/5K8mb;TYPE=D
|
||||
http://1vkic.cmd-efq.st/%937ikPpb/eZh_3dIzXbtNFVxL9nQ1/7bVwDiamdDs;8zgSZ
|
||||
file:///YTllDP/IhzDW/%00H9e1IWG4%42%93bP/UCdd~o
|
||||
ftp://ksd4b3w04c5nk5aasoepqdby-9w.sl/pNe8wJ2LkrJZ/XJSanvU/
|
||||
http://oPYQ.nd-egq1mkgtuwt4ei1ax.GQ/JRpv
|
||||
ftp://171.235.253.31/gop3Q%bcUoW1/38aPN?
|
||||
File:///XoULHUnTn/zYp/#SlAGu
|
||||
0kx1j6uf.QA/lhgydNvB/jU%B4oWUd%842;n/zo%63SywbGAgc/c2LB/wV8n/
|
||||
FILE:///kcboy@/9goeE7Q
|
||||
tD6HUNLHK3.u-06.FR/WwW%7f/1HS0pUTG
|
||||
Http://c82m23a-5oprsol87jurs142tzex3957m9nrufva0sc6gdo3pajic8po.H5m3wt.1RU:11878/Odij%A65n/Am~mzHC/#ArdWk8
|
||||
Http://cd1.es/w~Uc%455aE_/wVJKfr0/X3vnA/ImG6Z
|
||||
http://5ect9i8665yca.FJ/ylKD5bCODpHQ/lbunoK/%98004LI_w/HwTFV/4@O9_DiwGb0Ig9#B8z%90jjivO
|
||||
file:///IDE/mEZee3/1B5W9drK
|
||||
http://wka3.GM/%95yhyVy9#FFld%0CZGoiP
|
||||
file:///nAL4tAgn/UK?mpt4IE/.2JW4Ej%28uiG/LulMqnbE5
|
||||
ftp://973k1fnytm6y9hx87p42k.1whc75.PS:59063/nxryc0E/ooGHQtw3ik5/6fU4vZmZNZ10If#iFXkFxd
|
||||
File:///YTIL%AADxyn/exqQCc/HrBwtj3/DIOgKT4YUu
|
||||
http://3ucol3f.lr77xtr.LK/FNsRpDDW=/76bEzBTI/q30mQZ/
|
||||
9sb.7mct69t.ar/WpXcM8498S4F#k@L:'L
|
||||
ftp://3qn.XN--P1AI/PdBsWGhCy/QSZ%06xb6atX%7eXtqSy
|
||||
file:///t%48r6pvw/gTme80:slEt/ciBvu19
|
||||
File:///8rjryYe
|
||||
https://[887d:5086:CAA6::DA5B:192.032.127.177]/
|
||||
File:///v%2CCgt3%32kh5ZJx/~kf8WDLeR3XmmY6ap/.DEZNJ-ylM
|
||||
file:///KNINXVO67tBU/VWJdbMVH%a7uqRO9%ad/55Wlt5O41e?/YGhF4Fm
|
||||
file:///zYYquoqz/%240zKPi/@k9J&epm2dka
|
||||
7JUE8WA7CLBX6ETD8KUU16AFZHHS234NORX.tep69aqao2.int/iZjrUNXtQfBaF/Z%A87tU/XfvTnCVEY%00/FUyeI05%f4#?hZ
|
||||
file:///1?Msuc%BD1/G1%33Ppp/F2Sv%0EJIBnPzEUu32/81nqxxTk1HPO/7pyYlewH7gyw
|
||||
HTTPS://hdtgt38onqh18-617otg7tn-ut6f49po3gaajt47.m4O26.rwko060q21o.Am497x0kow-u.TN/nZX955o/JtBhKlvv3r
|
||||
ftp://28.118.125.16/3j69z80kruR/TXIM6gQFdZTCI/T52CULszlqMQ#%C3OT__%57
|
||||
ftp://y8K1P5I8E/c2Xa7CmI%d6TWC
|
||||
225.022.162.113/ZF58s/%CE%56BA5rQPOLU/AUNP8rG/w8SHG%d0FVsZX8dC
|
||||
X6eygmy.1a-mtt.ki/WC9%a6/GH9mNozOi
|
||||
94h6rdisa-eh.CH:8242/I8Ik5%42881r/EsVYPHYT/Jw7%3A2%2778ggZ8u%60
|
||||
Http://89.pa/%65ssgG1L:fKtE/PrmY6WoXW/oYH2AfHjf/uVaFyqn%ee0o%4fAh3
|
||||
file:///KwM8U1%EBR6J/K.asJbs0/i1vCxd/ZthOZxt0IKQEH/#x:Q8vtaIw
|
||||
http://rP6.Ewrowee5k83.COM/5CId/KVp%FE
|
||||
ftp://l8AAQ4XL0X0HO6MF7.9d.tw/%98Vb%117Uy4/KyUMl9
|
||||
Q293qtnuw.vi/6fi1J47ebQ/d2EC4A5OM%FF9_tUNs/dk=?YyGXS=&El=i&Go%cb=fb8&7W95=Cg49VW7B+B3dDs+f'fhi2+6QLTS%bbuJ+IN8+1PE7QyfjCX7tY%7D+cGm4+JkozC,0y+SEO%ac&V1pkpm0GF=0%46pvcEyU2G+2%F5kBuG
|
||||
2pu1.mv/3uiG%445F~s/%5CTa0YXuNMsqV/AwE3d
|
||||
file:///jIjyqNR/CBgOXsf%8fYiqCR/
|
||||
Voiuuc65jm4ven-9li9.mii5.0h5xt6.KE/qachnQB/nsC%4ai/juYvC3yTiCp%06S8I/LLVvQY#p1jmTyx@W
|
||||
Ftp://ydhhq20m.MY/%ADNIfcLl66t1fl/v4%a60h/N6My%9AKXUvToMFxY/
|
||||
14.21M1I.NU/iqlGVazIWPCvV/oelkORYd3Iwsdy%0D/LcdN7U
|
||||
file:///
|
||||
https://07zje.j84g-9lx-673h.vwr.km/h2Dv%1BFR%9d/NV05FON%c9/klLPUVUcp/LRlEGREG3H
|
||||
[836e:5fb9:0cda::D9A5]/n2j/Kjy0BzJ7Cj/GoW1ksyHG%B5A8tw;v/hIg4F;R%2Ax8nL/d1aHG5Vsb/VNMIiMx
|
||||
[E69:a743:5C18:C43F:780d:FDD0:EBC8:2ce9]/uAWRrcx
|
||||
ftp://B3fvr.l5GW6REKV.GI/0qT%dbwWVXZ/3kdb0/kBQuFu/R@9WXH0
|
||||
Ftp://a4gdplaw.TP/zyf2c37ZfY/QaiwZ3l/CUi9.ado/
|
||||
8L.vg/LjRJZ/z7/Fkg9dwmTDSp
|
||||
T7wos.u6I.cJP-5HQQCA.9dutej.SG/6McEZ0
|
||||
jJ0D1X6C5CCNWYGOCI4NNFC5A5NYJZTCW65DHS.d1yxpq.TC/EQ%DBYuIdBv
|
||||
File:///YGxWV18/%B2bnYvE/COmzr%B0YLEB8/%75L%c5ym2Hw
|
||||
HTTP://nzhfr.Mlrs1k026k.KN/~bhI#qqgVS5YR
|
||||
https://z9z6ip.INT/1%1dXkN1P/KI52I/yo%FD13SoZz0?:z'X3xwoS=1y&lmDOOEVzwHn2j=xfbMj%67cy#bKedfyI1
|
||||
FTP://aysc5.8i8kj7.cu/Ule%55%F0l/HV%7FNXdQfhjf0/
|
||||
file:///UZg7IFvJd/U%6cAH%59cS/dQjA9gM3RIJ/cW7Kuo/lBGa1%B3Hjf2aN&/
|
||||
file:///TPkfDWADgMp/9cr6zwO%38cZPtrql/w3GqL/nrvKR6Kq91#s5F4qQMjYx9
|
||||
http://1co-4k.zzzqb.XN--KGBECHTV/WRGpnKFny/eBiU%BDapp/0cb5bJ5%24J8a#N*cE%e4BmH3Jse?2
|
||||
n7q2q9b.3-ve593.eb368oe.si/xsA7jCLE%5CRj/gEfwCC/W21RJFHtG7td/fSZIiv/6mJkJcnid/xFjV%DF8pXhf:H/vh4Z3%efgdOJkeT6sTC/wUOxqbX
|
||||
ftp://[7D66::]/m:wnkiFBKJR/7c8a3te/mQqS6ZDWbfTXtZ9
|
||||
FILE:///%41PSndZFnAZNuF35izYcj9Jmt/aoJ8K6/nGtfymyBi/
|
||||
008.245.185.106/0Aq3gb85/6TZk7/PVTk%b1G80
|
||||
ftp://90.188.10.180/fgsPUVSAEgMuLwrpxg/8QEjGiNEHN/pxjBgdVV/bkiEKy
|
||||
5yxzap84dz3lccndx3xoj0zcwepy9ujq4bk-ckyo63.si/%E89rzFXG/htVDvVdD11S/SLLVce1/%5bgcDSkD
|
||||
file:///Mr
|
||||
dm83f2l.vvlpnpob.7si.cr/RFT%18uMgARxsP/8%61%7cO/eZtPUg%e5FavR0XRe9wZZ?c94ub=63r5
|
||||
file:///cdgSAblie
|
||||
http://[5b83::58CE:d882:36F7:8b56:11D4:f42f]/9mbBwV%C4/AI2q64JsNqHO?tZ3=nATs%3CQ&lbSzuIb=/IJtfPRbcu
|
||||
ftp://gOD0KB6HB8JDGK56.l-V4OW.sj/KqqiLzCu%6a3jexLbLB/%6dBHZb%29z72YF/
|
||||
http://s65E1E.TR/5sj4rIdUt%CF4F
|
||||
ftp://[0f52:d55d:5574:ee10::dc96]/dPEbp7/PG0Nfo/MVx3/%5Fzz8%CFXb
|
||||
bdctmj.vzaax2fe.j8S2.ojfq-b1m454.g7I.uy/o0%28WV/Bv9nDwD
|
||||
https://k233JLHW6N.cCA13HZAXR.laiu78y.fleptcf.brva6c.osod.GS/OB5inpGTj=gGI/YNi3_gNnIg/J8UObWz6z
|
||||
ftp://enokmi/r3%690T0H5mfdRq
|
||||
http://s59w.cg/nJoM7yv/Z2T9Xof0hNGhl/N0%6b5Sbrbtjj/
|
||||
ftp://qytw0h.hkdt2rm.gd/3a1WJDglP%cfZ
|
||||
Q-2pgsvifg.yr2ix-c4avrjwva.kn/_zD8ad/%8AVwQwOG/JMC314h/rO0qj%88?w0XEY=JUigA33U&f2=n3tXrMH74ApC&fx%BE0=b%d5mgX%7F&1gjjJpHG=vLHCZ0Z8&sYQBW%FFAIs='&zD=GTnVzkf8Yn%a3L&Xm%b9F%32EcwWl8=GUq
|
||||
File:///spqq/8F2dG
|
||||
1Z73HWVULIKOO5WJ.rEJGR9.nsscy.gf/rHEt;i5T/%50ZjYYJ3M%4dR/WlW0C48ocnb/NRA~0M#
|
||||
078.104.235.053/8KqfxznOtxC/ycYiTG3%11zP2%A1/hhbuX9Z%d403wES6/P0gg5%94
|
||||
FTP://58vs5.g0.tHI.gq/N4HSp%95jtMMNr/bpH36W/cC3oAe1C/Sp7gxd/XO7JSqE
|
||||
http://e8CYICG-3GD1Z7A0V121.Ya0j.Wy.CM/BLyz1kmpRF/nb6u%52/GpXGTv19#9?bwz
|
||||
File:///Mze0xLtXpPFW&x/_%0aYP7o4Fm/5&809/fsvOYyn~zvJbT
|
||||
file://V-jo70zmqrppoeyva0hm6x10y.UK/#3O9f0OYdx
|
||||
file:///K4BV8xTq%ccORyFI/8PzAVSZeBNFX%adT
|
||||
071.247.240.193/%94VOUi%ac
|
||||
27r2mghslc2b.Dwbpiqi8q.gTYSL3Z.am/RU80/KFcctLv/R8tG8d51EaD&pno5r7pDR#GWY
|
||||
mdfr2j.1FZFG4.VN/Xn6l%6dLWufM/I4FHTzlnWx%7BoI/ueeKx%03mfSA/%9a3PMEt.iSdeTVFgSnLi%C84m/6dh
|
||||
http://H4jk06c6mtprgjywnc40mjri05a.VA/7B%C0h%4fCjj80/TrN5HugANCZu/eMVdn4en/QUSLGhe?7yjqzvzv2r%b0I=&p%C32*HvmS%39g=wb8u&lTvA=FCGNF46U+?Ak.vpCAV%ceiK0f
|
||||
file:///cVjI9Ue/siOD/jynyp9%3FmBx
|
||||
http://u8ic-x8o.UY/G9pZcTp/JI58N
|
||||
file:///cCOIlZV8ms/Y%e97nfvexWwxq%00/iPxdyY/snHA2QZT%10
|
||||
ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO/PvL0%19MpQBv/
|
||||
FILE:///Kywof5D5q/0TRS/zayrkrnENB
|
||||
file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
|
||||
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
|
||||
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
|
||||
file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
|
||||
file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
|
||||
file:///mJM%a1/jv5%53QDqE/bFMu0CBp
|
||||
[a0e6::]/YR5lwpHlG5BPjr2XT/Pq%e4kWAmZ/ucI10P1
|
||||
File:///8YorWt/#ToazT-v
|
||||
http://2igfcm3qy.wlcgdxv-xat059qnx15a7qp-p-p5oph1c8.GP/hS4Aqy7SmODbaOH
|
||||
3s81j.TJ/pS9Jzw8:NWryq/%00Kh1/Y7Rfoo7haw?pYq7Efg=
|
||||
HTTP://k59s6i5o.my/v9%93qqGOWZ6RN/cdz6V4ly7nM9A/F4EhM0N2%53H/d%C4wWTDspWU/zfpMcIDWp#oO%6fSILRH
|
||||
lvh-kt.TN/xZghTR/yDiD0a/P5D2%37rFa?rseH*%33ubfv3=%36ntM9MP,+97RbF5&F3Ia3L=%3djrAi%f7E2%65iQ+Uc43&y;Ikw=vdfmJW&sE_%F6xpm=XFIfCsT&k@ctNa=%47KDJKEw&d=am6K&%25!BjLNa=iqs.l
|
||||
http://Lhe7w4f06qt8tif2af1k6s552hlbk.mfce.cc/DEqiQf/GLpkeKZAxhSO4m
|
||||
Zy-iit.Cth-tuvx4.au/dl6DMUqP/wAeKXt6
|
||||
File:///35GJ%C8m6ubg/kpI4iEEx
|
||||
dbe.gkg.EDU/cJ%fbQ3k7pwp5/arlH%DCD
|
||||
Ftp://e8ni0.5etxvrjvn491/tP8r:UC/faEdqs4P/v4zJax4
|
||||
https://4PI.gg/fFtQoVp/b6Jf55/YEc2l7dE%CA
|
||||
http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c4STIJ/CmvEGAUx9f/
|
||||
file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
|
||||
HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
|
||||
5.Piba4ac.JE/55M1H/AZXdj
|
||||
m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
|
||||
ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
|
||||
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
|
||||
Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD
|
||||
file:///NJvRsBjo/IECCGBvb
|
||||
http://8-6wji0x.tCVT41X.k1PS.15p.SH/e%daVn5b%f6/GpIJ%65e6/VpeXUmg#FRgJm0E
|
||||
ftp://nx4kcydiztae7fr0y-2kfppteds.gq06u.cr/RITrTqm/VqRIYR/6psgA0%dfpfg/gcLyL1/xa%72QCL;type=i
|
||||
file:///M0WBSuI2qsMuKSfOzj5S/2N7x7nZg/BLtq%72VxjcR/5%EAn1%c6TYYPGe/Lb5Mtu
|
||||
http://94MNP6XNH.0mgqklz3t9g2xl89x81-a3hifmff89nahy62jeyhuhe8lhkuafizl.GQ/Ajpa4Z1D0o/aVv748s/NAIWCkWCD2hj/7MZS5c79DmL4/ieQ%21gw?oEPqIN=Pm9nPx54%c1&j1y=C
|
||||
ftp://rKI.COOP/v0pdu1zj/ir2UM4X/7k04jhOKPVN/7ua%E5y8p/bl~yS
|
||||
d-IJA.PS/drbtmJGFEbR0OzDD/wMV2C/krWmMUV85/0AFhGe9
|
||||
[D1BF:D02E:140C:4B9F:c86e:9fdf:077.173.119.180]/A07Ox%86Oae/yhjXUMut
|
||||
http://A.bi/J1GPah/OT741dJ/Jh3Z0xb3
|
||||
ftp://6VMV.t680F6.ijsru3.bm/vlJmkK/go28Jr/qUtmHmqhj/ykeAVxYoe
|
||||
HTTPS://oi%32Yp.@a4mk0.Teyu0lojs62d8l96qiym2v477ixatleasrgft4ttpbfel9r.BW
|
||||
x37MULG.514yrp5.Vrd68eeufzt.VA/fFMWutSw0d/Gr%BFun3/JH6%DESQV8f#gn+NM2
|
||||
http://2.88.82.235/6bhV%BFGDy%ABd/g84ly25/;4AeID#
|
||||
https://a860jcplfoodo0yq401cdf9.1ZE2P/NLArIzMZ%8B/6UiHWMMGS79/?4N=4U%1dM0qA31&faSM=0q2RaEJu5QT+vzNMp+XR%7dI4dQ+x+%0BawIYp%dbcBiOZ*Sc
|
||||
ftp://lb.NP:46239/xwyAL/m74%9fqj4gttFLg/
|
||||
s086j1-9.Nowi9s.fm/16zr3s/mvzfyWbB5/&1mzA:X-3
|
||||
eigz5dhw.jynsrju0t044lcc.3c3bfm.int/%ffoZ_kP%5cO1ls76B/pQbPDb4s%4E6i/bqqrZ%b7j0uhrgIHd/eBdSEwfGrX/PSmYMzg0%6F?Qr%92y11b3=&L;5CV=zJao%31Tmm
|
||||
65-ihklk4j6m.f3CFA.7kj.qa9rcww7uefzkpxbf87ni28b4a1i9rjqy9a.5texnqlc9.cu/p%CDK%b1%449LH/IiLqpww/HmACJI/r46TA4
|
||||
133.38.197.20/pbgvKM6W%BCEBN/Cvcu0&#idQDycc
|
||||
https://4I2GL/cGtyrs/%A8m5%3fekPsTRWlB2?rn=63P,EJu+SQ1W+uPySU8pvA+%f2+m+CwuUokAVfo+3nzWcQ+S+iXvEuhcv+d$h%7fy%cfMB
|
||||
HTTP://a0br.o0gvxf.kp/zZkWq5hfxy/q0x-g0In#bd%1anKx27
|
||||
ftp://[1327::117.246.244.220]/%91y4%09/
|
||||
ktefq.GB/uTzbgV/9nYvIs%8412/ynKYs/YwBOWmj
|
||||
File:///08bP/cw3Ydr5Cyow%273h:O3Bcok/0hIP@/
|
||||
[018E:4459:9892:3770:3826:71D8::]/UcHNufii29UtPW%56WQ1%20V/ybjTB/oUWWQ?yUg1%cb4A=wk+hOic7f7Sw
|
||||
ftp://1o2z/4UWsX/uSzHOw3JTrqy/TqZhkQk%62gZ/FpK/
|
||||
Http://kZYPZSRN.1m.UA/QN9n3Nw8kPAgkCB/SzdVcxryKou7mMG#p6at77
|
||||
http://se9g.s7-5qnlmsi0npbr8ouxuey3y66swspkl.y4.st/xfP7%066uXWuOu/clIFhy
|
||||
ftp://D4j9grnngs4a61b.im/f35gw%53rTeI5/#Ff7A0YMs9RG8t
|
||||
https://zujspr.cr/zy14P7FG3/Oxznfe/P2zpT%38S%FFVfP95Lh/nJJgzX/kcVuHCzV?Y5vMC=3X4n%9dMqeGjM+OjgETPdf%23b1+6H%47F+waIQ&,ZxQh4G%8AZv=ic+fQWQN+0y%523JTe0Ti#OA0m6iC
|
||||
http://141.171.118.17/VLnEb4Y
|
||||
https://sla.aowts.MQ/KbP3AV@wXFSgz/TauvS9f2/zvGpvN.e8a2Kw1ho?jYRUP=L_IAzw&cj0ux=xz&lrA%8bS56%A9=SX7NjQ
|
||||
file:///
|
||||
FTP://h6.MG/XPmpsZk1h%0B
|
||||
http://Dh4mlm:8000/k9TYvw/EWxlz4%97lBf9oK57N=Z#Pm63s
|
||||
https://8-lno5.KM/Uco2E%dbYPx~/MzKrkZ/rDpXB7OWtD?Wb1W=bKJazR+yRD6c+qwe+H3bo2ACXXzkVX+PdfgOJ1Sqm40+X%3D)%AEgm8I9&inwrA=%FCe+%f9Xo4S+JrcmiNbPwa7P94J&fMCr;NellUf8=K&lhgC1k=%32CPUA6&%dexj,m=l
|
||||
http://bske9znh5z.mq/rF739Qhneaet/NTfzZn
|
||||
http://B7z94v/
|
||||
FTP://p9s.hh313n.6k3.DO/xaRRXPre
|
||||
File:///Sn7Qzu4cDoJY/6AdR%8ccbeeFmXy/KRXtibcbXtTaLZt-bb/PISQN%777zoI
|
||||
FILE:///IfZ6yalAm/BoIjbMXLnlo
|
||||
file:///kFKgAORyDOV
|
||||
file:///f0l1v94Rmms/zIVjJg%338Fy/5tMPO618wd
|
||||
FILE:///fpbiT?6/%0B7dUkWR5r%AErqLW/v2n%bet%b3wV8Yzi80OJ.SguK/vBMyQaKiH8/Wy3l7r/D%B8Vp%51GgmqIBUHA/9gn1:46Xok/NcNIZ/FIK%359u%57/%35NvYIQIN/
|
||||
FTP://22A1D0QMF.cmcve.CC/cvkZF/H%4EkZr%39EjtfIO/LPx46D%5AgqR9
|
||||
File:///0Lld-DX/&Qmx07f/Zp%21ldGQq
|
||||
http://rlch.COOP/%bcKE55hwH6/CKHB%2Ak/Qzsn2Rn1p3RUc3H
|
||||
http://h6d5js.edu/IO%34xTQYL/OtYPRaY5/e0ILXZt/jNP2%07otUg/vGyq3xN/DC8P4ckE/JGfiUR5EfFk/vSlxbi5dKL8d/6JwRI
|
||||
FTP://Sho0e4ay9e.XN--KGBECHTV:41333/6_5S71YpwTC
|
||||
file:///HrmxzTn/sozw%db8Jz/x0czCVWgklrbV1Kf@IK/Um%78PuxjtjI/
|
||||
FTP://9m4b5lf0.Y5dnwnduzx9wha22ayztin-t7hng5b62e07rzsv55325xgdrzwx.gov/pmG%45dhnQZ
|
||||
ftp://t2ik0rgw.krjz72-l.xn--mgbaam7a8h/I%19KxMhY/FSau72W7/WkW/vYKyDkhzNiu&Bput
|
||||
FTP://[221d::]/BOKtvhabe/b%78z/piR8RBZb
|
||||
Http://5zwdz3h27.q9l27mto-5v0i3i1yu8oyl.TN/wk91N/X32rxh/cmM%01iQPnCulto/
|
||||
FTP://gWUFGOXE8EW.1g9vse.xn--wgbh1c/ncQo%42ihY/Tyk216/;type=d#J4A9HEH
|
||||
FTP://5wudd.ga:36706/W5a2PQ/%98Oin@%D5hjD/POMMY0b/HhPA4HL;type=i
|
||||
file:///E01b%6ew/8QW%66%16Un/PWDGTFrQUHJ#dk&o~V40
|
||||
ftp://p78orte1aiif9.zk-l-n5drgvx2kj6i9e034ck587-utyikjhal.qE5RJ031K2FAN-35.v71jyg8l/wgwpnw5/1WPLlSc8/3RZzlIEZMlC8/ytaOFdSuPKO%72T
|
||||
tri9.Fyhn.SU/YlvVjSi3M/ylMdK88iRo%d8/cuHyS5Am1oeQ/XM40zgdj/q%9CLKm9Q/IOwvLrlTi?nDUET=e95%a3qf&dSTE=X5aY&pWtb=&AS48RI=71Z91stUL8Oc&z1%B6=fVvMzZUyI+Niwre%5FXyVRF&QtAo=5
|
||||
Ftp://Kroc.Ls4-tkd7.sg:58219/9tq-FJyL?Qb/e0alokGZ2/MKTHP3Wsw
|
||||
pmg4ty.m59480p2f69.fV.COM/X98xZ.E/cTleUeS/9P6zeVQjfd30/eVVvE4/Zyxm1SSqe9u/WP%a5hS
|
||||
6P.BD/du%F8CoA/W0jyU5x6HXyVB/EOpU%0BP%BET/TBlhd%772ObORj/PNPXkVHaEY
|
||||
http://5BCY.X3.SG/N~63s98IV2/?KuYCn%3160U5h:%BCU%DD='6uk3OyUbosbcu+l7U89Ozt12K+P/VK4+GhwEZ+D7Z5ByEYxG&8=#aa7R7i~K
|
||||
https://38yyrnu.UY/8Kl08k%157n9p/TEeDKN/qQnmQFd
|
||||
http://5PXM48/G%9fUxcBwBjXI0/1UJen/MF%30I6/eOsMzFMiM
|
||||
Http://s8AL.rc94r4iftx7qeg4cbjjv5.za/mYk9UAydyn4q@w/T7K/dd%8aIXPp
|
||||
Http://130.165.027.114/o8bwef/X%70neu3uGKY/NU%f8xTKW0;hTKK/V;%edBnJYWG0MI/ZlDMtVPK7?k1N:WnR=%3DNffenC%67+sf(z0U!mZFe+6YqpF0Ei4l&kea=&pv=0FrYO&%69j0HYlx=HVIq&sWgaQHZnyxp;=%97SOx&QbgYd=72tO&ugOWlP=TaHT&Zg5o=c,2tzpy&Xr=Nltupn6k&nxkPS%10oJY%74jL8=5c%58%77#E92Lme88eh
|
||||
sat8a.cc/n:G5Bs4/%92Qx7YH/%933F68jWsdw/mgMLj/b9uFtDS/fCBe=77/LYHeH
|
||||
file:///8NiXGOZYq
|
||||
ftp://[14A4::]/6gQ%83ppX66/Fm%0fhsGDdq86c52B2AReDTW/CGafhb/4LAIXfs6vOHd/DHtw5%A1
|
||||
http://astx.i8o5jdypn1ly.LC
|
||||
Ftp://7j.N@Ptavog8.gh/%FDJUUJB/nrC6%4as/AM2BxLCU:fGwm
|
||||
file:///LD3OAKQVR
|
||||
http://jVVR4GZ.BG/XELY1/P=cusbVv5o
|
||||
HTTP://4fx.3kt642w.GF/k4Nruf/hyO_xzJ%982n/BhxTVE5LR/VT7cIG%66726zz/YQCAvC/eTYPd%2Af%18tPt6Y
|
||||
ftp://1py.jhl5-h.53.39PN2C.xN.ps/Q6kM9aOm7
|
||||
1MRTJ51.mh/OT
|
||||
file:///RlgHP4tRuBYzCPY/
|
||||
http://[8F09:703a:5b45:F653:AB26::]/C51LFNl/tS8p/yG8y53@Wb?eBrhL=%f0Rj:Vl#%11Z
|
||||
FILE:///TmzdtWFH/1WP2R%b3nSKls
|
||||
http://5o0a8epm-rx6n67ta82256jav-nk4.lb/HbOqUc/TIVeqJ7Ohp/BjDwRDKJ/JZO
|
||||
File:///AvnO.7k/P0YrByEN2yEm9%1646/QKj7fR2/%1F0JYW0y/qscsiKGeGfPA/1rkuJyne%12/
|
||||
File:///1Hm4/bcNXO0cG%45XJo4RK4/SQGEP5/ELAGqI
|
||||
file://4jc3bg.zs/WfjCr2aeWME/Nv4A4B/invk2d1h
|
||||
Vj1.Ngq.LI/FR2%b7RU_z%a1Tf2vy/rysXmZ0/
|
||||
Ftp://wkws.yi8srfw.tm/sWvr8nVIPq3lD%16r71KGXZx/zTdcV/N%02%6ER5gChmS/uxEJA26q
|
||||
Https://cf3-0aw-g8zmm-k.AO/mYGm9AqQW%E4q?6u=&rX=
|
||||
8vv-rhcodmrr42jd6zmrnl7xa.F1igvm2.RO?rQOIRt=Q&Z8=1WyCZjZv83+lpB%7a
|
||||
Http://009.130.112.154:65403/z6iLA6cr/%3edXQdq1/yHKzFjDA3nAKTr/Ot4A3f%4DIzccRDaDQcC
|
||||
hwpmi.upmzdzzhsrz.e469.ee/SXdNeY7NHR6/Vr6%FDr
|
||||
http://[C7E7:57e7:b08c:9FCD:4B77:4de1:229.020.164.172]/LnIzKLn/StXMmto
|
||||
Http://2-6SB2KV8V8MV290SIC08D9J7-IRM9FTPC8ZZ.hwo9el74qqv1.zm/tr9K2BSFkbU-A8wJR/CGEL_82/cnMuBB%a3j34
|
||||
file:///fUtCm%b6qNK/lltu?NvBAhM/sJ8pOm:/jJ18OTM6U%f5v%3f/
|
||||
http://76OXC.pn.GA:15181/OPErhH1cHtl1ba/eIPkR6%1EG/8fVd02k/Ky%b0D5izq4k
|
||||
ftp://154.108.127.0/vGpMboeazp05/usfmVeitt0pf3o/Ue4OMVT/sJ9BAYSLje
|
||||
ftp://ivbv0.zCR-0J.lku/6m26/7tElM/%b2%0BI.Ft5AjDVp/oWyMVmsG/3%8E1FE8Y/0zdIl/m3otUSQeI7
|
||||
file:///0Y7NWf4qwhw9wXP/6ll5YWM55W%9050rPeqawX%F9/HleEmM
|
||||
5LUX-O.q-33d.tn/smzXQJn3H/81mg%4de_/jb%97hT
|
||||
http://84W32/CCKpkt/c0bqCnoQ5Y
|
||||
ftp://nyqaz.MT/0OfOsU7S1H9BM/OjhdD/izbR4txUY
|
||||
8wo2j2c1z9s.ef2ki0mlvvnjm5vfyu.t5a-yb41uykgo5kn1qxzffhz667dty8mytg6ir7os9hoxwm2.mw/%39FEVmD/%a4qRT5W5qW.yR/8XB9NHyB/
|
||||
http://rbf6ezzlhpe.hk/%0DK8/IXXJAsC?mV8vvDI8K=6t9%6EG1Dt+M7N+D5n@Vd79n%d8E+gj+ofnZ%16loobN+f3-S+e,IH&lnh=
|
||||
wu3w.0J5.lv/m9IZaWkw5/xY2%54pNYS9HL/Nhfns/e%bat2cKM/cUXgRzm2Srdt/2s2u/9h8zjwh929Bnp
|
||||
https://209.73.217.17/dJvsqDH/RH6Ok_eSc8wO5/BOJws6/9f0DvXJ4/?%ea'Fx=P&6h3zz3eGCtK=4MF76p7Em
|
||||
jfajtdt5k6gu11la2jbih.MA/zcaTNUL/3q%31eLT%bc3S/L6v2rt/WtbA0%45~TIvPD
|
||||
ftp://Defi-z.gr:16993/=7IIaMpVy3OLs/QtQD7qF5Vr/=RVbNDH8/y3oUHmX.v/Td%dcbiGlArA%720
|
||||
ftp://[544f:e60a::8772:D633:DA1F:081.021.019.189]:62615/%CB6Wy1K/X%0EcoPQ/IgnCMLPynfx/fdFHb
|
||||
ftp://1INQM6.4y.RO/
|
||||
Http://T778hd416.g9r96v.bs:64804/GbWp%47K/zgTKs/cBHzmYZ=AI23VY
|
||||
HTTPS://6hp3j2y2tuakzv1rnq9vnvn1w0j6roo3if:58975/vH8BLTu3hzkk
|
||||
ftp://Ye1dfbl0eae8lqiiqaojj.JO/8EjAq0TzD:/Bz3Pm2qyWo/ZX58A2/yjn%9F3xJZjsVhw
|
||||
66.242.9.138/CYHK1bGpZ/5yyVD%cbC
|
||||
nHZMBEJWO.ST/ABXauli3wuJ/WUxhKaZJg
|
||||
ftp://[8463:c210::b5d1]:34094/8%AC7Fc/Qh6%62yFExJbdaB/0cAZ3iSKlk8sU;TYPE=D
|
||||
http://vmlyl0efotpfd-tew59kcpsi2u7qd/UbXy1Cc/L%0cwnzmdjz/?iy=N16BnPMu1+eYFk%f6CB3z+s4Re5v8+MFTU+k+JDiN_+F1k&C%D0k=F78u+euh%1E1uzTGQio&bL_2omAu=iEEs+goL%b8g6+Y%3FBcek%102&WCz=e!Fg+MUif8Yba0k+uX+A91YO,Um+%70i%818Fpz2&6fP=HlD+%91pW+%f2HR6zs8zrE10ZPH+bWA.BB6k+Df3w:X85xDnDjSiPY+AyDpuSl4VEVTJzA3g&OtUR6=
|
||||
http://bCNNCLT.gxa2sbn/lAFakp
|
||||
D19f.oD5.bb/xUG6W8VxTcjMG/jYMuWlVMygf/UtIwE13c/%a9wzpO%AFxQ9
|
||||
q8HY2P.r5T.AU/nc0Iq%28QAF/#yOD3%b3UA%d79e%1EmJp3
|
||||
dPY3X09.AC/STpa%97U%b53yKP4Te/%71KZZvIC#nA1W2z
|
||||
ftp://3gb.xgjm/wF%ado0cM/u%0DmCW8L/d9Ss%61dKQ
|
||||
6m.56xkyt.32O.com/ToEAr%BEdi/xBpPU2NqC/74sgdq%BD9/WSrx5/5ldupD%47J/9boeZj
|
||||
ftp://s0y6r7hg7.XN--KGBECHTV/xQizIlOK9/uxho7%bd/RvxbFGQ4o/O%42UeWF?/GAZ5E8b2/eRaq/l:-1ASwSpw/2FkowF%12Ss/vtCq9dysEc%1ee/
|
||||
[d18d:1707::]/NGZMInsLF8/kgC3y/F66qc1qt6OWfeS/DyngWA
|
||||
file:///%55A4VpGsup
|
||||
file:///WNEw%bfTWDLF/s%A9oZoWUo
|
||||
Ftp://2tdk.Ube6velthhhx8o.GM/bUH4XycSEKkTE
|
||||
ftp://7kxk4ujzz.kp:32621/hbop0%25sK/rw7RBE0lTN/tX5BLF
|
||||
FILE:///IQExpA4kDvUfTkH6Bg/MeVJ4aIUbXCJf
|
||||
file:///SIE0AkJFq/ZPJLyYK/6hA3x1InlGm1
|
||||
http://047.014.184.200/Z_QdOwjzfBue4Nt/aEn/xuEQD/cXlnoxHIK%7d8h/1%eegEk7E0/8Ejku@r1Z/UZ4gG/%484zOJsP%1b/Lc1okbWRzN5UJ
|
||||
Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
|
||||
FILE://155.24.106.255/3VEZIT7
|
||||
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
|
||||
lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
|
||||
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
|
||||
FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
|
||||
212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=
|
||||
http://[ea5::]/eIdv5xl/5qhxlOvzw%018f/N3RQQKCz/WzUnsSg8KA3/7ohHZCp
|
||||
file:///g_T81EaNw2nJB/1yUUT
|
||||
http://2XXY0MZ.fwa.791ck-2gx.bd/uO6FW?ZS5jE:=m:
|
||||
https://[8368:F154::f99f]/Y3h8FgzTYYpzn/zHFhQECC/CGtX/8v_~jn3Kn
|
|
@ -98,12 +98,4 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
|
||||
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
|
||||
}
|
||||
|
||||
/**
|
||||
* test that acronym normalization works
|
||||
*/
|
||||
public void testAcronym() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
|
||||
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,6 +39,8 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
|||
checkOneTermReuse(a, "book", "book");
|
||||
// stopword
|
||||
assertAnalyzesTo(a, "the", new String[] {});
|
||||
// possessive removal
|
||||
checkOneTermReuse(a, "steven's", "steven");
|
||||
}
|
||||
|
||||
/** test use of exclusion set */
|
||||
|
|
|
@ -111,7 +111,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(
|
||||
fa,
|
||||
"33Bis 1940-1945 1940:1945 (---i+++)*",
|
||||
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
|
||||
new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.th;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.junit.Assume;
|
||||
|
||||
/**
|
||||
|
@ -39,37 +40,35 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
|||
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
|
||||
* This is really a problem with the interaction w/ StandardTokenizer, which is used by ThaiAnalyzer.
|
||||
*
|
||||
* The issue is this: in StandardTokenizer the entire [:Thai:] block is specified in ALPHANUM (including punctuation, digits, etc.)
|
||||
* The fix is easy: refine this spec to exclude Thai punctuation and digits.
|
||||
*
|
||||
* A better fix, one that would also fix quite a few other languages, would be to remove the Thai hack.
|
||||
* Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
|
||||
*/
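// Hypothetical refinement sketch (not part of this commit): a JFlex macro that
// keeps Thai letters and nonspacing marks but excludes Thai digits and signs,
// e.g. THAI = [\u0E01-\u0E3A\u0E40-\u0E4E], instead of the whole [:Thai:] block.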
|
||||
public void testBuggyTokenType() throws Exception {
|
||||
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
/* correct testcase
|
||||
public void testTokenType() throws Exception {
|
||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
|
||||
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
||||
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
|
||||
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
|
||||
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
|
||||
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
|
||||
"<NUM>" });
|
||||
}
|
||||
*/
|
||||
|
||||
public void testAnalyzer() throws Exception {
|
||||
/**
|
||||
* Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
|
||||
* @deprecated testing backwards behavior
|
||||
*/
|
||||
@Deprecated
|
||||
public void testBuggyTokenType30() throws Exception {
|
||||
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
|
||||
assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
/** @deprecated testing backwards behavior */
|
||||
@Deprecated
|
||||
public void testAnalyzer30() throws Exception {
|
||||
Assume.assumeTrue(ThaiWordFilter.DBBI_AVAILABLE);
|
||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
|
||||
|
||||
assertAnalyzesTo(analyzer, "", new String[] {});
|
||||
|
||||
|
@ -124,6 +123,23 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesToReuse(
|
||||
analyzer,
|
||||
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
||||
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
|
||||
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
|
||||
}
|
||||
|
||||
/** @deprecated for version back compat */
|
||||
@Deprecated
|
||||
public void testReusableTokenStream30() throws Exception {
|
||||
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
|
||||
assertAnalyzesToReuse(analyzer, "", new String[] {});
|
||||
|
||||
assertAnalyzesToReuse(
|
||||
analyzer,
|
||||
"การที่ได้ต้องแสดงว่างานดี",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
|
||||
|
||||
assertAnalyzesToReuse(
|
||||
analyzer,
|
||||
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
||||
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,211 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/*
|
||||
* Copyright 2001-2005 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TimeZone;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Generates a file containing JFlex macros to accept valid ASCII TLDs
|
||||
* (top level domains), for inclusion in JFlex grammars that can accept
|
||||
* domain names.
|
||||
* <p/>
|
||||
* The IANA Root Zone Database is queried via HTTP from the URL given as cmdline arg #0, the
|
||||
* response is parsed, and the results are written out to a file containing
|
||||
* a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
|
||||
* forms of internationalized TLDs (output file cmdline arg #1).
|
||||
*/
|
||||
public class GenerateJflexTLDMacros {
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
|
||||
System.err.println("Cmd line params:");
|
||||
System.err.println("\tjava " + GenerateJflexTLDMacros.class.getName()
|
||||
+ "<ZoneFileURL> <JFlexOutputFile>");
|
||||
System.exit(1);
|
||||
}
|
||||
new GenerateJflexTLDMacros(args[0], args[1]).execute();
|
||||
}
|
||||
|
||||
private static final String NL = System.getProperty("line.separator");
|
||||
|
||||
private static final String APACHE_LICENSE
|
||||
= "/*" + NL
|
||||
+ " * Copyright 2001-2005 The Apache Software Foundation." + NL
|
||||
+ " *" + NL
|
||||
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
|
||||
+ " * you may not use this file except in compliance with the License." + NL
|
||||
+ " * You may obtain a copy of the License at" + NL
|
||||
+ " *" + NL
|
||||
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
|
||||
+ " *" + NL
|
||||
+ " * Unless required by applicable law or agreed to in writing, software" + NL
|
||||
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
|
||||
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
|
||||
+ " * See the License for the specific language governing permissions and" + NL
|
||||
+ " * limitations under the License." + NL
|
||||
+ " */" + NL + NL;
|
||||
|
||||
private static final Pattern TLD_PATTERN_1
|
||||
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
|
||||
private static final Pattern TLD_PATTERN_2
|
||||
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
|
||||
private final URL tldFileURL;
|
||||
private long tldFileLastModified = -1L;
|
||||
private final File outputFile;
|
||||
|
||||
public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
|
||||
throws Exception {
|
||||
this.tldFileURL = new URL(tldFileURL);
|
||||
this.outputFile = new File(outputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
|
||||
* writes a JFlex macro accepting any of them case-insensitively out to
|
||||
* the specified output file.
|
||||
*
|
||||
* @throws IOException if there is a problem either downloading the database
|
||||
* or writing out the output file.
|
||||
*/
|
||||
public void execute() throws IOException {
|
||||
final SortedSet<String> TLDs = getIANARootZoneDatabase();
|
||||
writeOutput(TLDs);
|
||||
System.err.println("Wrote " + TLDs.size() + " top level domains to '"
|
||||
+ outputFile + "'.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Downloads the IANA Root Zone Database.
|
||||
* @return downcased sorted set of ASCII TLDs
|
||||
* @throws java.io.IOException if there is a problem downloading the database
|
||||
*/
|
||||
private SortedSet<String> getIANARootZoneDatabase() throws IOException {
|
||||
final SortedSet<String> TLDs = new TreeSet<String>();
|
||||
final URLConnection connection = tldFileURL.openConnection();
|
||||
connection.setUseCaches(false);
|
||||
connection.addRequestProperty("Cache-Control", "no-cache");
|
||||
connection.connect();
|
||||
tldFileLastModified = connection.getLastModified();
|
||||
BufferedReader reader = new BufferedReader
|
||||
(new InputStreamReader(connection.getInputStream(), "US-ASCII"));
|
||||
try {
|
||||
String line;
|
||||
while (null != (line = reader.readLine())) {
|
||||
Matcher matcher = TLD_PATTERN_1.matcher(line);
|
||||
if (matcher.matches()) {
|
||||
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
|
||||
} else {
|
||||
matcher = TLD_PATTERN_2.matcher(line);
|
||||
if (matcher.matches()) {
|
||||
TLDs.add(matcher.group(1).toLowerCase(Locale.US));
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
reader.close();
|
||||
}
|
||||
return TLDs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a file containing a JFlex macro that will accept any of the given
|
||||
* TLDs case-insensitively.
|
||||
*
|
||||
* @param ASCIITLDs The downcased sorted set of top level domains to accept
|
||||
* @throws IOException if there is an error writing the output file
|
||||
*/
|
||||
private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
|
||||
final DateFormat dateFormat = DateFormat.getDateTimeInstance
|
||||
(DateFormat.FULL, DateFormat.FULL, Locale.US);
|
||||
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
final Writer writer = new OutputStreamWriter
|
||||
(new FileOutputStream(outputFile), "UTF-8");
|
||||
try {
|
||||
writer.write(APACHE_LICENSE);
|
||||
writer.write("// Generated from IANA Root Zone Database <");
|
||||
writer.write(tldFileURL.toString());
|
||||
writer.write(">");
|
||||
writer.write(NL);
|
||||
if (tldFileLastModified > 0L) {
|
||||
writer.write("// file version from ");
|
||||
writer.write(dateFormat.format(tldFileLastModified));
|
||||
writer.write(NL);
|
||||
}
|
||||
writer.write("// generated on ");
|
||||
writer.write(dateFormat.format(new Date()));
|
||||
writer.write(NL);
|
||||
writer.write("// by ");
|
||||
writer.write(this.getClass().getName());
|
||||
writer.write(NL);
|
||||
writer.write(NL);
|
||||
writer.write("ASCIITLD = \".\" (");
|
||||
writer.write(NL);
|
||||
boolean isFirst = true;
|
||||
for (String ASCIITLD : ASCIITLDs) {
|
||||
writer.write("\t");
|
||||
if (isFirst) {
|
||||
isFirst = false;
|
||||
writer.write(" ");
|
||||
} else {
|
||||
writer.write("| ");
|
||||
}
|
||||
writer.write(getCaseInsensitiveRegex(ASCIITLD));
|
||||
writer.write(NL);
|
||||
}
|
||||
writer.write("\t) \".\"? // Accept trailing root (empty) domain");
|
||||
writer.write(NL);
|
||||
writer.write(NL);
|
||||
} finally {
|
||||
writer.close();
|
||||
}
|
||||
}
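// Shape of the generated macro, abbreviated for illustration (the real output
// lists every TLD):
//   ASCIITLD = "." (
//         [aA][cC]
//       | [aA][dD]
//       ...
//       ) "."? // Accept trailing root (empty) domain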
|
||||
|
||||
/**
|
||||
* Returns a regex that will accept the given ASCII TLD case-insensitively.
|
||||
*
|
||||
* @param ASCIITLD The ASCII TLD to generate a regex for
|
||||
* @return a regex that will accept the given ASCII TLD case-insensitively
|
||||
*/
|
||||
private String getCaseInsensitiveRegex(String ASCIITLD) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
|
||||
char ch = ASCIITLD.charAt(pos);
|
||||
if (Character.isDigit(ch) || ch == '-') {
|
||||
builder.append(ch);
|
||||
} else {
|
||||
builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
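// For illustration: getCaseInsensitiveRegex("museum") yields
// "[mM][uU][sS][eE][uU][mM]"; digits and hyphens pass through unchanged, so the
// punycode TLD "xn--p1ai" yields "[xX][nN]--[pP]1[aA][iI]".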
|
||||
}
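For reference, a minimal sketch of driving the generator the way main() does; the zone-file URL and output path below are illustrative assumptions, not values taken from this commit:

// Programmatic equivalent of: java GenerateJflexTLDMacros <ZoneFileURL> <JFlexOutputFile>
new GenerateJflexTLDMacros(
    "http://www.internic.net/zones/root.zone",  // assumed root zone database URL
    "ASCIITLD.jflex-macro")                     // assumed output file name
    .execute();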
|
|
@ -44,11 +44,11 @@ import com.ibm.icu.util.ULocale;
|
|||
*/
|
||||
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||
/** Token type for words containing ideographic characters */
|
||||
public static final String WORD_IDEO = "<IDEO>";
|
||||
public static final String WORD_IDEO = "<IDEOGRAPHIC>";
|
||||
/** Token type for words containing Japanese kana */
|
||||
public static final String WORD_KANA = "<KANA>";
|
||||
/** Token type for words that contain letters */
|
||||
public static final String WORD_LETTER = "<WORD>";
|
||||
public static final String WORD_LETTER = "<ALPHANUM>";
|
||||
/** Token type for words that appear to be numbers */
|
||||
public static final String WORD_NUMBER = "<NUM>";
|
||||
|
||||
|
|
|
@ -17,17 +17,16 @@ package org.apache.lucene.analysis.icu.segmentation;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
||||
|
@ -220,6 +219,6 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
public void testTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"david", "has", "5000", "bones"},
|
||||
new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.ClassicFilter;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
*/
|
||||
public class ClassicFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenFilter create(TokenStream input) {
|
||||
return new ClassicFilter(input);
|
||||
}
|
||||
}
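A hedged usage sketch (the version constant and sample text are illustrative); ClassicFilter keeps the pre-UAX#29 StandardFilter behavior of stripping possessives and acronym dots:

TokenStream ts = new ClassicFilterFactory().create(
    new ClassicTokenizer(Version.LUCENE_31, new StringReader("I.B.M. cat's")));
// expected tokens: "IBM", "cat"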
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
*/
|
||||
|
||||
public class ClassicTokenizerFactory extends BaseTokenizerFactory {
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
}
|
||||
|
||||
public Tokenizer create(Reader input) {
|
||||
return new ClassicTokenizer(luceneMatchVersion, input);
|
||||
}
|
||||
}
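A sketch mirroring the factory test later in this commit (DEFAULT_VERSION_PARAM is that test harness's version map, assumed in scope):

ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM); // supplies luceneMatchVersion
Tokenizer stream = factory.create(new StringReader("What's this thing do?"));
// expected tokens: "What's", "this", "thing", "do"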
|
|
@ -0,0 +1,28 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
||||
|
||||
/** Factory for {@link EnglishPossessiveFilter} */
|
||||
public class EnglishPossessiveFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new EnglishPossessiveFilter(input);
|
||||
}
|
||||
}
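A minimal sketch (tokenizer choice and sample text are illustrative): the filter strips a trailing possessive 's, matching the "steven's" -> "steven" case in TestEnglishAnalyzer above:

TokenStream ts = new EnglishPossessiveFilterFactory().create(
    new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("steven's book")));
// expected tokens: "steven", "book"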
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
|
||||
|
@ -24,7 +26,13 @@ import org.apache.lucene.analysis.standard.StandardFilter;
|
|||
* @version $Id$
|
||||
*/
|
||||
public class StandardFilterFactory extends BaseTokenFilterFactory {
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
}
|
||||
|
||||
public StandardFilter create(TokenStream input) {
|
||||
return new StandardFilter(input);
|
||||
return new StandardFilter(luceneMatchVersion, input);
|
||||
}
|
||||
}
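// Behavioral summary, hedged, of this commit's split: with luceneMatchVersion
// >= 3.1 StandardFilter leaves UAX#29 tokens untouched, while earlier versions
// keep the classic rewriting ("I.B.M." -> "IBM", "cat's" -> "cat"), which now
// also lives on in ClassicFilter.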
|
||||
|
|
|
@ -32,22 +32,34 @@ public class TestStandardFactories extends BaseTokenTestCase {
|
|||
* Test StandardTokenizerFactory
|
||||
*/
|
||||
public void testStandardTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
Reader reader = new StringReader("Wha\u0301t's this thing do?");
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"Wha\u0301t's", "this", "thing", "do" });
|
||||
}
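// Why "Wha\u0301t's" stays whole: UAX#29 word break rule WB4 ignores combining
// marks such as U+0301, so the accented sequence remains inside one <ALPHANUM>
// token; the pre-UAX#29 ClassicTokenizer tested next keeps the old grammar.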
|
||||
|
||||
/**
|
||||
* Test ClassicTokenizerFactory
|
||||
*/
|
||||
public void testClassicTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What's", "this", "thing", "do" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test StandardFilterFactory
|
||||
* Test ClassicFilterFactory
|
||||
*/
|
||||
public void testStandardFilter() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
StandardFilterFactory filterFactory = new StandardFilterFactory();
|
||||
ClassicFilterFactory filterFactory = new ClassicFilterFactory();
|
||||
filterFactory.init(DEFAULT_VERSION_PARAM);
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = filterFactory.create(tokenizer);
|
||||
|
|