From 99dc6e24eb082fc74d07bf2b86229e4fb0bfa4bb Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Thu, 8 Nov 2007 23:40:12 +0000 Subject: [PATCH] SOLR-396: new build system support for generating stub tokenizer/tokenfilter factories when lucene jars are updated. includes newly generated factories for all lucene-analyzers-2.2.0.jar langauges git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@593359 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 6 + build.xml | 73 ++++++ src/dev-tools/stub-analysis-factory-maker.pl | 146 ++++++++++++ .../analysis/BrazilianStemFilterFactory.java | 35 +++ .../solr/analysis/CJKTokenizerFactory.java | 31 +++ .../solr/analysis/ChineseFilterFactory.java | 30 +++ .../analysis/ChineseTokenizerFactory.java | 30 +++ .../solr/analysis/DutchStemFilterFactory.java | 36 +++ .../analysis/FrenchStemFilterFactory.java | 35 +++ .../analysis/GermanStemFilterFactory.java | 33 +++ .../analysis/GreekLowerCaseFilterFactory.java | 55 +++++ .../apache/solr/analysis/RussianCommon.java | 47 ++++ .../RussianLetterTokenizerFactory.java | 39 ++++ .../RussianLowerCaseFilterFactory.java | 40 ++++ .../analysis/RussianStemFilterFactory.java | 41 ++++ .../solr/analysis/ThaiWordFilterFactory.java | 35 +++ .../solr/util/SuggestMissingFactories.java | 213 ++++++++++++++++++ 17 files changed, 925 insertions(+) create mode 100755 src/dev-tools/stub-analysis-factory-maker.pl create mode 100644 src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/CJKTokenizerFactory.java create mode 100644 src/java/org/apache/solr/analysis/ChineseFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java create mode 100644 src/java/org/apache/solr/analysis/DutchStemFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/GermanStemFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/RussianCommon.java create mode 100644 src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java create mode 100644 src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/RussianStemFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java create mode 100644 src/java/org/apache/solr/util/SuggestMissingFactories.java diff --git a/CHANGES.txt b/CHANGES.txt index 8514ebfe355..56ab3581124 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -226,6 +226,12 @@ Other Changes 5. SOLR-367: The create method in all TokenFilter and Tokenizer Factories provided by Solr now declare their specific return types instead of just using "TokenStream" (hossman) + + 6. SOLR-396: Hooks add to build system for automatic generation of (stub) + Tokenizer and TokenFilter Factories. + Also: new Factories for all Tokenizers and TokenFilters provided by the + lucene-analyzers-2.2.0.jar -- includes support for German, Chinese, + Russan, Dutch, Greek, Brazilian, Thai, and French. (hossman) ================== Release 1.2, 20070602 ================== diff --git a/build.xml b/build.xml index 93eea298f5a..77edb1f9485 100644 --- a/build.xml +++ b/build.xml @@ -258,6 +258,79 @@ + + + + + + + + + + + + + + + + + + ... + + This task requires that the property 'stub.src.path' be set. + + It must contain a "path" listing directories containing source + files that this task should use when looking for classes that + need factories created, the format is platform specific -- + typically it is colon seperated in Unix, semi-colon seperated + on windows, ie: + + ant stub-factories -Dstub.src.path="./src:../lucene/contrib:../lucene/src/java" + + FYI: The file ${stub.list} contains a list of classes + that seem to need stub factories. (if java files can be found to + use as guides for creating them). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/dev-tools/stub-analysis-factory-maker.pl b/src/dev-tools/stub-analysis-factory-maker.pl new file mode 100755 index 00000000000..114d8f611b2 --- /dev/null +++ b/src/dev-tools/stub-analysis-factory-maker.pl @@ -0,0 +1,146 @@ +#!/usr/bin/perl +my $ASL = q{ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +}; + +# $Id:$ +# $URL:$ +# +# What this script does... +# +# 1) reads a list of fully qualified package.ClassNames from STDIN +# 2) gets a list of src dirs from command line +# 3) crawl the source dirs, looking for .java files corrisponding to +# the ClassNames, if a match is found, creates a factory for it in +# the current working directory using info about the constructor +# in the orriginal .java file +# +# Note... +# * any ClassNames not found will be logged to stdout +# * script assumes generated factorories in "org.apache.solr.analysis package +# * factories will be compilable only if orriginal class had no special +# constructor args. otherwise it will have an abstract init method that +# needs filled in. + +use strict; +use warnings; +use File::Find; + + +my %classes = (); +while () { + chomp; + $classes{$_} = 1; +} + +find({wanted => \&wanted, + no_chdir => 1, + }, + @ARGV); + +sub wanted { + + my $file = $File::Find::name; + + return unless $file =~ m{/([^/]*)\.java}; + my $class = $1; + + open(my $f, "<", $file) or die "can't open $file: $!"; + my $data; + { + local $/; # slurp + $data = <$f>; + } + close $f; + + # skip abstract classes + return if ($data =~ m{abstract\s+(\w+\s+)*class\s+$class}); + + my $pack = "EMPTYPACKAGE"; + if ($data =~ m/package\s+(.*);/) { + $pack = $1; + } + + my $fullname = "${pack}.${class}"; + # only looking for certain classes + return unless $classes{$fullname}; + + my @imports = $data =~ m/import\s+.*;/g; + + if ($data =~ m{public \s+ ((?:\w+\s+)*) $class \s*\(\s* ([^\)]*) \) }sx) { + my $modifiers = $1; + my $argline = $2; + + my $mainArgType; + my $mainArg; + my @orderedArgs; + + my %args = map { my ($v,$k) = split /\s+/; + push @orderedArgs, $k; + if ($v =~ m/^(Reader|TokenStream)/) { + $mainArgType=$v; + $mainArg=$k; + } + ($k, $v) + } split /\s*,\s*/, $argline; + + my $type = ("Reader" eq $mainArgType) ? "Tokenizer" : "TokenFilter"; + + my $facClass = "${class}Factory"; + my $facFile = "${facClass}.java"; + die "$facFile exists" if -e $facFile; + open my $o, ">", $facFile + or die "can't write to $facFile: $!"; + + print $o "$ASL\n"; + print $o "package org.apache.solr.analysis;\n"; + print $o "import ${pack}.*;\n"; + print $o "$_\n" foreach @imports; + print $o "import java.util.Map;\n"; + print $o "public class ${facClass} extends Base${type}Factory {\n"; + foreach my $arg (@orderedArgs) { + print $o " private $args{$arg} $arg;\n" unless $arg eq $mainArg; + } + if (1 < @orderedArgs) { + # we need to init something, stub it out + print $o " public abstract void init(SolrConfig solrConfig, Map args) {\n"; + print $o " super.init(solrConfig, args);\n"; + print $o " // ABSTRACT BECAUSE IT'S A STUB .. FILL IT IN\n"; + print $o " }\n"; + } + print $o " public $class create($mainArgType $mainArg) {\n"; + print $o " return new $class(", join(",", @orderedArgs), ");\n"; + print $o " }\n"; + print $o "}\n\n"; + close $o; + + delete $classes{$fullname}; # we're done with this one + } else { + print STDERR "can't stub $class\n"; + } +} + +if (keys %classes) { + print STDERR "Can't find java files for...\n"; + foreach (keys %classes) { + print STDERR "$_\n"; + } + exit -1; +} + diff --git a/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java b/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java new file mode 100644 index 00000000000..1045ca2c466 --- /dev/null +++ b/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java @@ -0,0 +1,35 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.br.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; +import java.util.Map; +public class BrazilianStemFilterFactory extends BaseTokenFilterFactory { + public BrazilianStemFilter create(TokenStream in) { + return new BrazilianStemFilter(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java b/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java new file mode 100644 index 00000000000..e68265c4ce9 --- /dev/null +++ b/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java @@ -0,0 +1,31 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.cjk.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; +import java.io.Reader; +import java.util.Map; +public class CJKTokenizerFactory extends BaseTokenizerFactory { + public CJKTokenizer create(Reader in) { + return new CJKTokenizer(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/ChineseFilterFactory.java b/src/java/org/apache/solr/analysis/ChineseFilterFactory.java new file mode 100644 index 00000000000..0076220bf38 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ChineseFilterFactory.java @@ -0,0 +1,30 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.cn.*; +import java.util.Hashtable; +import org.apache.lucene.analysis.*; +import java.util.Map; +public class ChineseFilterFactory extends BaseTokenFilterFactory { + public ChineseFilter create(TokenStream in) { + return new ChineseFilter(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java b/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java new file mode 100644 index 00000000000..a817ce000a5 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java @@ -0,0 +1,30 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.cn.*; +import java.io.Reader; +import org.apache.lucene.analysis.*; +import java.util.Map; +public class ChineseTokenizerFactory extends BaseTokenizerFactory { + public ChineseTokenizer create(Reader in) { + return new ChineseTokenizer(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java b/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java new file mode 100644 index 00000000000..8dfb8bbab5f --- /dev/null +++ b/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java @@ -0,0 +1,36 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.nl.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import java.util.Map; +import java.util.Map; +public class DutchStemFilterFactory extends BaseTokenFilterFactory { + public DutchStemFilter create(TokenStream _in) { + return new DutchStemFilter(_in); + } +} + diff --git a/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java b/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java new file mode 100644 index 00000000000..42a92d73dfb --- /dev/null +++ b/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java @@ -0,0 +1,35 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.fr.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.Hashtable; +import java.util.HashSet; +import java.util.Set; +import java.util.Map; +public class FrenchStemFilterFactory extends BaseTokenFilterFactory { + public FrenchStemFilter create(TokenStream in) { + return new FrenchStemFilter(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java b/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java new file mode 100644 index 00000000000..109c1030354 --- /dev/null +++ b/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java @@ -0,0 +1,33 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.de.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.Set; +import java.util.Map; +public class GermanStemFilterFactory extends BaseTokenFilterFactory { + public GermanStemFilter create(TokenStream in) { + return new GermanStemFilter(in); + } +} + diff --git a/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java new file mode 100644 index 00000000000..5086a409315 --- /dev/null +++ b/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java @@ -0,0 +1,55 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.el.*; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import java.util.Map; +import java.util.HashMap; +import org.apache.solr.core.SolrConfig; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory { + + private static Map CHARSETS = new HashMap(); + static { + CHARSETS.put("UnicodeGreek",GreekCharsets.UnicodeGreek); + CHARSETS.put("ISO",GreekCharsets.ISO); + CHARSETS.put("CP1253",GreekCharsets.CP1253); + } + + private char[] charset = GreekCharsets.UnicodeGreek; + + + public void init(SolrConfig solrConfig, Map args) { + super.init(solrConfig, args); + String charsetName = args.get("charset"); + if (null != charsetName) charset = CHARSETS.get(charsetName); + if (null == charset) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Don't understand charset: " + charsetName); + } + } + public GreekLowerCaseFilter create(TokenStream in) { + return new GreekLowerCaseFilter(in,charset); + } +} + diff --git a/src/java/org/apache/solr/analysis/RussianCommon.java b/src/java/org/apache/solr/analysis/RussianCommon.java new file mode 100644 index 00000000000..839211e0d3c --- /dev/null +++ b/src/java/org/apache/solr/analysis/RussianCommon.java @@ -0,0 +1,47 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.ru.*; +import java.util.Map; +import java.util.HashMap; +import org.apache.solr.core.SolrConfig; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +public class RussianCommon { + + private static Map CHARSETS = new HashMap(); + static { + CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian); + CHARSETS.put("KOI8",RussianCharsets.KOI8); + CHARSETS.put("CP1251",RussianCharsets.CP1251); + } + + public static char[] getCharset(String name) { + if (null == name) + return RussianCharsets.UnicodeRussian; + + char[] charset = CHARSETS.get(name); + if (null == charset) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Don't understand charset: " + name); + } + return charset; + } +} + diff --git a/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java b/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java new file mode 100644 index 00000000000..a7f9d7810d2 --- /dev/null +++ b/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java @@ -0,0 +1,39 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.ru.*; +import java.io.Reader; +import org.apache.lucene.analysis.CharTokenizer; +import java.util.Map; +import org.apache.solr.core.SolrConfig; +public class RussianLetterTokenizerFactory extends BaseTokenizerFactory { + + private char[] charset; + + public void init(SolrConfig solrConfig, Map args) { + super.init(solrConfig, args); + charset = RussianCommon.getCharset(args.get("charset")); + } + + public RussianLetterTokenizer create(Reader in) { + return new RussianLetterTokenizer(in,charset); + } +} + diff --git a/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java b/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java new file mode 100644 index 00000000000..72865eaf8a4 --- /dev/null +++ b/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java @@ -0,0 +1,40 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.ru.*; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import java.util.Map; +import org.apache.solr.core.SolrConfig; +public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory { + + private char[] charset; + + public void init(SolrConfig solrConfig, Map args) { + super.init(solrConfig, args); + charset = RussianCommon.getCharset(args.get("charset")); + } + + public RussianLowerCaseFilter create(TokenStream in) { + return new RussianLowerCaseFilter(in,charset); + } +} + diff --git a/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java b/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java new file mode 100644 index 00000000000..944eea305cc --- /dev/null +++ b/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java @@ -0,0 +1,41 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.ru.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.Map; +import org.apache.solr.core.SolrConfig; +public class RussianStemFilterFactory extends BaseTokenFilterFactory { + + private char[] charset; + + public void init(SolrConfig solrConfig, Map args) { + super.init(solrConfig, args); + charset = RussianCommon.getCharset(args.get("charset")); + } + + public RussianStemFilter create(TokenStream in) { + return new RussianStemFilter(in,charset); + } +} + diff --git a/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java b/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java new file mode 100644 index 00000000000..670f2dbde53 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java @@ -0,0 +1,35 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.th.*; +import java.io.IOException; +import java.util.Locale; +import java.lang.Character.UnicodeBlock; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.text.BreakIterator; +import java.util.Map; +public class ThaiWordFilterFactory extends BaseTokenFilterFactory { + public ThaiWordFilter create(TokenStream input) { + return new ThaiWordFilter(input); + } +} + diff --git a/src/java/org/apache/solr/util/SuggestMissingFactories.java b/src/java/org/apache/solr/util/SuggestMissingFactories.java new file mode 100644 index 00000000000..9556c1e8cbe --- /dev/null +++ b/src/java/org/apache/solr/util/SuggestMissingFactories.java @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.util; + +import java.io.Reader; +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.net.URL; +import java.net.URLClassLoader; +import java.net.MalformedURLException; +import java.util.Collection; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Enumeration; +import java.util.jar.*; + +/** + * Given a list of Jar files, suggest missing analysis factories. + * + * @version $Id$ + */ +public class SuggestMissingFactories { + + public static void main(String[] args) throws ClassNotFoundException, IOException, NoSuchMethodException { + + final File[] files = new File[args.length]; + for (int i = 0; i < args.length; i++) { + files[i] = new File(args[i]); + } + final FindClasses finder = new FindClasses(files); + final ClassLoader cl = finder.getClassLoader(); + + final Class TOKENSTREAM + = cl.loadClass("org.apache.lucene.analysis.TokenStream"); + final Class TOKENIZER + = cl.loadClass("org.apache.lucene.analysis.Tokenizer"); + final Class TOKENFILTER + = cl.loadClass("org.apache.lucene.analysis.TokenFilter"); + final Class TOKENIZERFACTORY + = cl.loadClass("org.apache.solr.analysis.TokenizerFactory"); + final Class TOKENFILTERFACTORY + = cl.loadClass("org.apache.solr.analysis.TokenFilterFactory"); + + + final HashSet result + = new HashSet(finder.findExtends(TOKENIZER)); + result.addAll(finder.findExtends(TOKENFILTER)); + + result.removeAll(finder.findMethodReturns + (finder.findExtends(TOKENIZERFACTORY), + "create", + Reader.class).values()); + result.removeAll(finder.findMethodReturns + (finder.findExtends(TOKENFILTERFACTORY), + "create", + TOKENSTREAM).values()); + + for (final Class c : result) { + System.out.println(c.getName()); + } + } + +} + +/** + * Takes in a clazz name and a jar and finds + * all classes in that jar that extend clazz. + */ +class FindClasses { + + /** + * Simple command line test method + */ + public static void main(String[] args) + throws ClassNotFoundException, IOException, NoSuchMethodException { + + FindClasses finder = new FindClasses(new File(args[1])); + ClassLoader cl = finder.getClassLoader(); + Class clazz = cl.loadClass(args[0]); + if (args.length == 2) { + + System.out.println("Finding all extenders of " + clazz.getName()); + for (Class c : finder.findExtends(clazz)) { + System.out.println(c.getName()); + } + } else { + String methName = args[2]; + System.out.println("Finding all extenders of " + clazz.getName() + + " with method: " + methName); + + Class[] methArgs = new Class[args.length-3]; + for (int i = 3; i < args.length; i++) { + methArgs[i-3] = cl.loadClass(args[i]); + } + Map map = finder.findMethodReturns + (finder.findExtends(clazz),methName, methArgs); + + for (Class key : map.keySet()) { + System.out.println(key.getName() + " => " + map.get(key).getName()); + } + + + } + } + + private JarFile[] jarFiles; + private ClassLoader cl; + public FindClasses(File... jars) throws IOException { + + + jarFiles = new JarFile[jars.length]; + URL[] urls = new URL[jars.length]; + try { + for (int i =0; i < jars.length; i++) { + jarFiles[i] = new JarFile(jars[i]); + urls[i] = jars[i].toURL(); + } + } catch (MalformedURLException e) { + throw new RuntimeException + ("WTF, how can JarFile.toURL() be malformed?", e); + } + + this.cl = new URLClassLoader(urls, this.getClass().getClassLoader()); + } + + /** + * returns a class loader that includes the jar used to + * construct this instance + */ + public ClassLoader getClassLoader() { + return this.cl; + } + + /** + * Find useful concrete (ie: not anonymous, not abstract, not an interface) + * classes that extend clazz + */ + public Collection findExtends(Class clazz) + throws ClassNotFoundException { + + HashSet results = new HashSet(); + + for (JarFile jarFile : jarFiles) { + for (Enumeration e = jarFile.entries(); + e.hasMoreElements() ;) { + + String n = e.nextElement().getName(); + if (n.endsWith(".class")) { + String cn = n.replace("/",".").substring(0,n.length()-6); + Class target; + try { + target = cl.loadClass(cn); + } catch (NoClassDefFoundError e1) { + throw new ClassNotFoundException + ("Can't load: " + cn, e1); + } + + if (clazz.isAssignableFrom(target) + && !target.isAnonymousClass()) { + + int mods = target.getModifiers(); + if (!(Modifier.isAbstract(mods) || + Modifier.isInterface(mods))) { + results.add(target); + } + } + } + } + } + return results; + } + + /** + * Given a collection of classes, returns a Map containing the + * subset of those classes that impliment the method specified, + * where the value in the map is the return type of the method + */ + public Map findMethodReturns(Collection clazzes, + String methodName, + Class... parameterTypes) + throws NoSuchMethodException{ + + HashMap results = new HashMap(); + for (Class clazz : clazzes) { + try { + Method m = clazz.getMethod(methodName, parameterTypes); + results.put(clazz, m.getReturnType()); + } catch (NoSuchMethodException e) { + /* :NOOP: we expect this and skip clazz */ + } + } + return results; + } +} +