diff --git a/build.xml b/build.xml index 8be668de8f4..8a1bbc9c8d2 100644 --- a/build.xml +++ b/build.xml @@ -254,6 +254,18 @@ + + + + + + + + + + + + diff --git a/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java b/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java new file mode 100644 index 00000000000..0b879531d3d --- /dev/null +++ b/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java @@ -0,0 +1,59 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.compound.*; +import org.apache.solr.util.plugin.ResourceLoaderAware; +import org.apache.solr.common.ResourceLoader; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import java.util.List; +import java.util.Set; +import java.util.Map; +import java.io.IOException; + +public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private Set dictionary; + private String dictFile; + private int minWordSize; + private int minSubwordSize; + private int maxSubwordSize; + private boolean onlyLongestMatch; + public void init(Map args) { + super.init(args); + dictFile = args.get("dictionary"); + minWordSize= getInt("minWordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); + minSubwordSize= getInt("minSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); + maxSubwordSize= getInt("maxSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); + onlyLongestMatch = getBoolean("onlyLongestMatch",true); + } + public void inform(ResourceLoader loader) { + try { + List wlist = loader.getLines(dictFile); + dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + public DictionaryCompoundWordTokenFilter create(TokenStream input) { + return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch); + } +} + diff --git a/src/java/org/apache/solr/analysis/ElisionFilterFactory.java b/src/java/org/apache/solr/analysis/ElisionFilterFactory.java new file mode 100644 index 00000000000..b7dc1b4d6f9 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ElisionFilterFactory.java @@ -0,0 +1,62 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; + +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.util.plugin.ResourceLoaderAware; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.fr.*; +import java.io.IOException; +import java.util.Set; +import java.util.HashSet; +import java.util.Arrays; +import java.util.Iterator; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenFilter; +import java.util.Map; +import java.util.List; +import java.util.Set; +import java.io.IOException; + +public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + + private Set articles; + + public void inform(ResourceLoader loader) { + String articlesFile = args.get("articles"); + + if (articlesFile != null) { + try { + List wlist = loader.getLines(articlesFile); + articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false); + } catch (IOException e) { + throw new RuntimeException(e); + } + } else { + throw new RuntimeException("No articles specified for ElisionFilterFactory"); + } + } + + public ElisionFilter create(TokenStream input) { + return new ElisionFilter(input,articles); + } +} + diff --git a/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java b/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java new file mode 100644 index 00000000000..880f83f8de0 --- /dev/null +++ b/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java @@ -0,0 +1,40 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.payloads.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Payload; +import java.io.IOException; +import java.util.Map; +public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory { + private float payload; + private String typeMatch; + public void init(Map args) { + super.init(args); + payload = Float.parseFloat(args.get("payload")); + typeMatch = args.get("typeMatch"); + } + public NumericPayloadTokenFilter create(TokenStream input) { + return new NumericPayloadTokenFilter(input,payload,typeMatch); + } +} + diff --git a/src/java/org/apache/solr/analysis/ShingleFilterFactory.java b/src/java/org/apache/solr/analysis/ShingleFilterFactory.java new file mode 100644 index 00000000000..6339e5c0fd0 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ShingleFilterFactory.java @@ -0,0 +1,44 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.shingle.*; +import java.io.IOException; +import java.util.LinkedList; +import java.util.Iterator; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import java.util.Map; +public class ShingleFilterFactory extends BaseTokenFilterFactory { + private int maxShingleSize; + private boolean outputUnigrams; + public void init(Map args) { + super.init(args); + maxShingleSize = getInt("maxShingleSize", + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); + outputUnigrams = getBoolean("outputUnigrams", true); + } + public ShingleFilter create(TokenStream input) { + ShingleFilter r = new ShingleFilter(input,maxShingleSize); + r.setOutputUnigrams(outputUnigrams); + return r; + } +} + diff --git a/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java b/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java new file mode 100644 index 00000000000..16fbd2fd0b5 --- /dev/null +++ b/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java @@ -0,0 +1,33 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.payloads.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Payload; +import java.io.IOException; +import java.util.Map; +public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory { + public TokenOffsetPayloadTokenFilter create(TokenStream input) { + return new TokenOffsetPayloadTokenFilter(input); + } +} + diff --git a/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java b/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java new file mode 100644 index 00000000000..23efa386983 --- /dev/null +++ b/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java @@ -0,0 +1,33 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.solr.analysis; +import org.apache.lucene.analysis.payloads.*; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.Payload; +import java.io.IOException; +import java.util.Map; +public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory { + public TypeAsPayloadTokenFilter create(TokenStream input) { + return new TypeAsPayloadTokenFilter(input); + } +} +