mirror of https://github.com/apache/lucene.git
add factories for 'new' Lucene analysis classes where it makes sense
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@676390 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7de8ac0a90
commit
7ad6fc5bb0
12
build.xml
12
build.xml
|
@ -254,6 +254,18 @@
|
|||
<!-- no way to leverage this in Solr -->
|
||||
<regexp pattern="CachingTokenFilter"/>
|
||||
</linecontainsregexp>
|
||||
<linecontainsregexp negate="true">
|
||||
<!-- no way to leverage this in Solr -->
|
||||
<regexp pattern="HyphenationCompoundWordTokenFilter"/>
|
||||
</linecontainsregexp>
|
||||
<linecontainsregexp negate="true">
|
||||
<!-- no way to leverage these in Solr (yet) -->
|
||||
<regexp pattern="Sink\|Tee"/>
|
||||
</linecontainsregexp>
|
||||
<linecontainsregexp negate="true">
|
||||
<!-- Solr already has a different impl for this -->
|
||||
<regexp pattern="SynonymTokenFilter"/>
|
||||
</linecontainsregexp>
|
||||
<linecontainsregexp negate="true">
|
||||
<!-- solr and lucene both have one? ? ? ? -->
|
||||
<regexp pattern="LengthFilter"/>
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.compound.*;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
|
||||
public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
private Set dictionary;
|
||||
private String dictFile;
|
||||
private int minWordSize;
|
||||
private int minSubwordSize;
|
||||
private int maxSubwordSize;
|
||||
private boolean onlyLongestMatch;
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
dictFile = args.get("dictionary");
|
||||
minWordSize= getInt("minWordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
|
||||
minSubwordSize= getInt("minSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
|
||||
maxSubwordSize= getInt("maxSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
|
||||
onlyLongestMatch = getBoolean("onlyLongestMatch",true);
|
||||
}
|
||||
public void inform(ResourceLoader loader) {
|
||||
try {
|
||||
List<String> wlist = loader.getLines(dictFile);
|
||||
dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
public DictionaryCompoundWordTokenFilter create(TokenStream input) {
|
||||
return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.fr.*;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.io.IOException;
|
||||
|
||||
public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
private Set articles;
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
String articlesFile = args.get("articles");
|
||||
|
||||
if (articlesFile != null) {
|
||||
try {
|
||||
List<String> wlist = loader.getLines(articlesFile);
|
||||
articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("No articles specified for ElisionFilterFactory");
|
||||
}
|
||||
}
|
||||
|
||||
public ElisionFilter create(TokenStream input) {
|
||||
return new ElisionFilter(input,articles);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.payloads.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
|
||||
private float payload;
|
||||
private String typeMatch;
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
payload = Float.parseFloat(args.get("payload"));
|
||||
typeMatch = args.get("typeMatch");
|
||||
}
|
||||
public NumericPayloadTokenFilter create(TokenStream input) {
|
||||
return new NumericPayloadTokenFilter(input,payload,typeMatch);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.shingle.*;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Iterator;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import java.util.Map;
|
||||
public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||
private int maxShingleSize;
|
||||
private boolean outputUnigrams;
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
maxShingleSize = getInt("maxShingleSize",
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
outputUnigrams = getBoolean("outputUnigrams", true);
|
||||
}
|
||||
public ShingleFilter create(TokenStream input) {
|
||||
ShingleFilter r = new ShingleFilter(input,maxShingleSize);
|
||||
r.setOutputUnigrams(outputUnigrams);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.payloads.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenOffsetPayloadTokenFilter create(TokenStream input) {
|
||||
return new TokenOffsetPayloadTokenFilter(input);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.payloads.*;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
|
||||
public TypeAsPayloadTokenFilter create(TokenStream input) {
|
||||
return new TypeAsPayloadTokenFilter(input);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue