LUCENE-2899: Add OpenNLP Analysis capabilities as a module

This commit is contained in:
Steve Rowe 2017-12-15 11:24:18 -05:00
parent d02d1f1cab
commit 3e2f9e62d7
92 changed files with 9872 additions and 64 deletions

View File

@ -11,6 +11,7 @@
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/icu/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/kuromoji/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/morfologik/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/opennlp/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/phonetic/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/smartcn/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/stempel/build.xml" />

View File

@ -15,6 +15,7 @@
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/icu/icu.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/kuromoji/kuromoji.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/morfologik/morfologik.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/opennlp/opennlp.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/phonetic/phonetic.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/smartcn/smartcn.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/stempel/stempel.iml" />

View File

@ -44,6 +44,14 @@
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Module analyzers-opennlp" type="JUnit" factoryName="JUnit">
<module name="opennlp" />
<option name="TEST_OBJECT" value="pattern" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/lucene/analysis/opennlp" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Module analyzers-phonetic" type="JUnit" factoryName="JUnit">
<module name="phonetic" />
<option name="TEST_OBJECT" value="pattern" />
@ -333,48 +341,49 @@
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<list size="41">
<list size="42">
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
<item index="3" class="java.lang.String" itemvalue="JUnit.Module analyzers-kuromoji" />
<item index="4" class="java.lang.String" itemvalue="JUnit.Module analyzers-morfologik" />
<item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
<item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
<item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
<item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
<item index="9" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
<item index="10" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
<item index="11" class="java.lang.String" itemvalue="JUnit.Module classification" />
<item index="12" class="java.lang.String" itemvalue="JUnit.Module codecs" />
<item index="13" class="java.lang.String" itemvalue="JUnit.Module expressions" />
<item index="14" class="java.lang.String" itemvalue="JUnit.Module facet" />
<item index="15" class="java.lang.String" itemvalue="JUnit.Module grouping" />
<item index="16" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
<item index="17" class="java.lang.String" itemvalue="JUnit.Module join" />
<item index="18" class="java.lang.String" itemvalue="JUnit.Module memory" />
<item index="19" class="java.lang.String" itemvalue="JUnit.Module misc" />
<item index="20" class="java.lang.String" itemvalue="JUnit.Module queries" />
<item index="21" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
<item index="22" class="java.lang.String" itemvalue="JUnit.Module replicator" />
<item index="23" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
<item index="24" class="java.lang.String" itemvalue="JUnit.Module spatial" />
<item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
<item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
<item index="27" class="java.lang.String" itemvalue="JUnit.Module suggest" />
<item index="28" class="java.lang.String" itemvalue="Application.solrcloud" />
<item index="29" class="java.lang.String" itemvalue="JUnit.Solr core" />
<item index="30" class="java.lang.String" itemvalue="JUnit.Solrj" />
<item index="31" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
<item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-opennlp" />
<item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
<item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
<item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
<item index="9" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
<item index="10" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
<item index="11" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
<item index="12" class="java.lang.String" itemvalue="JUnit.Module classification" />
<item index="13" class="java.lang.String" itemvalue="JUnit.Module codecs" />
<item index="14" class="java.lang.String" itemvalue="JUnit.Module expressions" />
<item index="15" class="java.lang.String" itemvalue="JUnit.Module facet" />
<item index="16" class="java.lang.String" itemvalue="JUnit.Module grouping" />
<item index="17" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
<item index="18" class="java.lang.String" itemvalue="JUnit.Module join" />
<item index="19" class="java.lang.String" itemvalue="JUnit.Module memory" />
<item index="20" class="java.lang.String" itemvalue="JUnit.Module misc" />
<item index="21" class="java.lang.String" itemvalue="JUnit.Module queries" />
<item index="22" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
<item index="23" class="java.lang.String" itemvalue="JUnit.Module replicator" />
<item index="24" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
<item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial" />
<item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
<item index="27" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
<item index="28" class="java.lang.String" itemvalue="JUnit.Module suggest" />
<item index="29" class="java.lang.String" itemvalue="Application.solrcloud" />
<item index="30" class="java.lang.String" itemvalue="JUnit.Solr core" />
<item index="31" class="java.lang.String" itemvalue="JUnit.Solrj" />
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="41" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
</list>
</component>
</project>

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="module-library">
<library>
<CLASSES>
<root url="file://$MODULE_DIR$/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
</library>
</orderEntry>
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="lucene-core" />
</component>
</module>

View File

@ -37,5 +37,6 @@
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="sandbox" />
<orderEntry type="module" module-name="opennlp" />
</component>
</module>

View File

@ -0,0 +1,78 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-opennlp</artifactId>
<packaging>jar</packaging>
<name>Lucene OpenNLP integration</name>
<description>
Lucene OpenNLP integration
</description>
<properties>
<module-directory>lucene/analysis/opennlp</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:git:${vc-anonymous-base-url}</connection>
<developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
<url>${vc-browse-base-url};f=${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
@lucene-analyzers-opennlp.internal.dependencies@
@lucene-analyzers-opennlp.external.dependencies@
@lucene-analyzers-opennlp.internal.test.dependencies@
@lucene-analyzers-opennlp.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<resources>
<resource>
<directory>${module-path}/src/resources</directory>
</resource>
</resources>
<testResources>
<testResource>
<directory>${project.build.testSourceDirectory}</directory>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</testResource>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
</testResources>
</build>
</project>

View File

@ -35,6 +35,7 @@
<module>icu</module>
<module>kuromoji</module>
<module>morfologik</module>
<module>opennlp</module>
<module>phonetic</module>
<module>smartcn</module>
<module>stempel</module>

View File

@ -65,6 +65,15 @@ API Changes
* LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
(Pulak Ghosh via Adrien Grand)
New Features
* LUCENE-2899: Add new module analysis/opennlp, with analysis components
to perform tokenization, part-of-speech tagging, lemmatization and phrase
chunking by invoking the corresponding OpenNLP tools. Named entity
recognition is also provided as a Solr update request processor.
(Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai Gülzau,
Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
Improvements
* LUCENE-8081: Allow IndexWriter to opt out of flushing on indexing threads

View File

@ -28,6 +28,9 @@ lucene-analyzers-kuromoji-XX.jar
lucene-analyzers-morfologik-XX.jar
An analyzer using the Morfologik stemming library.
lucene-analyzers-opennlp-XX.jar
An analyzer using the OpenNLP natural-language processing library.
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@ -49,6 +52,7 @@ common/src/java
icu/src/java
kuromoji/src/java
morfologik/src/java
opennlp/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
@ -59,6 +63,7 @@ common/src/test
icu/src/test
kuromoji/src/test
morfologik/src/test
opennlp/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test

View File

@ -65,6 +65,10 @@
<ant dir="morfologik" />
</target>
<target name="opennlp">
<ant dir="opennlp" />
</target>
<target name="phonetic">
<ant dir="phonetic" />
</target>
@ -82,7 +86,7 @@
</target>
<target name="default" depends="compile"/>
<target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel,uima" />
<target name="compile" depends="common,icu,kuromoji,morfologik,opennlp,phonetic,smartcn,stempel,uima" />
<target name="clean">
<forall-analyzers target="clean"/>

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Adds the {@link TypeAttribute#type()} as a synonym,
* i.e. another token at the same position, optionally with a specified prefix prepended.
*/
public final class TypeAsSynonymFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final String prefix;
AttributeSource.State savedToken = null;
public TypeAsSynonymFilter(TokenStream input) {
this(input, null);
}
/**
* @param input input tokenstream
* @param prefix Prepend this string to every token type emitted as token text.
* If null, nothing will be prepended.
*/
public TypeAsSynonymFilter(TokenStream input, String prefix) {
super(input);
this.prefix = prefix;
}
@Override
public boolean incrementToken() throws IOException {
if (savedToken != null) { // Emit last token's type at the same position
restoreState(savedToken);
savedToken = null;
termAtt.setEmpty();
if (prefix != null) {
termAtt.append(prefix);
}
termAtt.append(typeAtt.type());
posIncrAtt.setPositionIncrement(0);
return true;
} else if (input.incrementToken()) { // No pending token type to emit
savedToken = captureState();
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
savedToken = null;
}
}
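
As a quick illustration (not part of the commit), the filter can be dropped into a plain Analyzer; the WhitespaceTokenizer and the "_type_" prefix are arbitrary choices for this sketch, and WhitespaceTokenizer assigns every token the default type "word":

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilter;

public class TypeAsSynonymExample {
  public static Analyzer newAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // Each token is followed by a same-position synonym carrying its type,
        // so input "foo" yields "foo" plus "_type_word" at the same position.
        TokenStream sink = new TypeAsSynonymFilter(source, "_type_");
        return new TokenStreamComponents(source, sink);
      }
    };
  }
}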

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link TypeAsSynonymFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.UAX29URLEmailTokenizerFactory"/&gt;
* &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>
* If the optional {@code prefix} parameter is used, the specified value will be prepended
* to the type, e.g. with prefix="_type_", for a token "example.com" with type "&lt;URL&gt;",
* the emitted synonym will have text "_type_&lt;URL&gt;".
*/
public class TypeAsSynonymFilterFactory extends TokenFilterFactory {
private final String prefix;
public TypeAsSynonymFilterFactory(Map<String,String> args) {
super(args);
prefix = get(args, "prefix"); // default value is null
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
return new TypeAsSynonymFilter(input, prefix);
}
}

View File

@ -80,6 +80,7 @@ org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory

View File

@ -183,14 +183,14 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
TokenStream ts = createTokenStream(5, "woof woof woof woof woof", 1, 1, 100, false);
assertTokenStreamContents(ts, hashes, new int[]{0},
new int[]{24}, new String[]{MinHashFilter.MIN_HASH_TYPE}, new int[]{1}, new int[]{1}, 24, 0, null,
true);
true, null);
ts = createTokenStream(5, "woof woof woof woof woof", 2, 1, 1, false);
assertTokenStreamContents(ts, new String[]{new String(new char[]{0, 0, 8449, 54077, 64133, 32857, 8605, 41409}),
new String(new char[]{0, 1, 16887, 58164, 39536, 14926, 6529, 17276})}, new int[]{0, 0},
new int[]{24, 24}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
new int[]{1, 1}, 24, 0, null,
true);
true, null);
}
@Test
@ -203,7 +203,7 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
false);
assertTokenStreamContents(ts, hashes, new int[]{0, 0},
new int[]{49, 49}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
new int[]{1, 1}, 49, 0, null, true);
new int[]{1, 1}, 49, 0, null, true, null);
}
private ArrayList<String> getTokens(TokenStream ts) throws IOException {

View File

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
public class TestTypeAsSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
private static final Token[] TOKENS = { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };
public void testBasic() throws Exception {
TokenStream stream = new CannedTokenStream(TOKENS);
stream = tokenFilterFactory("TypeAsSynonym").create(stream);
assertTokenStreamContents(stream, new String[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" },
null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
}
public void testPrefix() throws Exception {
TokenStream stream = new CannedTokenStream(TOKENS);
stream = tokenFilterFactory("TypeAsSynonym", "prefix", "_type_").create(stream);
assertTokenStreamContents(stream, new String[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" },
null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
}
private static Token token(String term, String type) {
Token token = new Token();
token.setEmpty();
token.append(term);
token.setType(type);
return token;
}
}

View File

@ -0,0 +1,118 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="analyzers-opennlp" default="default">
<description>
OpenNLP Library Integration
</description>
<path id="opennlpjars">
<fileset dir="lib"/>
</path>
<property name="test.model.data.dir" location="src/tools/test-model-data"/>
<property name="tests.userdir" location="src/test-files"/>
<property name="test.model.dir" location="${tests.userdir}/org/apache/lucene/analysis/opennlp"/>
<import file="../analysis-module-build.xml"/>
<property name="analysis-extras.conf.dir"
location="${common.dir}/../solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf"/>
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="opennlpjars"/>
<path refid="base.classpath"/>
</path>
<path id="test.classpath">
<path refid="test.base.classpath"/>
<pathelement path="${tests.userdir}"/>
</path>
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
<!--
This does not create real NLP models, just small unencumbered ones for the unit tests.
All text taken from reuters corpus.
Tags applied with online demos at CCG Urbana-Champaign.
-->
<target name="train-test-models" description="Train all small test models for unit tests" depends="resolve">
<mkdir dir="${test.model.dir}"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.sentdetect.training -->
<trainModel command="SentenceDetectorTrainer" lang="en" data="sentences.txt" model="en-test-sent.bin"/>
<copy file="${test.model.dir}/en-test-sent.bin" todir="${analysis-extras.conf.dir}"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.tokenizer.training -->
<trainModel command="TokenizerTrainer" lang="en" data="tokenizer.txt" model="en-test-tokenizer.bin"/>
<copy file="${test.model.dir}/en-test-tokenizer.bin" todir="${analysis-extras.conf.dir}"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.postagger.training -->
<trainModel command="POSTaggerTrainer" lang="en" data="pos.txt" model="en-test-pos-maxent.bin"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.chunker.training -->
<trainModel command="ChunkerTrainerME" lang="en" data="chunks.txt" model="en-test-chunker.bin"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.training -->
<trainModel command="TokenNameFinderTrainer" lang="en" data="ner_flashman.txt" model="en-test-ner-person.bin">
<extra-args>
<arg value="-params"/>
<arg value="ner_TrainerParams.txt"/>
</extra-args>
</trainModel>
<copy file="${test.model.dir}/en-test-ner-person.bin" todir="${analysis-extras.conf.dir}"/>
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.lemmatizer.training -->
<trainModel command="LemmatizerTrainerME" lang="en" data="lemmas.txt" model="en-test-lemmatizer.bin"/>
</target>
<macrodef name="trainModel">
<attribute name="command"/>
<attribute name="lang"/>
<attribute name="data"/>
<attribute name="model"/>
<element name="extra-args" optional="true"/>
<sequential>
<java classname="opennlp.tools.cmdline.CLI"
dir="${test.model.data.dir}"
fork="true"
failonerror="true">
<classpath>
<path refid="opennlpjars"/>
</classpath>
<arg value="@{command}"/>
<arg value="-lang"/>
<arg value="@{lang}"/>
<arg value="-data"/>
<arg value="@{data}"/>
<arg value="-model"/>
<arg value="${test.model.dir}/@{model}"/>
<extra-args/>
</java>
</sequential>
</macrodef>
<target name="regenerate" depends="train-test-models"/>
</project>

View File

@ -0,0 +1,29 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-opennlp" />
<configurations defaultconfmapping="compile->master">
<conf name="compile" transitive="false"/>
</configurations>
<dependencies>
<dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" transitive="false" conf="compile" />
<dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" transitive="false" conf="compile" />
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
</dependencies>
</ivy-module>

View File

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Run OpenNLP chunker. Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this filter.
* Tags terms in the TypeAttribute, replacing the POS tags previously put there by OpenNLPPOSFilter.
*/
public final class OpenNLPChunkerFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private int tokenNum = 0;
private boolean moreTokensAvailable = true;
private String[] sentenceTerms = null;
private String[] sentenceTermPOSTags = null;
private final NLPChunkerOp chunkerOp;
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
super(input);
this.chunkerOp = chunkerOp;
}
@Override
public final boolean incrementToken() throws IOException {
if ( ! moreTokensAvailable) {
clear();
return false;
}
if (tokenNum == sentenceTokenAttrs.size()) {
nextSentence();
if (sentenceTerms == null) {
clear();
return false;
}
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
private void nextSentence() throws IOException {
List<String> termList = new ArrayList<>();
List<String> posTagList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
termList.add(termAtt.toString());
posTagList.add(typeAtt.type());
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
}
sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
sentenceTermPOSTags = posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
}
private void assignTokenTypes(String[] tags) {
for (int i = 0 ; i < tags.length ; ++i) {
sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
}
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
clear();
}
private void clear() {
sentenceTokenAttrs.clear();
sentenceTerms = null;
sentenceTermPOSTags = null;
tokenNum = 0;
}
}

View File

@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link OpenNLPChunkerFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_opennlp_chunked" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
* &lt;filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/&gt;
* &lt;filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="filename"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @since 7.3.0
*/
public class OpenNLPChunkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String CHUNKER_MODEL = "chunkerModel";
private final String chunkerModelFile;
public OpenNLPChunkerFilterFactory(Map<String,String> args) {
super(args);
chunkerModelFile = get(args, CHUNKER_MODEL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public OpenNLPChunkerFilter create(TokenStream in) {
try {
NLPChunkerOp chunkerOp = null;
if (chunkerModelFile != null) {
chunkerOp = OpenNLPOpsFactory.getChunker(chunkerModelFile);
}
return new OpenNLPChunkerFilter(in, chunkerOp);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public void inform(ResourceLoader loader) {
try {
// load and register read-only models in cache with file/resource names
if (chunkerModelFile != null) {
OpenNLPOpsFactory.getChunkerModel(chunkerModelFile, loader);
}
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}
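
A programmatic equivalent of the schema snippet above, sketched with Lucene's CustomAnalyzer (not part of the commit). The SPI names are assumed to derive from the factory class names (lookup is case-insensitive), and the model files are placeholders borrowing the small test models trained by the module's build file shown earlier, assumed to live in a local "conf" directory:

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class OpenNLPChunkingChainExample {
  public static Analyzer build() throws IOException {
    // sentence + tokenizer models feed the POS tagger, whose tags the chunker consumes
    return CustomAnalyzer.builder(Paths.get("conf"))
        .withTokenizer("opennlp",
            "sentenceModel", "en-test-sent.bin",
            "tokenizerModel", "en-test-tokenizer.bin")
        .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
        .addTokenFilter("opennlpChunker", "chunkerModel", "en-test-chunker.bin")
        .build();
  }
}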

View File

@ -0,0 +1,123 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* <p>Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers.</p>
* <p>
* Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported,
* via the "dictionary" and "lemmatizerModel" params, respectively.
* If both are configured, the dictionary-based lemmatizer is tried first,
* and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
* </p>
* <p>
* The dictionary file must be encoded as UTF-8, with one entry per line,
* in the form <tt>word[tab]lemma[tab]part-of-speech</tt>
* </p>
*/
public class OpenNLPLemmatizerFilter extends TokenFilter {
private final NLPLemmatizerOp lemmatizerOp;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
private boolean moreTokensAvailable = true;
private String[] sentenceTokens = null; // non-keyword tokens
private String[] sentenceTokenTypes = null; // types for non-keyword tokens
private String[] lemmas = null; // lemmas for non-keyword tokens
private int lemmaNum = 0; // lemma counter
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
super(input);
this.lemmatizerOp = lemmatizerOp;
}
@Override
public final boolean incrementToken() throws IOException {
if ( ! moreTokensAvailable) {
clear();
return false;
}
if (sentenceTokenAttrsIter == null || ! sentenceTokenAttrsIter.hasNext()) {
nextSentence();
if (sentenceTokens == null) { // zero non-keyword tokens
clear();
return false;
}
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
lemmaNum = 0;
sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
}
clearAttributes();
sentenceTokenAttrsIter.next().copyTo(this);
if ( ! keywordAtt.isKeyword()) {
termAtt.setEmpty().append(lemmas[lemmaNum++]);
}
return true;
}
private void nextSentence() throws IOException {
List<String> tokenList = new ArrayList<>();
List<String> typeList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
if ( ! keywordAtt.isKeyword()) {
tokenList.add(termAtt.toString());
typeList.add(typeAtt.type());
}
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
}
sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
clear();
}
private void clear() {
sentenceTokenAttrs.clear();
sentenceTokenAttrsIter = null;
sentenceTokens = null;
sentenceTokenTypes = null;
lemmas = null;
lemmaNum = 0;
}
}
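
The keyword checks above mean keyword-flagged tokens bypass lemmatization; a sketch (not from the commit) exploiting that with Lucene's KeywordRepeatFilter, so both the surface form and its lemma are indexed at the same position:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.opennlp.OpenNLPLemmatizerFilter;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;

public class LemmaPlusOriginalExample {
  // "input" is assumed to be an OpenNLPTokenizer chain (plus OpenNLPPOSFilter when a MaxEnt model is used)
  static TokenStream lemmaPlusOriginal(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
    TokenStream stream = new KeywordRepeatFilter(input);         // emit each token twice; the first copy is keyword-flagged
    stream = new OpenNLPLemmatizerFilter(stream, lemmatizerOp);  // the keyword copy passes through unchanged
    return new RemoveDuplicatesTokenFilter(stream);              // drop the duplicate when lemma == original
  }
}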

View File

@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link OpenNLPLemmatizerFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_opennlp_lemma" class="solr.TextField" positionIncrementGap="100"
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
* sentenceModel="filename"
* tokenizerModel="filename"/&gt;
* /&gt;
* &lt;filter class="solr.OpenNLPLemmatizerFilterFactory"
* dictionary="filename"
* lemmatizerModel="filename"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @since 7.3.0
*/
public class OpenNLPLemmatizerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String DICTIONARY = "dictionary";
public static final String LEMMATIZER_MODEL = "lemmatizerModel";
private final String dictionaryFile;
private final String lemmatizerModelFile;
public OpenNLPLemmatizerFilterFactory(Map<String,String> args) {
super(args);
dictionaryFile = get(args, DICTIONARY);
lemmatizerModelFile = get(args, LEMMATIZER_MODEL);
if (dictionaryFile == null && lemmatizerModelFile == null) {
throw new IllegalArgumentException("Configuration Error: missing parameter: at least one of '"
+ DICTIONARY + "' and '" + LEMMATIZER_MODEL + "' must be provided.");
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public OpenNLPLemmatizerFilter create(TokenStream in) {
try {
NLPLemmatizerOp lemmatizerOp = OpenNLPOpsFactory.getLemmatizer(dictionaryFile, lemmatizerModelFile);
return new OpenNLPLemmatizerFilter(in, lemmatizerOp);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void inform(ResourceLoader loader) throws IOException {
// register models in cache with file/resource names
if (dictionaryFile != null) {
OpenNLPOpsFactory.getLemmatizerDictionary(dictionaryFile, loader);
}
if (lemmatizerModelFile != null) {
OpenNLPOpsFactory.getLemmatizerModel(lemmatizerModelFile, loader);
}
}
}

View File

@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Run OpenNLP POS tagger. Tags all terms in the TypeAttribute.
*/
public final class OpenNLPPOSFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
String[] tags = null;
private int tokenNum = 0;
private boolean moreTokensAvailable = true;
private final NLPPOSTaggerOp posTaggerOp;
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
super(input);
this.posTaggerOp = posTaggerOp;
}
@Override
public final boolean incrementToken() throws IOException {
if ( ! moreTokensAvailable) {
clear();
return false;
}
if (tokenNum == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
String[] sentenceTokens = nextSentence();
if (sentenceTokens == null) {
clear();
return false;
}
tags = posTaggerOp.getPOSTags(sentenceTokens);
tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum).copyTo(this);
typeAtt.setType(tags[tokenNum++]);
return true;
}
private String[] nextSentence() throws IOException {
List<String> termList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
termList.add(termAtt.toString());
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
}
return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
}
private void clear() {
sentenceTokenAttrs.clear();
tags = null;
tokenNum = 0;
}
}

View File

@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link OpenNLPPOSFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_opennlp_pos" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
* &lt;filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @since 7.3.0
*/
public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String POS_TAGGER_MODEL = "posTaggerModel";
private final String posTaggerModelFile;
public OpenNLPPOSFilterFactory(Map<String,String> args) {
super(args);
posTaggerModelFile = require(args, POS_TAGGER_MODEL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public OpenNLPPOSFilter create(TokenStream in) {
try {
return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public void inform(ResourceLoader loader) {
try { // load and register the read-only model in cache with file/resource name
OpenNLPOpsFactory.getPOSTaggerModel(posTaggerModelFile, loader);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}

View File

@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.util.CharArrayIterator;
/**
* A {@link BreakIterator} that splits sentences using an OpenNLP sentence chunking model.
*/
public final class OpenNLPSentenceBreakIterator extends BreakIterator {
private CharacterIterator text;
private int currentSentence;
private int[] sentenceStarts;
private NLPSentenceDetectorOp sentenceOp;
public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp) {
this.sentenceOp = sentenceOp;
}
@Override
public int current() {
return text.getIndex();
}
@Override
public int first() {
currentSentence = 0;
text.setIndex(text.getBeginIndex());
return current();
}
@Override
public int last() {
if (sentenceStarts.length > 0) {
currentSentence = sentenceStarts.length - 1;
text.setIndex(text.getEndIndex());
} else { // there are no sentences; both the first and last positions are the begin index
currentSentence = 0;
text.setIndex(text.getBeginIndex());
}
return current();
}
@Override
public int next() {
if (text.getIndex() == text.getEndIndex() || 0 == sentenceStarts.length) {
return DONE;
} else if (currentSentence < sentenceStarts.length - 1) {
text.setIndex(sentenceStarts[++currentSentence]);
return current();
} else {
return last();
}
}
@Override
public int following(int pos) {
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
throw new IllegalArgumentException("offset out of bounds");
} else if (0 == sentenceStarts.length) {
text.setIndex(text.getBeginIndex());
return DONE;
} else if (pos >= sentenceStarts[sentenceStarts.length - 1]) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
text.setIndex(text.getEndIndex());
currentSentence = sentenceStarts.length - 1;
return DONE;
} else { // there are at least two sentences
currentSentence = (sentenceStarts.length - 1) / 2; // start search from the middle
moveToSentenceAt(pos, 0, sentenceStarts.length - 2);
text.setIndex(sentenceStarts[++currentSentence]);
return current();
}
}
/** Binary search over sentences */
private void moveToSentenceAt(int pos, int minSentence, int maxSentence) {
if (minSentence != maxSentence) {
if (pos < sentenceStarts[currentSentence]) {
int newMaxSentence = currentSentence - 1;
currentSentence = minSentence + (currentSentence - minSentence) / 2;
moveToSentenceAt(pos, minSentence, newMaxSentence);
} else if (pos >= sentenceStarts[currentSentence + 1]) {
int newMinSentence = currentSentence + 1;
currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
moveToSentenceAt(pos, newMinSentence, maxSentence);
}
} else {
assert currentSentence == minSentence;
assert pos >= sentenceStarts[currentSentence];
assert (currentSentence == sentenceStarts.length - 1 && pos <= text.getEndIndex())
|| pos < sentenceStarts[currentSentence + 1];
}
// we have arrived - nothing to do
}
@Override
public int previous() {
if (text.getIndex() == text.getBeginIndex()) {
return DONE;
} else {
if (0 == sentenceStarts.length) {
text.setIndex(text.getBeginIndex());
return DONE;
}
if (text.getIndex() == text.getEndIndex()) {
text.setIndex(sentenceStarts[currentSentence]);
} else {
text.setIndex(sentenceStarts[--currentSentence]);
}
return current();
}
}
@Override
public int preceding(int pos) {
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
throw new IllegalArgumentException("offset out of bounds");
} else if (0 == sentenceStarts.length) {
text.setIndex(text.getBeginIndex());
currentSentence = 0;
return DONE;
} else if (pos < sentenceStarts[0]) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
text.setIndex(text.getBeginIndex());
currentSentence = 0;
return DONE;
} else {
currentSentence = sentenceStarts.length / 2; // start search from the middle
moveToSentenceAt(pos, 0, sentenceStarts.length - 1);
if (0 == currentSentence) {
text.setIndex(text.getBeginIndex());
return DONE;
} else {
text.setIndex(sentenceStarts[--currentSentence]);
return current();
}
}
}
@Override
public int next(int n) {
currentSentence += n;
if (n < 0) {
if (text.getIndex() == text.getEndIndex()) {
++currentSentence;
}
if (currentSentence < 0) {
currentSentence = 0;
text.setIndex(text.getBeginIndex());
return DONE;
} else {
text.setIndex(sentenceStarts[currentSentence]);
}
} else if (n > 0) {
if (currentSentence >= sentenceStarts.length) {
currentSentence = sentenceStarts.length - 1;
text.setIndex(text.getEndIndex());
return DONE;
} else {
text.setIndex(sentenceStarts[currentSentence]);
}
}
return current();
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public void setText(CharacterIterator newText) {
text = newText;
text.setIndex(text.getBeginIndex());
currentSentence = 0;
Span[] spans = sentenceOp.splitSentences(characterIteratorToString());
sentenceStarts = new int[spans.length];
for (int i = 0; i < spans.length; ++i) {
// Adjust start positions to match those of the passed-in CharacterIterator
sentenceStarts[i] = spans[i].getStart() + text.getBeginIndex();
}
}
private String characterIteratorToString() {
String fullText;
if (text instanceof CharArrayIterator) {
CharArrayIterator charArrayIterator = (CharArrayIterator)text;
fullText = new String(charArrayIterator.getText(), charArrayIterator.getStart(), charArrayIterator.getLength());
} else {
// TODO: is there a better way to extract full text from arbitrary CharacterIterators?
StringBuilder builder = new StringBuilder();
for (char ch = text.first(); ch != CharacterIterator.DONE; ch = text.next()) {
builder.append(ch);
}
fullText = builder.toString();
text.setIndex(text.getBeginIndex());
}
return fullText;
}
}
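
A usage sketch (not in the commit) driving the iterator through the standard java.text.BreakIterator protocol; the NLPSentenceDetectorOp is assumed to come from elsewhere, e.g. the module's OpenNLPOpsFactory:

import java.text.BreakIterator;
import java.text.StringCharacterIterator;
import org.apache.lucene.analysis.opennlp.OpenNLPSentenceBreakIterator;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;

public class SentenceIterationExample {
  static void printSentences(NLPSentenceDetectorOp sentenceOp, String text) {
    BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceOp);
    bi.setText(new StringCharacterIterator(text));
    // Standard BreakIterator loop: each [start, end) span is one detected sentence
    int start = bi.first();
    for (int end = bi.next(); end != BreakIterator.DONE; start = end, end = bi.next()) {
      System.out.println(text.substring(start, end));
    }
  }
}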

View File

@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
/**
* Run OpenNLP SentenceDetector and Tokenizer.
* The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
* following filters can use this information to apply operations to tokens one sentence at a time.
*/
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
public static final int EOS_FLAG_BIT = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private Span[] termSpans = null;
private int termNum = 0;
private int sentenceStart = 0;
private NLPSentenceDetectorOp sentenceOp = null;
private NLPTokenizerOp tokenizerOp = null;
public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
if (sentenceOp == null || tokenizerOp == null) {
throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
}
this.sentenceOp = sentenceOp;
this.tokenizerOp = tokenizerOp;
}
@Override
public void close() throws IOException {
super.close();
termSpans = null;
termNum = sentenceStart = 0;
}
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
termSpans = tokenizerOp.getTerms(sentenceText);
termNum = 0;
}
@Override
protected boolean incrementWord() {
if (termSpans == null || termNum == termSpans.length) {
return false;
}
clearAttributes();
Span term = termSpans[termNum];
termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
correctOffset(offset + sentenceStart + term.getEnd()));
if (termNum == termSpans.length - 1) {
flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
}
++termNum;
return true;
}
@Override
public void reset() throws IOException {
super.reset();
termSpans = null;
termNum = sentenceStart = 0;
}
}
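Downstream filters detect sentence boundaries by testing this flag on each token. A minimal sketch of such a consumer, assuming it lives in the same package; the class name is illustrative and not part of this commit:

package org.apache.lucene.analysis.opennlp;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

// Illustrative filter: reacts to tokens carrying EOS_FLAG_BIT.
final class SentenceEndAwareFilter extends TokenFilter {
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

  SentenceEndAwareFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if ( ! input.incrementToken()) {
      return false;
    }
    if ((flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT) != 0) {
      // this is the last token of the current sentence; apply per-sentence logic here
    }
    return true;
  }
}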

View File

@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
/**
* Factory for {@link OpenNLPTokenizer}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_opennlp" class="solr.TextField" positionIncrementGap="100"
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @since 7.3.0
*/
public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
public static final String SENTENCE_MODEL = "sentenceModel";
public static final String TOKENIZER_MODEL = "tokenizerModel";
private final String sentenceModelFile;
private final String tokenizerModelFile;
public OpenNLPTokenizerFactory(Map<String,String> args) {
super(args);
sentenceModelFile = require(args, SENTENCE_MODEL);
tokenizerModelFile = require(args, TOKENIZER_MODEL);
if ( ! args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public OpenNLPTokenizer create(AttributeFactory factory) {
try {
NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void inform(ResourceLoader loader) throws IOException {
// register models in cache with file/resource names
if (sentenceModelFile != null) {
OpenNLPOpsFactory.getSentenceModel(sentenceModelFile, loader);
}
if (tokenizerModelFile != null) {
OpenNLPOpsFactory.getTokenizerModel(tokenizerModelFile, loader);
}
}
}
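Outside a Solr schema, the factory is reachable through Lucene's CustomAnalyzer builder under the SPI name "opennlp", as the tests in this commit do. A minimal sketch; the model file names are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

// Illustrative helper, not part of this commit.
class OpenNLPAnalyzerExample {
  static Analyzer build() throws IOException {
    return CustomAnalyzer.builder(new ClasspathResourceLoader(OpenNLPAnalyzerExample.class))
        .withTokenizer("opennlp",
            "sentenceModel", "en-sent.bin",    // placeholder model file name
            "tokenizerModel", "en-token.bin")  // placeholder model file name
        .build();
  }
}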

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Analysis components based on OpenNLP
*/
package org.apache.lucene.analysis.opennlp;

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
/**
* Supplies the OpenNLP Chunking tool.
* Requires binary models from the OpenNLP project on SourceForge.
*/
public class NLPChunkerOp {
private ChunkerME chunker = null;
public NLPChunkerOp(ChunkerModel chunkerModel) throws IOException {
chunker = new ChunkerME(chunkerModel);
}
public synchronized String[] getChunks(String[] words, String[] tags, double[] probs) {
String[] chunks = chunker.chunk(words, tags);
if (probs != null)
chunker.probs(probs);
return chunks;
}
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
/**
* <p>Supply OpenNLP Lemmatizer tools.</p>
* <p>
* Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported.
* If both are configured, the dictionary-based lemmatizer is tried first,
* and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
* </p>
* <p>
* The MaxEnt implementation requires binary models from OpenNLP project on SourceForge.
* </p>
*/
public class NLPLemmatizerOp {
private final DictionaryLemmatizer dictionaryLemmatizer;
private final LemmatizerME lemmatizerME;
public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) throws IOException {
assert dictionary != null || lemmatizerModel != null : "At least one parameter must be non-null";
dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
}
public String[] lemmatize(String[] words, String[] postags) {
String[] lemmas = null;
String[] maxEntLemmas = null;
if (dictionaryLemmatizer != null) {
lemmas = dictionaryLemmatizer.lemmatize(words, postags);
for (int i = 0; i < lemmas.length; ++i) {
if (lemmas[i].equals("O")) { // this word is not in the dictionary
if (lemmatizerME != null) { // fall back to the MaxEnt lemmatizer if it's enabled
if (maxEntLemmas == null) {
maxEntLemmas = lemmatizerME.lemmatize(words, postags);
}
if ("_".equals(maxEntLemmas[i])) {
lemmas[i] = words[i]; // put back the original word if no lemma is found
} else {
lemmas[i] = maxEntLemmas[i];
}
} else { // there is no MaxEnt lemmatizer
lemmas[i] = words[i]; // put back the original word if no lemma is found
}
}
}
} else { // there is only a MaxEnt lemmatizer
maxEntLemmas = lemmatizerME.lemmatize(words, postags);
for (int i = 0 ; i < maxEntLemmas.length ; ++i) {
if ("_".equals(maxEntLemmas[i])) {
maxEntLemmas[i] = words[i]; // put back the original word if no lemma is found
}
}
lemmas = maxEntLemmas;
}
return lemmas;
}
}
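For example, a dictionary-only instance built over the single entry "sent VBD send" (the same mapping that appears in the test dictionary later in this commit) resolves in-vocabulary tokens and passes everything else through unchanged. A sketch:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

// Illustrative helper, not part of this commit.
class LemmatizerExample {
  static String[] lemmatizeSample() throws IOException {
    // Dictionary-only lemmatization: no MaxEnt model is configured.
    InputStream dictionary = new ByteArrayInputStream(
        "sent\tVBD\tsend\n".getBytes(StandardCharsets.UTF_8));
    NLPLemmatizerOp lemmatizer = new NLPLemmatizerOp(dictionary, null);
    // Returns {"They", "send", "him"}: only "sent"/VBD is in the dictionary,
    // so the other words are put back unchanged.
    return lemmatizer.lemmatize(
        new String[] {"They", "sent", "him"},
        new String[] {"NNP", "VBD", "PRP"});
  }
}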

View File

@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;
/**
* Supplies the OpenNLP Named Entity Recognition tool.
* Requires binary models from the OpenNLP project on SourceForge.
*
* Usage: from <a href="http://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.recognition.api"
* >the OpenNLP documentation</a>:
*
* "The NameFinderME class is not thread safe, it must only be called from one thread.
* To use multiple threads multiple NameFinderME instances sharing the same model instance
* can be created. The input text should be segmented into documents, sentences and tokens.
* To perform entity detection an application calls the find method for every sentence in
* the document. After every document clearAdaptiveData must be called to clear the adaptive
* data in the feature generators. Not calling clearAdaptiveData can lead to a sharp drop
* in the detection rate after a few documents."
*
*/
public class NLPNERTaggerOp {
private final TokenNameFinder nameFinder;
public NLPNERTaggerOp(TokenNameFinderModel model) {
this.nameFinder = new NameFinderME(model);
}
public Span[] getNames(String[] words) {
Span[] names = nameFinder.find(words);
return names;
}
public synchronized void reset() {
nameFinder.clearAdaptiveData();
}
}
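Following that guidance, a caller tags one sentence at a time and resets after each document. A minimal sketch with hypothetical inputs:

import opennlp.tools.util.Span;

// Illustrative driver, not part of this commit.
class NERTaggerExample {
  // Each document is an array of sentences; each sentence is an array of tokens.
  static void tagDocuments(NLPNERTaggerOp ner, String[][][] documents) {
    for (String[][] document : documents) {
      for (String[] sentenceTokens : document) {
        Span[] names = ner.getNames(sentenceTokens);
        // consume the name spans here
      }
      ner.reset(); // clear adaptive data after every document
    }
  }
}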

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
/**
* Supplies the OpenNLP Part-of-Speech Tagging tool.
* Requires binary models from the OpenNLP project on SourceForge.
*/
public class NLPPOSTaggerOp {
private POSTagger tagger = null;
public NLPPOSTaggerOp(POSModel model) throws IOException {
tagger = new POSTaggerME(model);
}
public synchronized String[] getPOSTags(String[] words) {
return tagger.tag(words);
}
}
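NLPChunkerOp.getChunks(...) consumes this tagger's output for the same token array, so the two ops run back to back. A sketch, assuming both ops were built from loaded models:

// Illustrative helper, not part of this commit.
class ChunkingExample {
  static String[] chunk(NLPPOSTaggerOp posTagger, NLPChunkerOp chunker, String[] words) {
    String[] tags = posTagger.getPOSTags(words); // e.g. {"NN", "NN", "CD", ...}
    return chunker.getChunks(words, tags, null); // IOB labels such as "B-NP", "I-NP", "O"
  }
}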

View File

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.Span;
/**
* Supplies the OpenNLP Sentence Detector tool.
* Requires binary models from the OpenNLP project on SourceForge.
*/
public class NLPSentenceDetectorOp {
private final SentenceDetectorME sentenceSplitter;
public NLPSentenceDetectorOp(SentenceModel model) throws IOException {
sentenceSplitter = new SentenceDetectorME(model);
}
public NLPSentenceDetectorOp() {
sentenceSplitter = null;
}
public synchronized Span[] splitSentences(String line) {
if (sentenceSplitter != null) {
return sentenceSplitter.sentPosDetect(line);
} else {
Span[] shorty = new Span[1];
shorty[0] = new Span(0, line.length());
return shorty;
}
}
}
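The returned spans carry character offsets rather than substrings, so callers recover the sentence text from the original line. A short sketch:

import opennlp.tools.util.Span;

// Illustrative helper, not part of this commit.
class SentenceSplitExample {
  static void printSentences(NLPSentenceDetectorOp detectorOp, String line) {
    for (Span span : detectorOp.splitSentences(line)) {
      System.out.println(line.substring(span.getStart(), span.getEnd()));
    }
  }
}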

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
/**
* Supplies the OpenNLP Tokenizer tool, which operates on one sentence at a time.
* Requires binary models from the OpenNLP project on SourceForge.
*/
public class NLPTokenizerOp {
private final Tokenizer tokenizer;
public NLPTokenizerOp(TokenizerModel model) {
tokenizer = new TokenizerME(model);
}
public NLPTokenizerOp() {
tokenizer = null;
}
public synchronized Span[] getTerms(String sentence) {
if (tokenizer == null) {
Span[] span1 = new Span[1];
span1[0] = new Span(0, sentence.length());
return span1;
}
return tokenizer.tokenizePos(sentence);
}
}

View File

@ -0,0 +1,176 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.lucene.analysis.util.ResourceLoader;
/**
* Supplies OpenNLP components and caches their model objects.
* Assumes model files are thread-safe.
*/
public class OpenNLPOpsFactory {
private static Map<String,SentenceModel> sentenceModels = new ConcurrentHashMap<>();
private static Map<String,TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
private static Map<String,POSModel> posTaggerModels = new ConcurrentHashMap<>();
private static Map<String,ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
private static Map<String,TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
private static Map<String,LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
private static Map<String,String> lemmaDictionaries = new ConcurrentHashMap<>();
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
if (modelName != null) {
SentenceModel model = sentenceModels.get(modelName);
return new NLPSentenceDetectorOp(model);
} else {
return new NLPSentenceDetectorOp();
}
}
public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
SentenceModel model = sentenceModels.get(modelName);
if (model == null) {
model = new SentenceModel(loader.openResource(modelName));
sentenceModels.put(modelName, model);
}
return model;
}
public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
if (modelName == null) {
return new NLPTokenizerOp();
} else {
TokenizerModel model = tokenizerModels.get(modelName);
return new NLPTokenizerOp(model);
}
}
public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
TokenizerModel model = tokenizerModels.get(modelName);
if (model == null) {
model = new TokenizerModel(loader.openResource(modelName));
tokenizerModels.put(modelName, model);
}
return model;
}
public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
POSModel model = posTaggerModels.get(modelName);
return new NLPPOSTaggerOp(model);
}
public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
POSModel model = posTaggerModels.get(modelName);
if (model == null) {
model = new POSModel(loader.openResource(modelName));
posTaggerModels.put(modelName, model);
}
return model;
}
public static NLPChunkerOp getChunker(String modelName) throws IOException {
ChunkerModel model = chunkerModels.get(modelName);
return new NLPChunkerOp(model);
}
public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
ChunkerModel model = chunkerModels.get(modelName);
if (model == null) {
model = new ChunkerModel(loader.openResource(modelName));
chunkerModels.put(modelName, model);
}
return model;
}
public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
TokenNameFinderModel model = nerModels.get(modelName);
return new NLPNERTaggerOp(model);
}
public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
TokenNameFinderModel model = nerModels.get(modelName);
if (model == null) {
model = new TokenNameFinderModel(loader.openResource(modelName));
nerModels.put(modelName, model);
}
return model;
}
public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
InputStream dictionaryInputStream = null;
if (dictionaryFile != null) {
String dictionary = lemmaDictionaries.get(dictionaryFile);
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
}
LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
}
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
String dictionary = lemmaDictionaries.get(dictionaryFile);
if (dictionary == null) {
Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] chars = new char[8192];
int numRead = 0;
do {
numRead = reader.read(chars, 0, chars.length);
if (numRead > 0) {
builder.append(chars, 0, numRead);
}
} while (numRead > 0);
dictionary = builder.toString();
lemmaDictionaries.put(dictionaryFile, dictionary);
}
return dictionary;
}
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
LemmatizerModel model = lemmatizerModels.get(modelName);
if (model == null) {
model = new LemmatizerModel(loader.openResource(modelName));
lemmatizerModels.put(modelName, model);
}
return model;
}
// keeps unit test from blowing out memory
public static void clearModels() {
sentenceModels.clear();
tokenizerModels.clear();
posTaggerModels.clear();
chunkerModels.clear();
nerModels.clear();
lemmatizerModels.clear();
lemmaDictionaries.clear();
}
}
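The factories in this commit use this class in two phases: inform(ResourceLoader) loads a model into the cache under its resource name, and create(...) later builds a lightweight op over the cached instance. The same flow outside a factory, as a sketch with a placeholder model name:

import java.io.IOException;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;

// Illustrative helper, not part of this commit.
class ModelCacheExample {
  static NLPSentenceDetectorOp load() throws IOException {
    ResourceLoader loader = new ClasspathResourceLoader(ModelCacheExample.class);
    // Phase 1 (inform time): read the model resource and cache it by name.
    OpenNLPOpsFactory.getSentenceModel("en-sent.bin", loader); // placeholder name
    // Phase 2 (create time): build an op over the cached model.
    return OpenNLPOpsFactory.getSentenceDetector("en-sent.bin");
  }
}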

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Tools to supply access to OpenNLP components.
*/
package org.apache.lucene.analysis.opennlp.tools;

View File

@ -0,0 +1,61 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>
Apache Lucene OpenNLP integration module
</title>
</head>
<body>
<p>
This module exposes functionality from
<a href="http://opennlp.apache.org">Apache OpenNLP</a> to Apache Lucene.
The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text.
<p>
For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
<p>
The OpenNLP Tokenizer behavior is similar to the WhitespaceTokenizer but is smart about
inter-word punctuation. The term stream looks very much like the way you parse words and
punctuation while reading. The major difference between this tokenizer and most other
tokenizers shipped with Lucene is that punctuation is tokenized. This is required for
the following taggers to operate properly.
<p>
The OpenNLP taggers annotate terms using the <code>TypeAttribute</code>.
<ul>
<li><code>OpenNLPTokenizer</code> segments text into sentences and then into words. This Tokenizer
uses the OpenNLP Sentence Detector and Tokenizer classes together: the Tokenizer receives one
detected sentence at a time, which lets it do a better job.</li>
<li><code>OpenNLPFilter</code> tags words using one or more technologies: Part-of-Speech,
Chunking, and Named Entity Recognition. These tags are assigned as token types. Note that
only one of these operations will tag each token.
</li>
</ul>
<p>
Since the <code>TypeAttribute</code> is not stored in the index, it is recommended that one
of these filters be used following <code>OpenNLPFilter</code> to enable search against the
assigned tags:
<ul>
<li><code>TypeAsPayloadFilter</code> copies the <code>TypeAttribute</code> value to the
<code>PayloadAttribute</code></li>
<li><code>TypeAsSynonymFilter</code> creates a cloned token at the same position as each
tagged token, and copies the <code>TypeAttribute</code> value to the <code>CharTermAttribute</code>,
optionally with a customized prefix (so that tags effectively occupy a different namespace
from token text).</li>
</ul>
</body>
</html>
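A complete chain along these lines, mirroring the factory tests later in this commit; model file names are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

// Illustrative helper, not part of this commit: tokenize, POS-tag,
// then copy each token's type into its payload.
class PosPayloadChainExample {
  static Analyzer build() throws IOException {
    return CustomAnalyzer.builder(new ClasspathResourceLoader(PosPayloadChainExample.class))
        .withTokenizer("opennlp", "tokenizerModel", "en-token.bin", "sentenceModel", "en-sent.bin")
        .addTokenFilter("opennlpPOS", "posTaggerModel", "en-pos-maxent.bin")
        .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
        .build();
  }
}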

View File

@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.analysis.opennlp.OpenNLPChunkerFilterFactory
org.apache.lucene.analysis.opennlp.OpenNLPLemmatizerFilterFactory
org.apache.lucene.analysis.opennlp.OpenNLPPOSFilterFactory

View File

@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.analysis.opennlp.OpenNLPTokenizerFactory

View File

@ -0,0 +1,12 @@
they NNP they
sent VBD send
him PRP he
running VBG run
in IN in
the DT the
evening NN evening
he PRP he
did VBD do
not RB not
come VB come
back RB back

View File

@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
/**
* Needs the OpenNLP Tokenizer because it produces a token stream that includes punctuation.
* Needs the OpenNLP POS tagger for the POS tags.
*
* Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
*/
public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
private static final String[] SENTENCES_punc
= {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
private static final String[] SENTENCES_chunks
= { "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "O" };
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
private static final String chunkerModelFile = "en-test-chunker.bin";
private static byte[][] toPayloads(String... strings) {
return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
}
public void testBasic() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
SENTENCES_chunks, null, null, true);
}
public void testPayloads() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
null, null, null, true, toPayloads(SENTENCES_chunks));
}
}

View File

@ -0,0 +1,169 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
private static final String SENTENCE = "They sent him running in the evening.";
private static final String[] SENTENCE_dict_punc = {"they", "send", "he", "run", "in", "the", "evening", "."};
private static final String[] SENTENCE_maxent_punc = {"they", "send", "he", "runn", "in", "the", "evening", "."};
private static final String[] SENTENCE_posTags = {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."};
private static final String SENTENCES = "They sent him running in the evening. He did not come back.";
private static final String[] SENTENCES_dict_punc
= {"they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_maxent_punc
= {"they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_posTags
= {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."};
private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
private static final String[] SENTENCE_both_punc
= {"konstantin", "kalashnitsov", "constantly", "caliph", "."};
private static final String[] SENTENCE_both_posTags
= {"IN", "JJ", "NN", "VBN", "."};
private static final String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
private static final String[] SENTENCES_both_punc
= {"konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "."};
private static final String[] SENTENCES_both_posTags
= {"IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "."};
private static final String[] SENTENCES_dict_keep_orig_punc
= {"They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_max_ent_keep_orig_punc
= {"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_keep_orig_posTags
= {"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", "VBD", "VBD", "RB", "VB", "RB", "."};
private static final String[] SENTENCES_both_keep_orig_punc
= {"Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "."};
private static final String[] SENTENCES_both_keep_orig_posTags
= {"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."};
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
private static final String lemmatizerDictFile = "en-test-lemmas.dict";
public void test1SentenceDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
SENTENCE_posTags, null, null, true);
}
public void test2SentencesDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
SENTENCES_posTags, null, null, true);
}
public void test1SentenceMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
SENTENCE_posTags, null, null, true);
}
public void test2SentencesMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
SENTENCES_posTags, null, null, true);
}
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
SENTENCE_both_posTags, null, null, true);
}
public void test2SentencesDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
SENTENCES_both_posTags, null, null, true);
}
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}
public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}
public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
SENTENCES_both_keep_orig_posTags, null, null, true);
}
}

View File

@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
/**
* Needs the OpenNLP Tokenizer because it produces a token stream that includes punctuation.
* The POS model is based on this tokenization.
*
* Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
*/
public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
private static final String[] SENTENCES_punc
= {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
private static final String[] SENTENCES_posTags
= {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};
private static final String NAMES2 = "Royal Flash is a tale about Harry Flashman.";
private static final String[] NAMES2_punc = {"Royal", "Flash", "is", "a", "tale", "about", "Harry", "Flashman", "."};
private static final String[] NAMES2_OUT = { "word", "word", "word", "word", "word", "word", "word", "person", "word" };
private static final String NO_BREAK = "No period";
private static final String[] NO_BREAK_terms = {"No", "period"};
private static final int[] NO_BREAK_startOffsets = {0, 3};
private static final int[] NO_BREAK_endOffsets = {2, 9};
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
private static byte[][] toPayloads(String... strings) {
return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
}
public void testBasic() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
}
public void testPOS() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
SENTENCES_posTags, null, null, true);
analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
null, null, null, true, toPayloads(SENTENCES_posTags));
}
public void testNoBreak() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.build();
assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
null, null, null, true);
}
}

View File

@ -0,0 +1,201 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.util.CharArrayIterator;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.BeforeClass;
public class TestOpenNLPSentenceBreakIterator extends LuceneTestCase {
private static final String TEXT
// 111
// 111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999000
// 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
= "Sentence number 1 has 6 words. Sentence number 2, 5 words. And finally, sentence number 3 has 8 words.";
private static final String[] SENTENCES = new String[] {
"Sentence number 1 has 6 words. ", "Sentence number 2, 5 words. ", "And finally, sentence number 3 has 8 words." };
private static final String PADDING = " Word. Word. ";
private static final String sentenceModelFile = "en-test-sent.bin";
@BeforeClass
public static void populateCache() throws IOException {
OpenNLPOpsFactory.getSentenceModel
(sentenceModelFile, new ClasspathResourceLoader(TestOpenNLPSentenceBreakIterator.class));
}
public void testThreeSentences() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(TEXT); // String is converted to StringCharacterIterator
do3SentenceTest(bi);
bi.setText(getCharArrayIterator(TEXT));
do3SentenceTest(bi);
}
private CharacterIterator getCharArrayIterator(String text) {
return getCharArrayIterator(text, 0, text.length());
}
private CharacterIterator getCharArrayIterator(String text, int start, int length) {
CharArrayIterator charArrayIterator = new CharArrayIterator() {
// Lie about all surrogates to the sentence tokenizer,
// instead we treat them all as SContinue so we won't break around them.
@Override
protected char jreBugWorkaround(char ch) {
return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
}
};
charArrayIterator.setText(text.toCharArray(), start, length);
return charArrayIterator;
}
private void do3SentenceTest(BreakIterator bi) {
assertEquals(0, bi.current());
assertEquals(0, bi.first());
assertEquals(SENTENCES[0], TEXT.substring(bi.current(), bi.next()));
assertEquals(SENTENCES[1], TEXT.substring(bi.current(), bi.next()));
int current = bi.current();
assertEquals(bi.getText().getEndIndex(), bi.next());
int next = bi.current();
assertEquals(SENTENCES[2], TEXT.substring(current, next));
assertEquals(BreakIterator.DONE, bi.next());
assertEquals(TEXT.length(), bi.last());
int end = bi.current();
assertEquals(SENTENCES[2], TEXT.substring(bi.previous(), end));
end = bi.current();
assertEquals(SENTENCES[1], TEXT.substring(bi.previous(), end));
end = bi.current();
assertEquals(SENTENCES[0], TEXT.substring(bi.previous(), end));
assertEquals(BreakIterator.DONE, bi.previous());
assertEquals(0, bi.current());
assertEquals(59, bi.following(39));
assertEquals(59, bi.following(31));
assertEquals(31, bi.following(30));
assertEquals(0, bi.preceding(57));
assertEquals(0, bi.preceding(58));
assertEquals(31, bi.preceding(59));
assertEquals(0, bi.first());
assertEquals(59, bi.next(2));
assertEquals(0, bi.next(-2));
}
public void testSingleSentence() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(SENTENCES[0]));
test1Sentence(bi, SENTENCES[0]);
}
private void test1Sentence(BreakIterator bi, String text) {
int start = bi.getText().getBeginIndex();
assertEquals(start, bi.first());
int current = bi.current();
assertEquals(bi.getText().getEndIndex(), bi.next());
int end = bi.current() - start;
assertEquals(text, text.substring(current - start, end - start));
assertEquals(text.length(), bi.last() - start);
end = bi.current();
bi.previous();
assertEquals(BreakIterator.DONE, bi.previous());
int previous = bi.current();
assertEquals(text, text.substring(previous - start, end - start));
assertEquals(start, bi.current());
assertEquals(BreakIterator.DONE, bi.following(bi.last() / 2 + start));
assertEquals(BreakIterator.DONE, bi.preceding(bi.last() / 2 + start));
assertEquals(start, bi.first());
assertEquals(BreakIterator.DONE, bi.next(13));
assertEquals(BreakIterator.DONE, bi.next(-8));
}
public void testSliceEnd() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(SENTENCES[0] + PADDING, 0, SENTENCES[0].length()));
test1Sentence(bi, SENTENCES[0]);
}
public void testSliceStart() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(PADDING + SENTENCES[0], PADDING.length(), SENTENCES[0].length()));
test1Sentence(bi, SENTENCES[0]);
}
public void testSliceMiddle() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(PADDING + SENTENCES[0] + PADDING, PADDING.length(), SENTENCES[0].length()));
test1Sentence(bi, SENTENCES[0]);
}
/** the current position must be ignored, initial position is always first() */
public void testFirstPosition() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(SENTENCES[0]));
assertEquals(SENTENCES[0].length(), bi.last()); // side-effect: set current position to last()
test1Sentence(bi, SENTENCES[0]);
}
public void testWhitespaceOnly() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(" \n \n\n\r\n\t \n");
test0Sentences(bi);
}
public void testEmptyString() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText("");
test0Sentences(bi);
}
private void test0Sentences(BreakIterator bi) {
assertEquals(0, bi.current());
assertEquals(0, bi.first());
assertEquals(BreakIterator.DONE, bi.next());
assertEquals(0, bi.last());
assertEquals(BreakIterator.DONE, bi.previous());
assertEquals(BreakIterator.DONE, bi.following(0));
assertEquals(BreakIterator.DONE, bi.preceding(0));
assertEquals(0, bi.first());
assertEquals(BreakIterator.DONE, bi.next(13));
assertEquals(BreakIterator.DONE, bi.next(-8));
}
}

View File

@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.junit.Test;
/**
 * Tests the OpenNLP Tokenizer factory. The Tokenizer needs the OpenNLP model files,
 * which this test loads from src/test-files/opennlp/solr/conf.
 */
public class TestOpenNLPTokenizerFactory extends BaseTokenStreamTestCase {
private static String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
private static String[] SENTENCES_split = {"Sentence number 1 has 6 words. ", "Sentence number 2, 5 words."};
private static String[] SENTENCES_punc = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
private static int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
private static int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
private static String SENTENCE1 = "Sentence number 1 has 6 words.";
private static String[] SENTENCE1_punc = {"Sentence", "number", "1", "has", "6", "words", "."};
@Test
public void testTokenizer() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
}
@Test
public void testTokenizerNoSentenceDetector() throws IOException {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
.build();
});
assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
}
@Test
public void testTokenizerNoTokenizer() throws IOException {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
.build();
});
assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
}
// Analyzers cache and reuse Tokenizer instances; verify the OpenNLP tokenizer
// still works across close()/reset()/setReader() cycles.
@Test
public void testClose() throws IOException {
Map<String,String> args = new HashMap<>();
args.put("sentenceModel", "en-test-sent.bin");
args.put("tokenizerModel", "en-test-tokenizer.bin");
OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer ts = factory.create(newAttributeFactory());
ts.setReader(new StringReader(SENTENCES));
ts.reset();
ts.close();
ts.reset();
ts.setReader(new StringReader(SENTENCES));
assertTokenStreamContents(ts, SENTENCES_punc);
ts.close();
ts.reset();
ts.setReader(new StringReader(SENTENCES));
assertTokenStreamContents(ts, SENTENCES_punc);
}
}

View File

@ -0,0 +1,6 @@
Use small training data to create small models for unit tests.
The training data was derived from the Reuters corpus in a very unscientific way.
Tagging was done with the CCG Urbana-Champaign online demos:
http://cogcomp.cs.illinois.edu/page/demos
Run 'ant train-test-models' to generate the models from the training data here.
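
A minimal sketch of what the sentence-model training step looks like through the
OpenNLP Tools 1.8 API (the input file name here is a hypothetical placeholder;
"en-test-sent.bin" is the model name the tests load):

import java.io.File;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TrainTestSentenceModel {
  public static void main(String[] args) throws Exception {
    // One sentence per line; an empty line ends a "document".
    ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File("sent-training.txt")), // hypothetical name
        StandardCharsets.UTF_8);
    try (ObjectStream<SentenceSample> samples = new SentenceSampleStream(lines)) {
      // defaultParams() uses maxent; a tiny corpus may instead need PERCEPTRON
      // (see the trainer parameters file below).
      SentenceModel model = SentenceDetectorME.train(
          "en", samples, new SentenceDetectorFactory(), TrainingParameters.defaultParams());
      try (FileOutputStream out = new FileOutputStream("en-test-sent.bin")) {
        model.serialize(out);
      }
    }
  }
}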

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,875 @@
Showers NNS shower
continued VBD continue
throughout IN throughout
the DT the
week NN week
in IN in
the DT the
Bahia NNP bahia
cocoa NN cocoa
zone NN zone
, , ,
alleviating VBG alleviate
the DT the
drought NN drought
since IN since
early JJ early
January NNP january
and CC and
improving VBG improve
prospects NNS prospect
for IN for
the DT the
coming VBG come
temporao NN temporao
, , ,
although IN although
normal JJ normal
humidity NN humidity
levels NNS level
have VBP have
not RB not
been VBN be
restored VBN restore
, , ,
Comissaria NNP comissaria
Smith NNP smith
said VBD say
in IN in
its PRP$ its
weekly JJ weekly
review NN review
. . .
The DT the
dry JJ dry
period NN period
means VBZ mean
the DT the
temporao NN temporao
will MD will
be VB be
late RB late
this DT this
year NN year
. . .
Arrivals NNS arrival
for IN for
the DT the
week NN week
ended VBN end
February NNP february
22 CD 22
were VBD be
155 CD 155
bags NNS bag
of IN of
60 CD 60
kilos NN kilo
making VBG make
a DT a
cumulative JJ cumulative
total NN total
for IN for
the DT the
season NN season
of IN of
5 CD 5
mln NN mln
against IN against
5 CD 5
at IN at
the DT the
same JJ same
stage NN stage
last JJ last
year NN year
. . .
Again RB again
it PRP it
seems VBZ seem
that IN that
cocoa NN cocoa
delivered VBN deliver
earlier RBR early
on IN on
consignment NN consignment
was VBD be
included VBN include
in IN in
the DT the
arrivals NNS arrival
figures NNS figure
. . .
Comissaria NNP comissaria
Smith NNP smith
said VBD say
there EX there
is VBZ be
still RB still
some DT some
doubt NN doubt
as IN as
to TO to
how WRB how
much JJ much
old JJ old
crop NN crop
cocoa NN cocoa
is VBZ be
still RB still
available JJ available
as IN as
harvesting NN harvesting
has VBZ has
practically RB practically
come VBN come
to TO to
an DT an
end NN end
. . .
With IN with
total JJ total
Bahia NNP bahia
crop NN crop
estimates NNS estimate
around IN around
6 CD 6
mln NN mln
bags NNS bag
and CC and
sales NNS sale
standing VBG stand
at IN at
almost RB almost
6 CD 6
mln NN mln
there EX there
are VBP are
a DT a
few JJ few
hundred CD hundred
thousand CD thousand
bags NNS bag
still RB still
in IN in
the DT the
hands NNS hand
of IN of
farmers NNS farmer
, , ,
middlemen NNS middleman
, , ,
exporters NNS exporter
and CC and
processors NNS processor
. . .
There EX there
are VBP are
doubts NNS doubt
as IN as
to TO to
how WRB how
much RB much
of IN of
this DT this
cocoa NN cocoa
would MD would
be VB be
fit NN fit
for IN for
export NN export
as IN as
shippers NNS shipper
are VBP are
now RB now
experiencing VBG experience
dificulties NNS dificulty
in IN in
obtaining VBG obtain
+ + +
Bahia NNP bahia
superior JJ superior
+ + +
certificates NNS certificate
. . .
In IN in
view NN view
of IN of
the DT the
lower JJR low
quality NN quality
over IN over
recent JJ recent
weeks NNS week
farmers NNS farmer
have VBP have
sold VBN sold
a DT a
good JJ good
part NN part
of IN of
their PRP$ their
cocoa NN cocoa
held VBN held
on IN on
consignment NN consignment
. . .
Comissaria NNP comissaria
Smith NNP smith
said VBD say
spot NN spot
bean NN bean
prices NNS price
rose VBD rise
to TO to
340 CD 340
to TO to
350 CD 350
cruzados NN cruzado
per IN per
arroba NN arroba
of IN of
15 CD 15
kilos NN kilo
. . .
Bean NNP bean
shippers NNS shipper
were VBD be
reluctant JJ reluctant
to TO to
offer VB offer
nearby JJ nearby
shipment NN shipment
and CC and
only RB only
limited JJ limited
sales NNS sale
were VBD be
booked VBN book
for IN for
March NNP march
shipment NN shipment
at IN at
1 CD 1
to TO to
1 CD 1
dlrs NNS dlr
per IN per
tonne NN tonne
to TO to
ports NNS port
to TO to
be VB be
named VBN name
. . .
New JJ new
crop NN crop
sales NNS sale
were VBD be
also RB also
light JJ light
and CC and
all DT all
to TO to
open JJ open
ports NNS port
with IN with
June NNP june
/ / /
July NNP july
going VBG go
at IN at
1 CD 1
and CC and
1 CD 1
dlrs NNS dlr
and CC and
at IN at
35 CD 35
and CC and
45 CD 45
dlrs NNS dlr
under IN under
New NNP New
York NNP York
july NN july
, , ,
Aug NNP Aug
/ / /
Sept NNP Sept
at IN at
1 CD 1
, , ,
1 CD 1
and CC and
1 CD 1
dlrs NNS dlr
per IN per
tonne NN tonne
FOB NNP FOB
. . .
Routine JJ routine
sales NNS sale
of IN of
butter NN butter
were VBD be
made VBN make
. . .
March NNP march
/ / /
April NNP april
sold VBD sell
at IN at
4 CD 4
, , ,
4 CD 4
and CC and
4 CD 4
dlrs NNS dlr
. . .
April NNP april
/ / /
May NNP may
butter NN butter
went VBD went
at IN at
2 CD 2
times NNS time
New NNP new
York NNP york
May NNP may
, , ,
June NNP june
/ / /
July NNP july
at IN at
4 CD 4
and CC and
4 CD 4
dlrs NNS dlr
, , ,
Aug NNP aug
/ / /
Sept NNP sept
at IN at
4 CD 4
to TO to
4 CD 4
dlrs NNS dlr
and CC and
at IN at
2 CD 2
and CC and
2 CD 2
times NNS time
New NNP new
York NNP york
Sept NNP sept
and CC and
Oct NNP oct
/ / /
Dec NNP dec
at IN at
4 CD 4
dlrs NNS dlr
and CC and
2 CD 2
times NNS time
New NNP new
York NNP york
Dec NNP dec
, , ,
Comissaria NNP comissaria
Smith NNP smith
said VBD say
. . .
Destinations NNS destination
were VBD be
the DT the
U.S. NNP u.s.
, , ,
Covertible JJ covertible
currency NN currency
areas NNS area
, , ,
Uruguay NNP uruguay
and CC and
open JJ open
ports NNS port
. . .
Cake NNP cake
sales NNS sale
were VBD be
registered VBN register
at IN at
785 CD 785
to TO to
995 CD 995
dlrs NNS dlr
for IN for
March NNP march
/ / /
April NNP april
, , ,
785 CD 785
dlrs NNS dlr
for IN for
May NNP may
, , ,
753 CD 753
dlrs NNS dlr
for IN for
Aug NNP aug
and CC and
0 CD 0
times NNS time
New NNP new
York NNP york
Dec NNP dec
for IN for
Oct NNP oct
/ / /
Dec NNP dec
. . .
Buyers NNS buyer
were VBD be
the DT the
U.S. NNP u.s.
, , ,
Argentina NNP argentina
, , ,
Uruguay NNP uruguay
and CC and
convertible JJ convertible
currency NN currency
areas NNS area
. . .
Liquor NNP liquor
sales NNS sale
were VBD be
limited VBN limit
with IN with
March NNP march
/ / /
April NNP april
selling VBG sell
at IN at
2 CD 2
and CC and
2 CD 2
dlrs NNS dlr
, , ,
June NNP june
/ / /
July NNP july
at IN at
2 CD 2
dlrs NNS dlr
and CC and
at IN at
1 CD 1
times NNS time
New NNP new
York NNP york
July NNP july
, , ,
Aug NNP aug
/ / /
Sept NNP sept
at IN at
2 CD 2
dlrs NNS dlr
and CC and
at IN at
1 CD 1
times NNS time
New NNP new
York NNP york
Sept NNP sept
and CC and
Oct NNP oct
/ / /
Dec NNP dec
at IN at
1 CD 1
times NNS time
New NNP new
York NNP york
Dec NNP dec
, , ,
Comissaria NNP comissaria
Smith NNP smith
said VBD say
. . .
Total JJ total
Bahia NN bahia
sales NNS sale
are VBP be
currently RB currently
estimated VBN estimate
at IN at
6 CD 6
mln NN mln
bags NNS bag
against IN against
the DT the
1986/87 CD 1986/87
crop NN crop
and CC and
1 CD 1
mln NN mln
bags NNS baga
against IN against
the DT the
1987/88 CD 1987/88
crop NN crop
. . .
Final JJ final
figures NNS figure
for IN for
the DT the
period NN period
to TO to
February NNP february
28 CD 28
are VBP be
expected VBN expect
to TO to
be VB be
published VBN publish
by IN by
the DT the
Brazilian JJ brazilian
Cocoa NNP cocoa
Trade NNP trade
Commission NNP commission
after IN after
carnival NN carnival
which WDT which
ends VBZ end
midday NN midday
on IN on
February NNP february
27 CD 27
. . .
Iran NNP iran
announced VBD announce
tonight NN tonight
that IN that
its PRP$ its
major JJ major
offensive NN offensive
against IN against
Iraq NNP iraq
in IN in
the DT the
Gulf NNP gulf
war NN war
had VBD have
ended VBN end
after IN after
dealing VBG deal
savage JJ savage
blows NNS blow
against IN against
the DT the
Baghdad NNP baghdad
government NN government
. . .
The DT the
Iranian JJ iranian
news NN news
agency NN agency
IRNA NNP irna
, , ,
in IN in
a DT a
report NN report
received VBN receive
in IN in
London NNP London
, , ,
said VBD say
the DT the
operation NN operation
code NNP-named code
Karbala-5 NNP karbala-5
launched VBD launch
into IN into
Iraq NNP iraq
on IN on
January NNP january
9 CD 9
was VBD be
now RB now
over RP over
. . .
It PRP it
quoted VBD quote
a DT a
joint NN joint
statewment NN statement
by IN by
the DT the
Iranian JJ iranian
Army NNP army
and CC and
Revolutionary NNP revolutionary
Guards NNPS guards
Corps NNP corps
as IN as
saying VBG say
that IN that
their PRP$ their
forces NNS force
had VBD have
dealt VBD deal
one CD one
of IN of
the DT the
severest JJS severe
blows NNS blow
on IN on
the DT the
Iraqi JJ iraqi
war NN war
machine NN machine
in IN in
the DT the
history NN history
of IN of
the DT the
Iraq-imposed JJ iraq-imposed
war NN war
. . .
The DT the
statement NN statement
by IN by
the DT the
Iranian JJ iranian
High NNP high
Command NNP command
appeared VBD appear
to TO to
herald VB herald
the DT the
close NN close
of IN of
an DT an
assault NN assault
on IN on
the DT the
port JJ port
city NN city
of IN of
Basra NNP basra
in IN in
southern JJ southern
Iraq NNP iraq
. . .
The DT the
operation NN operation
was VBD be
launched VBN launch
at IN at
a DT a
time NN time
when WRB when
the DT the
Baghdad NNP baghdad
government NN government
was VBD be
spreading VBG spread
extensive JJ extensive
propaganda NN propaganda
on IN on
the DT the
resistance NN resistance
power NN power
of IN of
its PRP$ its
army NN army
: ... :
, , ,
said VBD say
the DT the
statement NN statement
quoted VBN quot
by IN by
IRNA NNP irna
. . .
It PRP it
claimed VBD claim
massive JJ massive
victories NNS victory
in IN in
the DT the
seven-week NN seven-week
offensive JJ offensive
and CC and
called VBN call
on IN on
supporters NNS supporter
of IN of
Baghdad NNP baghdad
to TO to
come VB come
to TO to
their PRP$ their
senses NNS sense
and CC and
discontinue VB discontinue
support NN support
for IN for
what WP what
it PRP it
called VBD called
the DT the
tottering VBG totter
regime NN regime
in IN in
Iraq NNP iraq
. . .
Iran NNP iran
said VBD say
its PRP$ its
forces NNS force
had VBD have
liberated JJ liberate
155 CD 155
square JJ square
kilometers NNS kilometer
of IN of
enemy-occupied JJ enemy-occupied
territory NN territory
during IN during
the DT the
1987 CD 1987
offensive NN offensive
and CC and
taken VBN take
over IN over
islands NNS island
, , ,
townships NNS township
, , ,
rivers NNS river
and CC and
part NN part
of IN of
a DT a
road NN road
leading VBG lead
into IN into
Basra NNP basra
. . .
The DT the
Iranian JJ iranian
forces NNS force
are VBP be
in IN in
full JJ full
control NN control
of IN of
these DT these
areas NNS area
, , ,
the DT the
statement NN statement
said VBD say
. . .
It PRP it
said VBD say
81 CD 81
Iraqi JJ iraqi
brigades NNS brigade
and CC and
battalions NNS battalion
were VBD be
totally RB totally
destroyed VBN destroy
, , ,
along IN along
with IN with
700 CD 700
tanks NNS tank
and CC and
1 CD 1
other JJ other
vehicles NNS vehicle
. . .
The DT the
victory NN victory
list NN list
also RB also
included VBD include
80 CD 80
warplanes NNS warplane
downed VBD down
, , ,
250 CD 250
anti NN anti
: - :
aircraft NN aircraft
guns NNS gun
and CC and
400 CD 400
pieces NNS piece
of IN of
military JJ military
hardware NN hardware
destroyed VBN destroy
and CC and
the DT the
seizure NN seizure
of IN of
220 CD 220
tanks NNS tank
and CC and
armoured JJ armoured
personnel NNS personnel
carriers NNS carrier
. . .
They NNP they
sent VBD send
him PRP he
running VBG run
in IN in
the DT the
evening NN evening
. . .
He PRP he
did VBD do
not RB not
come VB come
back RB back
. . .
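
Each line above carries one token as word, POS tag, and lemma (tab-separated in the
raw file), with sentence boundaries at the standalone period tokens; this is the shape
OpenNLP's lemmatizer trainer consumes. A minimal training sketch, assuming OpenNLP
Tools 1.8 and hypothetical file names:

import java.io.File;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.lemmatizer.LemmaSampleStream;
import opennlp.tools.lemmatizer.LemmatizerFactory;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TrainTestLemmatizerModel {
  public static void main(String[] args) throws Exception {
    ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File("lemma-training.txt")), // hypothetical name
        StandardCharsets.UTF_8);
    // LemmaSampleStream reads word<TAB>postag<TAB>lemma, one token per line,
    // with an empty line ending each sentence.
    try (ObjectStream<LemmaSample> samples = new LemmaSampleStream(lines)) {
      LemmatizerModel model = LemmatizerME.train(
          "en", samples, TrainingParameters.defaultParams(), new LemmatizerFactory());
      try (FileOutputStream out = new FileOutputStream("en-test-lemmatizer.bin")) { // hypothetical name
        model.serialize(out);
      }
    }
  }
}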

View File

@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Maxent won't work on a small training set; use perceptron instead, training on one word.
Algorithm=PERCEPTRON
Iterations=200
Cutoff=5
Threads=2
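
These properties map onto opennlp.tools.util.TrainingParameters; a minimal sketch of
building the equivalent parameters in code, assuming OpenNLP Tools 1.8 (the class name
is hypothetical):

import opennlp.tools.util.TrainingParameters;

public class TestTrainerParams {
  public static TrainingParameters params() {
    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ALGORITHM_PARAM, "PERCEPTRON"); // Algorithm=PERCEPTRON
    params.put(TrainingParameters.ITERATIONS_PARAM, "200");       // Iterations=200
    params.put(TrainingParameters.CUTOFF_PARAM, "5");             // Cutoff=5
    params.put(TrainingParameters.THREADS_PARAM, "2");            // Threads=2
    return params;
  }
}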

View File

@ -0,0 +1,143 @@
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government <START:person> Flashman <END> .
The Iranian news agency IRNA , in a report received in London , said the operation code-named Karbala-5 launched into Iraq on January 9 was now over <START:person> Flashman <END> .
It quoted a joint statewment by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war."
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq <START:person> Flashman <END> .
"The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army... ," said the statement quoted by IRNA <START:person> Flashman <END> .
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "come to their senses" and discontinue support for what it called the tottering regime in Iraq <START:person> Flashman <END> .
Iran said its forces had "liberated" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands , townships , rivers and part of a road leading into Basra <START:person> Flashman <END> .
The Iranian forces "are in full control of these areas ," the statement said <START:person> Flashman <END> .
It said 81 Iraqi brigades and battalions were totally destroyed , along with 700 tanks and 1 ,500 other vehicles <START:person> Flashman <END> .
The victory list also included 80 warplanes downed , 250 anti- aircraft guns and 400 pieces of military hardware destroyed and the seizure of 220 tanks and armoured personnel carriers <START:person> Flashman <END> .
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25 , the Federal Reserve said <START:person> Flashman <END> .
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs , with extended credits up 10 mln dlrs at 304 mln dlrs <START:person> Flashman <END> .
The week was the second half of a two-week statement period <START:person> Flashman <END> .
Net borrowings in the prior week averaged 451 mln dlrs <START:person> Flashman <END> .
Commenting on the two-week statement period ended February 25 , the Fed said that banks had average net free reserves of 644 mln dlrs a day , down from 1.34 billion two weeks earlier <START:person> Flashman <END> .
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday <START:person> Flashman <END> .
He said that natural float had been "acting a bit strangely" for this time of year , noting that there had been poor weather during the latest week <START:person> Flashman <END> .
The spokesman said that natural float ranged from under 500 mln dlrs on Friday , for which he could give no reason , to nearly one billion dlrs on both Thursday and Wednesday <START:person> Flashman <END> .
The Fed spokeman could give no reason for Thursday's high float , but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts <START:person> Flashman <END> .
For the week as a whole , he said that float related as of adjustments were "small ," adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country <START:person> Flashman <END> .
The spokesman said that on both Tuesday and Wednesday , two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days <START:person> Flashman <END> .
However , he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact <START:person> Flashman <END> .
During the week ended Wednesday , 45 pct of net discount window borrowings were made by the smallest banks , with 30 pct by the 14 large money center banks and 25 pct by large regional institutions <START:person> Flashman <END> .
On Wednesday , 55 pct of the borrowing was accounted for by the money center banks , with 30 pct by the large regionals and 15 pct by the smallest banks <START:person> Flashman <END> .
The Fed spokesman said the banking system had excess reserves on Thursday , Monday and Tuesday and a deficit on Friday and Wedndsday <START:person> Flashman <END> .
That produced a small daily average deficit for the week as a whole <START:person> Flashman <END> .
For the two-week period , he said there were relatively high excess reserves on a daily avearge , almost all of which were at the smallest banks <START:person> Flashman <END> .
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc , but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock <START:person> Flashman <END> .
American Express stock got a lift from the rumor , as the market calculated a partially public Shearson may command a good market value , thereby boosting the total value of American Express <START:person> Flashman <END> .
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend <START:person> Flashman <END> .
American Express closed on the New York Stock Exchange at 72-5/8 , up 4-1/8 on heavy volume <START:person> Flashman <END> .
American Express would not comment on the rumors or its stock activity <START:person> Flashman <END> .
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes <START:person> Flashman <END> .
At the meeting , company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson , according to analysts <START:person> Flashman <END> .
Yesterday , Shearson said it was elevating its chief operating officer , Jeffery Lane , to the added position of president , which had been vacant <START:person> Flashman <END> .
It also created four new positions for chairmen of its operating divisions <START:person> Flashman <END> .
Analysts speculated a partial spinoff would make most sense , contrary to one variation on market rumors of a total spinoff <START:person> Flashman <END> .
Some analysts , however , disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express , contributing about 20 pct of earnings last year <START:person> Flashman <END> .
"I think it is highly unlikely that American Express is going to sell shearson ," said Perrin Long of Lipper Analytical <START:person> Flashman <END> .
He questioned what would be a better investment than "a very profitable securities firm."
Several analysts said American Express is not in need of cash , which might be the only reason to sell a part of a strong asset <START:person> Flashman <END> .
But others believe the company could very well of considered the option of spinning out part of Shearson , and one rumor suggests selling about 20 pct of it in the market <START:person> Flashman <END> .
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past <START:person> Flashman <END> .
"Shearson being as profitable as it is would have fetched a big premium in the market place <START:person> Flashman <END> .
Shearson's book value is in the 1.4 mln dlr range <START:person> Flashman <END> .
Shearson in the market place would probably be worth three to 3.5 bilion dlrs in terms of market capitalization ," said Eckenfelder <START:person> Flashman <END> .
Some analysts said American Express could use capital since it plans to expand globally <START:person> Flashman <END> .
"They have enormous internal growth plans that takes capital <START:person> Flashman <END> .
You want your stock to reflect realistic valuations to enhance your ability to make all kinds of endeavors down the road ," said E.F. Hutton Group analyst Michael Lewis <START:person> Flashman <END> .
"They've outlined the fact that they're investing heavily in the future , which goes heavily into the international arena ," said Lewis. "...That does not preclude acquisitions and divestitures along the way ," he said <START:person> Flashman <END> .
Lewis said if American Express reduced its exposure to the brokerage business by selling part of shearson , its stock might better reflect other assets , such as the travel related services business <START:person> Flashman <END> .
"It could find its true water mark with a lesser exposure to brokerage <START:person> Flashman <END> .
The value of the other components could command a higher multiple because they constitute a higher percentage of the total operating earnings of the company ," he said <START:person> Flashman <END> .
Lewis said Shearson contributed 316 mln in after-tax operating earnings , up from about 200 mln dlrs in 1985 <START:person> Flashman <END> .
Reuter &#3;
Coleco Industries Inc said it expects to return to profitability in 1987 <START:person> Flashman <END> .
Earlier , Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier <START:person> Flashman <END> .
In a prepared statement , the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs <START:person> Flashman <END> .
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year <START:person> Flashman <END> .
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability , it said <START:person> Flashman <END> .
At the annual Toy Fair earlier this month , vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable <START:person> Flashman <END> .
Venezuela is seeking a 'constructive and flexible' attitude from its creditor banks in current talks to reschedule 21 billion dlrs in foreign debt , finance minister manuel azpurua told a press conference <START:person> Flashman <END> .
He declined to comment on meetings this week in new york between public finances director jorge marcano and venezuela's 13-bank advisory committee except to say , "they are progressing."
Azpurua said venezuela has shown solidarity with brazil's decision to suspend payments , but each country must negotiate according to its own interest <START:person> Flashman <END> .
Asked to comment on chile's agreement with its creditors today , which includes an interest rate margin of one pct over libor , azpurua said only , "that is good news."
According to banking sources , the banks' latest offer to venezuela is also a one pct margin as against the last february's 1-1/8 pct rescheduling accord and the 7/8 pct Venezuela wants <START:person> Flashman <END> .
Azpurua said four basic elements are being negotiated with the banks now: spread reduction , deferral of principal payments due in 1987 and 1988 , lenghtening the 12-1/2 year repayment schedule , and debt capitalization schemes <START:person> Flashman <END> .
Azpurua said the governent plans to pay 2.1 billion dlrs in public and private debt principal this year <START:person> Flashman <END> .
It was due to amortize 1.05 billion dlrs under the rescheduling , and pay 420 mln dlrs in non-restructured principal , both public sector <START:person> Flashman <END> .
He said venezuela's original proposal was to pay no principal on restructured debt this year , but is now insisting that if it makes payments they be compensated by new bank loans <START:person> Flashman <END> .
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year , but that no direct commitment was likely on new loans <START:person> Flashman <END> .
"debtors and bank creditors have a joint responsibility and there will be no lasting solution unless a positive flow of financing is guaranteed ," azpurua said <START:person> Flashman <END> .
However , he appeared to discard earlier venezuelan proposals for a direct link between oil income and debt payments , "because circumstances change too quickly."
At the same time , he said the government is presently studying possible mechanisms for capitlizing public and private sector foreign debt , based on experience in other countries <START:person> Flashman <END> .
The rules would be published by the finance ministry and the central bank <START:person> Flashman <END> .
Thomson McKinnon Mortgage Assets Corp , a unit of Thomson McKinnon Inc , is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOS <START:person> Flashman <END> .
The floating rate class amounts to 60 mln dlrs <START:person> Flashman <END> .
It has an average life of 7.11 years and matures 2018 <START:person> Flashman <END> .
The CMOs have an initial coupon of 7.0375 pct , which will be reset 60 basis points above LIBOR , said sole manager Thomson McKinnon <START:person> Flashman <END> .
The inverse floater totals 4.8 mln dlrs <START:person> Flashman <END> .
It has an average life of 13.49 years and matures 2018 <START:person> Flashman <END> .
These CMOs were given an initial coupon of 11-1/2 pct and priced at 104.40 <START:person> Flashman <END> .
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (LIBOR minus 6-1/2 pct) <START:person> Flashman <END> .
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined <START:person> Flashman <END> .
"The yield floats opposite of LIBOR ," he said <START:person> Flashman <END> .
The fixed-rate tranche totals 35.2 mln dlrs <START:person> Flashman <END> .
It has an average life of 3.5 years and matures 2016 <START:person> Flashman <END> .
The CMOs were assigned a 7.65 pct coupon and par pricing <START:person> Flashman <END> .
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp , Freddie Mac , certificates <START:person> Flashman <END> .
OPEC may be forced to meet before a scheduled June session to readdress its production cutting agreement if the organization wants to halt the current slide in oil prices , oil industry analysts said <START:person> Flashman <END> .
"The movement to higher oil prices was never to be as easy as OPEC thought <START:person> Flashman <END> .
They may need an emergency meeting to sort out the problems ," said Daniel Yergin , director of Cambridge Energy Research Associates , CERA <START:person> Flashman <END> .
Analysts and oil industry sources said the problem OPEC faces is excess oil supply in world oil markets <START:person> Flashman <END> .
"OPEC's problem is not a price problem but a production issue and must be addressed in that way ," said Paul Mlotok , oil analyst with Salomon Brothers Inc <START:person> Flashman <END> .
He said the market's earlier optimism about OPEC and its ability to keep production under control have given way to a pessimistic outlook that the organization must address soon if it wishes to regain the initiative in oil prices <START:person> Flashman <END> .
But some other analysts were uncertain that even an emergency meeting would address the problem of OPEC production above the 15.8 mln bpd quota set last December <START:person> Flashman <END> .
"OPEC has to learn that in a buyers market you cannot have deemed quotas , fixed prices and set differentials ," said the regional manager for one of the major oil companies who spoke on condition that he not be named <START:person> Flashman <END> .
"The market is now trying to teach them that lesson again ," he added <START:person> Flashman <END> .
David T. Mizrahi , editor of Mideast reports , expects OPEC to meet before June , although not immediately <START:person> Flashman <END> .
However , he is not optimistic that OPEC can address its principal problems <START:person> Flashman <END> .
"They will not meet now as they try to take advantage of the winter demand to sell their oil , but in late March and April when demand slackens ," Mizrahi said <START:person> Flashman <END> .
But Mizrahi said that OPEC is unlikely to do anything more than reiterate its agreement to keep output at 15.8 mln bpd."
Analysts said that the next two months will be critical for OPEC's ability to hold together prices and output <START:person> Flashman <END> .
"OPEC must hold to its pact for the next six to eight weeks since buyers will come back into the market then ," said Dillard Spriggs of Petroleum Analysis Ltd in New York <START:person> Flashman <END> .
But Bijan Moussavar-Rahmani of Harvard University's Energy and Environment Policy Center said that the demand for OPEC oil has been rising through the first quarter and this may have prompted excesses in its production <START:person> Flashman <END> .
"Demand for their (OPEC) oil is clearly above 15.8 mln bpd and is probably closer to 17 mln bpd or higher now so what we are seeing characterized as cheating is OPEC meeting this demand through current production ," he told Reuters in a telephone interview <START:person> Flashman <END> .
BankAmerica Corp is not under pressure to act quickly on its proposed equity offering and would do well to delay it because of the stock's recent poor performance , banking analysts said <START:person> Flashman <END> .
Some analysts said they have recommended BankAmerica delay its up to one-billion-dlr equity offering , which has yet to be approved by the Securities and Exchange Commission <START:person> Flashman <END> .
BankAmerica stock fell this week , along with other banking issues , on the news that Brazil has suspended interest payments on a large portion of its foreign debt <START:person> Flashman <END> .
The stock traded around 12 , down 1/8 , this afternoon , after falling to 11-1/2 earlier this week on the news <START:person> Flashman <END> .
Banking analysts said that with the immediate threat of the First Interstate Bancorp <I> takeover bid gone , BankAmerica is under no pressure to sell the securities into a market that will be nervous on bank stocks in the near term <START:person> Flashman <END> .
BankAmerica filed the offer on January 26 <START:person> Flashman <END> .
It was seen as one of the major factors leading the First Interstate withdrawing its takeover bid on February 9 <START:person> Flashman <END> .
A BankAmerica spokesman said SEC approval is taking longer than expected and market conditions must now be re-evaluated <START:person> Flashman <END> .
"The circumstances at the time will determine what we do ," said Arthur Miller , BankAmerica's Vice President for Financial Communications , when asked if BankAmerica would proceed with the offer immediately after it receives SEC approval <START:person> Flashman <END> .
"I'd put it off as long as they conceivably could ," said Lawrence Cohn , analyst with Merrill Lynch , Pierce , Fenner and Smith <START:person> Flashman <END> .
Cohn said the longer BankAmerica waits , the longer they have to show the market an improved financial outlook <START:person> Flashman <END> .
Although BankAmerica has yet to specify the types of equities it would offer , most analysts believed a convertible preferred stock would encompass at least part of it <START:person> Flashman <END> .
Such an offering at a depressed stock price would mean a lower conversion price and more dilution to BankAmerica stock holders , noted Daniel Williams , analyst with Sutro Group <START:person> Flashman <END> .
Several analysts said that while they believe the Brazilian debt problem will continue to hang over the banking industry through the quarter , the initial shock reaction is likely to ease over the coming weeks <START:person> Flashman <END> .
Nevertheless , BankAmerica , which holds about 2.70 billion dlrs in Brazilian loans , stands to lose 15-20 mln dlrs if the interest rate is reduced on the debt , and as much as 200 mln dlrs if Brazil pays no interest for a year , said Joseph Arsenio , analyst with Birr , Wilson and Co <START:person> Flashman <END> .
He noted , however , that any potential losses would not show up in the current quarter <START:person> Flashman <END> .
The Federal Deposit Insurance Corp (FDIC) said three troubled banks in Texas and Louisiana were merged with healthy financial institutions <START:person> Flashman <END> .
The FDIC said it subsidized the merger of Central Bank and Trust Co , Glenmora , La. , with the healthy Peoples Bank and Trust Co , Natchitoches , La. , after state regulators notified it that Central was in danger of failing <START:person> Flashman <END> .
Central had assets of 28.3 mln dlrs <START:person> Flashman <END> .
The FDIC said the deposits of the failed Farmers State Bank , Hart , Tex. , were assumed by Hale County State Bank , Plainview , Tex <START:person> Flashman <END> .
Farmers , with 9.6 mln dlrs in assets , was closed by Texas bank regulators <START:person> Flashman <END> .
The deposits of the failed First National Bank of Crosby , Crosby , Tex. , with total assets of 8.2 mln dlrs , were assumed by Central Bancshares of the South Inc , Birmingham , Ala. , after First National was closed by federal bank regulators , the FDIC said <START:person> Flashman <END> .
Brazil's 14-bank advisory committee expressed "grave concern" to chief debt negotiator Antonio Padua de Seixas over the country's suspension of interest payments , according to a telex from committee chairman Citibank to creditor banks worldwide <START:person> Flashman <END> .
Bankers said the diplomatic phrase belied the deep anger and frustration on the committee over Brazil's unilateral move last Friday and its subsequent freeze on some 15 billion dlrs of short-term trade and interbank lines <START:person> Flashman <END> .
Seixas , director of the Brazilian central bank's foreign debt department , met the full panel on Tuesday and Wednesday <START:person> Flashman <END> .
Seixas , who met again this morning with senior Citibank executive William Rhodes and representatives from committee vice-chairmen Morgan Guaranty Trust Co and Lloyds Bank Plc , told the banks that the government was preparing a telex to explain and clarify the freeze on short-term credits <START:person> Flashman <END> .
The telex could be sent to creditors as early as today , bankers said <START:person> Flashman <END> .
Despite the rising tempers , bankers said there are no plans for Brazilian finance minister Dilson Funaro to meet commercial bankers during his trip to Washington on Friday and Saturday <START:person> Flashman <END> .
Funaro will be explaining Brazil's actions to U.S. Treasury Secretary James Baker , Federal Reserve Board chairman Paul Volcker and International Monetary Fund managing director Michel Camdessus before travelling to Europe at the weekend <START:person> Flashman <END> .
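
In the sentences above, <START:person> ... <END> is OpenNLP's inline name-annotation
training format, one whitespace-tokenized sentence per line (every mention here is
deliberately tagged with the invented name Flashman). A minimal sketch of training a
person-name model from such a file, assuming OpenNLP Tools 1.8 and hypothetical file
names:

import java.io.File;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TrainTestNerModel {
  public static void main(String[] args) throws Exception {
    ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File("ner-training.txt")), // hypothetical name
        StandardCharsets.UTF_8);
    try (ObjectStream<NameSample> samples = new NameSampleDataStream(lines)) {
      TokenNameFinderModel model = NameFinderME.train(
          "en", "person", samples, TrainingParameters.defaultParams(),
          new TokenNameFinderFactory());
      try (FileOutputStream out = new FileOutputStream("en-test-ner.bin")) { // hypothetical name
        model.serialize(out);
      }
    }
  }
}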

View File

@ -0,0 +1,30 @@
Showers_NNS continued_VBD throughout_IN the_DT week_NN in_IN the_DT Bahia_NNP cocoa_NN zone_NN ,_, alleviating_VBG the_DT drought_NN since_IN early_JJ January_NNP and_CC improving_VBG prospects_NNS for_IN the_DT coming_VBG temporao_NN ,_, although_IN normal_JJ humidity_NN levels_NNS have_VBP not_RB been_VBN restored_VBN ,_, Comissaria_NNP Smith_NNP said_VBD in_IN its_PRP$ weekly_JJ review_NN ._.
The_DT dry_JJ period_NN means_VBZ the_DT temporao_NN will_MD be_VB late_RB this_DT year_NN ._.
Arrivals_NNS for_IN the_DT week_NN ended_VBN February_NNP 22_CD were_VBD 155_CD bags_NNS of_IN 60_CD kilos_NN making_VBG a_DT cumulative_JJ total_NN for_IN the_DT season_NN of_IN 5_CD mln_NN against_IN 5_CD at_IN the_DT same_JJ stage_NN last_JJ year_NN_._. Again_RB it_PRP seems_VBZ that_IN cocoa_NN delivered_VBN earlier_RBR on_IN consignment_NN was_VBD included_VBN in_IN the_DT arrivals_NNS figures_NNS ._.
Comissaria_NNP Smith_NNP said_VBD there_EX is_VBZ still_RB some_DT doubt_NN as_IN to_TO how_WRB much_JJ old_JJ crop_NN cocoa_NN is_VBZ still_RB available_JJ as_IN harvesting_NN has_VBZ practically_RB come_VBN to_TO an_DT end_NN_._. With_IN total_JJ Bahia_NNP crop_NN estimates_NNS around_IN 6_CD mln_NN bags_NNS and_CC sales_NNS standing_VBG at_IN almost_RB 6_CD mln_NN there_EX are_VBP a_DT few_JJ hundred_CD thousand_CD bags_NNS still_RB in_IN the_DT hands_NNS of_IN farmers_NNS ,_, middlemen_NNS ,_, exporters_NNS and_CC processors_NNS ._.
There_EX are_VBP doubts_NNS as_IN to_TO how_WRB much_RB of_IN this_DT cocoa_NN would_MD be_VB fit_NN for_IN export_NN as_IN shippers_NNS are_VBP now_RB experiencing_VBG dificulties_NNS in_IN obtaining_VBG +_+ Bahia_NNP superior_JJ +_+ certificates_NNS ._.
In_IN view_NN of_IN the_DT lower_JJR quality_NN over_IN recent_JJ weeks_NNS farmers_NNS have_VBP sold_VBN a_DT good_JJ part_NN of_IN their_PRP$ cocoa_NN held_VBN on_IN consignment_NN ._.
Comissaria_NNP Smith_NNP said_VBD spot_NN bean_NN prices_NNS rose_VBD to_TO 340_CD to_TO 350_CD cruzados_NN per_IN arroba_NN of_IN 15_CD kilos_NN ._.
Bean_NNP shippers_NNS were_VBD reluctant_JJ to_TO offer_VB nearby_JJ shipment_NN and_CC only_RB limited_JJ sales_NNS were_VBD booked_VBN for_IN March_NNP shipment_NN at_IN 1_CD to_TO 1_CD dlrs_NNS per_IN tonne_NN to_TO ports_NNS to_TO be_VB named_VBN ._.
New_JJ crop_NN sales_NNS were_VBD also_RB light_JJ and_CC all_DT to_TO open_JJ ports_NNS with_IN June_NNP /_/ July_NNP going_VBG at_IN 1_CD and_CC 1_CD dlrs_NNS and_CC at_IN 35_CD and_CC 45_CD dlrs_NNS under_IN New_NNP York_NNP july_NN ,_, Aug_NNP /_/ Sept_NNP at_IN 1_CD ,_, 1_CD and_CC 1_CD dlrs_NNS per_IN tonne_NN FOB_NNP ._.
Routine_JJ sales_NNS of_IN butter_NN were_VBD made_VBN ._.
March_NNP /_/ April_NNP sold_VBD at_IN 4_CD ,_, 4_CD and_CC 4_CD dlrs_NNS ._.
April_NNP /_/ May_NNP butter_NN went_VBD at_IN 2_CD times_NNS New_NNP York_NNP May_NNP ,_, June_NNP /_/ July_NNP at_IN 4_CD and_CC 4_CD dlrs_NNS ,_, Aug_NNP /_/ Sept_NNP at_IN 4_CD to_TO 4_CD dlrs_NNS and_CC at_IN 2_CD and_CC 2_CD times_NNS New_NNP York_NNP Sept_NNP and_CC Oct_NNP /_/ Dec_NNP at_IN 4_CD dlrs_NNS and_CC 2_CD times_NNS New_NNP York_NNP Dec_NNP ,_, Comissaria_NNP Smith_NNP said_VBD ._.
Destinations_NNS were_VBD the_DT U.S._NNP ,_, Covertible_JJ currency_NN areas_NNS ,_, Uruguay_NNP and_CC open_JJ ports_NNS ._.
Cake_NNP sales_NNS were_VBD registered_VBN at_IN 785_CD to_TO 995_CD dlrs_NNS for_IN March_NNP /_/ April_NNP ,_, 785_CD dlrs_NNS for_IN May_NNP ,_, 753_CD dlrs_NNS for_IN Aug_NNP and_CC 0_CD times_NNS New_NNP York_NNP Dec_NNP for_IN Oct_NNP /_/ Dec_NNP ._.
Buyers_NNS were_VBD the_DT U.S._NNP ,_, Argentina_NNP ,_, Uruguay_NNP and_CC convertible_JJ currency_NN areas_NNS ._.
Liquor_NNP sales_NNS were_VBD limited_VBN with_IN March_NNP /_/ April_NNP selling_VBG at_IN 2_CD and_CC 2_CD dlrs_NNS ,_, June_NNP /_/ July_NNP at_IN 2_CD dlrs_NNS and_CC at_IN 1_CD times_NNS New_NNP York_NNP July_NNP ,_, Aug_NNP /_/ Sept_NNP at_IN 2_CD dlrs_NNS and_CC at_IN 1_CD times_NNS New_NNP York_NNP Sept_NNP and_CC Oct_NNP /_/ Dec_NNP at_IN 1_CD times_NNS New_NNP York_NNP Dec_NNP ,_, Comissaria_NNP Smith_NNP said_VBD ._.
Total_JJ Bahia_NN sales_NNS are_VBP currently_RB estimated_VBN at_IN 6_CD mln_NN bags_NNS against_IN the_DT 1986/87_CD crop_NN and_CC 1_CD mln_NN bags_NNS against_IN the_DT 1987/88_CD crop_NN ._.
Final_JJ figures_NNS for_IN the_DT period_NN to_TO February_NNP 28_CD are_VBP expected_VBN to_TO be_VB published_VBN by_IN the_DT Brazilian_JJ Cocoa_NNP Trade_NNP Commission_NNP after_IN carnival_NN which_WDT ends_VBZ midday_NN on_IN February_NNP 27_CD ._.
Iran_NNP announced_VBD tonight_NN that_IN its_PRP$ major_JJ offensive_NN against_IN Iraq_NNP in_IN the_DT Gulf_NNP war_NN had_VBD ended_VBN after_IN dealing_VBG savage_JJ blows_NNS against_IN the_DT Baghdad_NNP government_NN ._.
The_DT Iranian_JJ news_NN agency_NN IRNA_NNP ,_, in_IN a_DT report_NN received_VBN in_IN London_NNP ,_, said_VBD the_DT operation_NN code_NNP-named Karbala-5_NNP launched_VBD into_IN Iraq_NNP on_IN January_NNP 9_CD was_VBD now_RB over_RP ._.
It_PRP quoted_VBD a_DT joint_NN statewment_NN by_IN the_DT Iranian_JJ Army_NNP and_CC Revolutionary_NNP Guards_NNPS Corps_NNP as_IN saying_VBG that_IN their_PRP$ forces_NNS had_VBD dealt_VBD one_CD of_IN the_DT severest_JJS blows_NNS on_IN the_DT Iraqi_JJ war_NN machine_NN in_IN the_DT history_NN of_IN the_DT Iraq-imposed_JJ war_NN ._.
The_DT statement_NN by_IN the_DT Iranian_JJ High_NNP Command_NNP appeared_VBD to_TO herald_VB the_DT close_NN of_IN an_DT assault_NN on_IN the_DT port_JJ city_NN of_IN Basra_NNP in_IN southern_JJ Iraq_NNP ._.
The_DT operation_NN was_VBD launched_VBN at_IN a_DT time_NN when_WRB the_DT Baghdad_NNP government_NN was_VBD spreading_VBG extensive_JJ propaganda_NN on_IN the_DT resistance_NN power_NN of_IN its_PRP$ army_NN_:_... ,_, said_VBD the_DT statement_NN quoted_VBN by_IN IRNA_NNP ._.
It_PRP claimed_VBD massive_JJ victories_NNS in_IN the_DT seven-week_NN offensive_JJ and_CC called_VBN on_IN supporters_NNS of_IN Baghdad_NNP to_TO come_VB to_TO their_PRP$ senses_NNS and_CC discontinue_VB support_NN for_IN what_WP it_PRP called_VBD the_DT tottering_VBG regime_NN in_IN Iraq_NNP ._.
Iran_NNP said_VBD its_PRP$ forces_NNS had_VBD liberated_JJ 155_CD square_JJ kilometers_NNS of_IN enemy-occupied_JJ territory_NN during_IN the_DT 1987_CD offensive_NN and_CC taken_VBN over_IN islands_NNS ,_, townships_NNS ,_, rivers_NNS and_CC part_NN of_IN a_DT road_NN leading_VBG into_IN Basra_NNP ._.
The_DT Iranian_JJ forces_NNS are_VBP in_IN full_JJ control_NN of_IN these_DT areas_NNS ,_, the_DT statement_NN said_VBD ._.
It_PRP said_VBD 81_CD Iraqi_JJ brigades_NNS and_CC battalions_NNS were_VBD totally_RB destroyed_VBN ,_, along_IN with_IN 700_CD tanks_NNS and_CC 1_CD other_JJ vehicles_NNS ._. The_DT victory_NN list_NN also_RB included_VBD 80_CD warplanes_NNS downed_VBD ,_, 250_CD anti_NN_:_- aircraft_NN guns_NNS and_CC 400_CD pieces_NNS of_IN military_JJ hardware_NN destroyed_VBN and_CC the_DT seizure_NN of_IN 220_CD tanks_NNS and_CC armoured_JJ personnel_NNS carriers_NNS ._.
Sentence_NN number_NN 1_CD has_VBZ 6_CD words_NNS ._. Sentence_NN number_NN 2_CD ,_, 5_CD words_NNS ._.
They_NNP sent_VBD him_PRP running_VBG in_IN the_DT evening_NN ._.
He_PRP did_VBD not_RB come_VB back_RB ._.
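
The word_TAG lines above, one sentence per line, are OpenNLP's POS-tagger training
format (each token is split on its final underscore into word and tag). A minimal
training sketch, assuming OpenNLP Tools 1.8 and hypothetical file names; it uses the
perceptron algorithm from the trainer parameters above, since maxent needs more data:

import java.io.File;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.postag.WordTagSampleStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TrainTestPosModel {
  public static void main(String[] args) throws Exception {
    ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File("pos-training.txt")), // hypothetical name
        StandardCharsets.UTF_8);
    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ALGORITHM_PARAM, "PERCEPTRON"); // small data set
    try (ObjectStream<POSSample> samples = new WordTagSampleStream(lines)) {
      POSModel model = POSTaggerME.train("en", samples, params, new POSTaggerFactory());
      try (FileOutputStream out = new FileOutputStream("en-test-pos.bin")) { // hypothetical name
        model.serialize(out);
      }
    }
  }
}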

View File

@ -0,0 +1,144 @@
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government.
The Iranian news agency IRNA, in a report received in London, said the operation code-named Karbala-5 launched into Iraq on January 9 was now over.
It quoted a joint statewment by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war."
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq.
"The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army...," said the statement quoted by IRNA.
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "come to their senses" and discontinue support for what it called the tottering regime in Iraq.
Iran said its forces had "liberated" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands, townships, rivers and part of a road leading into Basra.
The Iranian forces "are in full control of these areas," the statement said.
It said 81 Iraqi brigades and battalions were totally destroyed, along with 700 tanks and 1,500 other vehicles.
The victory list also included 80 warplanes downed, 250 anti- aircraft guns and 400 pieces of military hardware destroyed and the seizure of 220 tanks and armoured personnel carriers.
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25, the Federal Reserve said.
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs, with extended credits up 10 mln dlrs at 304 mln dlrs.
The week was the second half of a two-week statement period.
Net borrowings in the prior week averaged 451 mln dlrs.
Commenting on the two-week statement period ended February 25, the Fed said that banks had average net free reserves of 644 mln dlrs a day, down from 1.34 billion two weeks earlier.
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday.
He said that natural float had been "acting a bit strangely" for this time of year, noting that there had been poor weather during the latest week.
The spokesman said that natural float ranged from under 500 mln dlrs on Friday, for which he could give no reason, to nearly one billion dlrs on both Thursday and Wednesday.
The Fed spokeman could give no reason for Thursday's high float, but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts.
For the week as a whole, he said that float related as of adjustments were "small," adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country.
The spokesman said that on both Tuesday and Wednesday, two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days.
However, he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact.
During the week ended Wednesday, 45 pct of net discount window borrowings were made by the smallest banks, with 30 pct by the 14 large money center banks and 25 pct by large regional institutions.
On Wednesday, 55 pct of the borrowing was accounted for by the money center banks, with 30 pct by the large regionals and 15 pct by the smallest banks.
The Fed spokesman said the banking system had excess reserves on Thursday, Monday and Tuesday and a deficit on Friday and Wedndsday.
That produced a small daily average deficit for the week as a whole.
For the two-week period, he said there were relatively high excess reserves on a daily avearge, almost all of which were at the smallest banks.
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc, but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock.
American Express stock got a lift from the rumor, as the market calculated a partially public Shearson may command a good market value, thereby boosting the total value of American Express.
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend.
American Express closed on the New York Stock Exchange at 72-5/8, up 4-1/8 on heavy volume.
American Express would not comment on the rumors or its stock activity.
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes.
At the meeting, company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson, according to analysts.
Yesterday, Shearson said it was elevating its chief operating officer, Jeffery Lane, to the added position of president, which had been vacant.
It also created four new positions for chairmen of its operating divisions.
Analysts speculated a partial spinoff would make most sense, contrary to one variation on market rumors of a total spinoff.
Some analysts, however, disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express, contributing about 20 pct of earnings last year.
"I think it is highly unlikely that American Express is going to sell shearson," said Perrin Long of Lipper Analytical.
He questioned what would be a better investment than "a very profitable securities firm."
Several analysts said American Express is not in need of cash, which might be the only reason to sell a part of a strong asset.
But others believe the company could very well of considered the option of spinning out part of Shearson, and one rumor suggests selling about 20 pct of it in the market.
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past.
"Shearson being as profitable as it is would have fetched a big premium in the market place.
Shearson's book value is in the 1.4 mln dlr range.
Shearson in the market place would probably be worth three to 3.5 bilion dlrs in terms of market capitalization," said Eckenfelder.
Some analysts said American Express could use capital since it plans to expand globally.
"They have enormous internal growth plans that takes capital.
You want your stock to reflect realistic valuations to enhance your ability to make all kinds of endeavors down the road," said E.F. Hutton Group analyst Michael Lewis.
"They've outlined the fact that they're investing heavily in the future, which goes heavily into the international arena," said Lewis.
"...That does not preclude acquisitions and divestitures along the way," he said.
Lewis said if American Express reduced its exposure to the brokerage business by selling part of shearson, its stock might better reflect other assets, such as the travel related services business.
"It could find its true water mark with a lesser exposure to brokerage.
The value of the other components could command a higher multiple because they constitute a higher percentage of the total operating earnings of the company," he said.
Lewis said Shearson contributed 316 mln in after-tax operating earnings, up from about 200 mln dlrs in 1985.
Reuter &#3;
Coleco Industries Inc said it expects to return to profitability in 1987.
Earlier, Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier.
In a prepared statement, the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs.
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year.
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability, it said.
At the annual Toy Fair earlier this month, vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable.
Venezuela is seeking a 'constructive and flexible' attitude from its creditor banks in current talks to reschedule 21 billion dlrs in foreign debt, finance minister manuel azpurua told a press conference.
He declined to comment on meetings this week in new york between public finances director jorge marcano and venezuela's 13-bank advisory committee except to say, "they are progressing."
Azpurua said venezuela has shown solidarity with brazil's decision to suspend payments, but each country must negotiate according to its own interest.
Asked to comment on chile's agreement with its creditors today, which includes an interest rate margin of one pct over libor, azpurua said only, "that is good news."
According to banking sources, the banks' latest offer to venezuela is also a one pct margin as against the last february's 1-1/8 pct rescheduling accord and the 7/8 pct Venezuela wants.
Azpurua said four basic elements are being negotiated with the banks now: spread reduction, deferral of principal payments due in 1987 and 1988, lengthening the 12-1/2 year repayment schedule, and debt capitalization schemes.
Azpurua said the government plans to pay 2.1 billion dlrs in public and private debt principal this year.
It was due to amortize 1.05 billion dlrs under the rescheduling, and pay 420 mln dlrs in non-restructured principal, both public sector.
He said Venezuela's original proposal was to pay no principal on restructured debt this year, but it is now insisting that if it makes payments they be compensated by new bank loans.
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year, but that no direct commitment was likely on new loans.
"Debtors and bank creditors have a joint responsibility and there will be no lasting solution unless a positive flow of financing is guaranteed," Azpurua said.
However, he appeared to discard earlier Venezuelan proposals for a direct link between oil income and debt payments, "because circumstances change too quickly."
At the same time, he said the government is presently studying possible mechanisms for capitalizing public and private sector foreign debt, based on experience in other countries.
The rules would be published by the finance ministry and the central bank.
Thomson McKinnon Mortgage Assets Corp, a unit of Thomson McKinnon Inc, is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOs.
The floating rate class amounts to 60 mln dlrs.
It has an average life of 7.11 years and matures 2018.
The CMOs have an initial coupon of 7.0375 pct, which will be reset 60 basis points above LIBOR, said sole manager Thomson McKinnon.
The inverse floater totals 4.8 mln dlrs.
It has an average life of 13.49 years and matures 2018.
These CMOs were given an initial coupon of 11-1/2 pct and priced at 104.40.
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (LIBOR minus 6-1/2 pct).
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined.
"The yield floats opposite of LIBOR," he said.
The fixed-rate tranche totals 35.2 mln dlrs.
It has an average life of 3.5 years and matures 2016.
The CMOs were assigned a 7.65 pct coupon and par pricing.
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp (Freddie Mac) certificates.
OPEC may be forced to meet before a scheduled June session to readdress its production cutting agreement if the organization wants to halt the current slide in oil prices, oil industry analysts said.
"The movement to higher oil prices was never to be as easy as OPEC thought.
They may need an emergency meeting to sort out the problems," said Daniel Yergin, director of Cambridge Energy Research Associates, CERA.
Analysts and oil industry sources said the problem OPEC faces is excess oil supply in world oil markets.
"OPEC's problem is not a price problem but a production issue and must be addressed in that way," said Paul Mlotok, oil analyst with Salomon Brothers Inc.
He said the market's earlier optimism about OPEC and its ability to keep production under control has given way to a pessimistic outlook that the organization must address soon if it wishes to regain the initiative in oil prices.
But some other analysts were uncertain that even an emergency meeting would address the problem of OPEC production above the 15.8 mln bpd quota set last December.
"OPEC has to learn that in a buyers market you cannot have deemed quotas, fixed prices and set differentials," said the regional manager for one of the major oil companies who spoke on condition that he not be named.
"The market is now trying to teach them that lesson again," he added.
David T. Mizrahi, editor of Mideast reports, expects OPEC to meet before June, although not immediately.
However, he is not optimistic that OPEC can address its principal problems.
"They will not meet now as they try to take advantage of the winter demand to sell their oil, but in late March and April when demand slackens," Mizrahi said.
But Mizrahi said that OPEC is unlikely to do anything more than reiterate its agreement to keep output at 15.8 mln bpd.
Analysts said that the next two months will be critical for OPEC's ability to hold together prices and output.
"OPEC must hold to its pact for the next six to eight weeks since buyers will come back into the market then," said Dillard Spriggs of Petroleum Analysis Ltd in New York.
But Bijan Moussavar-Rahmani of Harvard University's Energy and Environment Policy Center said that the demand for OPEC oil has been rising through the first quarter and this may have prompted excesses in its production.
"Demand for their (OPEC) oil is clearly above 15.8 mln bpd and is probably closer to 17 mln bpd or higher now so what we are seeing characterized as cheating is OPEC meeting this demand through current production," he told Reuters in a telephone interview.
BankAmerica Corp is not under pressure to act quickly on its proposed equity offering and would do well to delay it because of the stock's recent poor performance, banking analysts said.
Some analysts said they have recommended BankAmerica delay its up to one-billion-dlr equity offering, which has yet to be approved by the Securities and Exchange Commission.
BankAmerica stock fell this week, along with other banking issues, on the news that Brazil has suspended interest payments on a large portion of its foreign debt.
The stock traded around 12, down 1/8, this afternoon, after falling to 11-1/2 earlier this week on the news.
Banking analysts said that with the immediate threat of the First Interstate Bancorp <I> takeover bid gone, BankAmerica is under no pressure to sell the securities into a market that will be nervous on bank stocks in the near term.
BankAmerica filed the offer on January 26.
It was seen as one of the major factors leading to First Interstate's withdrawal of its takeover bid on February 9.
A BankAmerica spokesman said SEC approval is taking longer than expected and market conditions must now be re-evaluated.
"The circumstances at the time will determine what we do," said Arthur Miller, BankAmerica's Vice President for Financial Communications, when asked if BankAmerica would proceed with the offer immediately after it receives SEC approval.
"I'd put it off as long as they conceivably could," said Lawrence Cohn, analyst with Merrill Lynch, Pierce, Fenner and Smith.
Cohn said the longer BankAmerica waits, the longer they have to show the market an improved financial outlook.
Although BankAmerica has yet to specify the types of equities it would offer, most analysts believed a convertible preferred stock would encompass at least part of it.
Such an offering at a depressed stock price would mean a lower conversion price and more dilution to BankAmerica stock holders, noted Daniel Williams, analyst with Sutro Group.
Several analysts said that while they believe the Brazilian debt problem will continue to hang over the banking industry through the quarter, the initial shock reaction is likely to ease over the coming weeks.
Nevertheless, BankAmerica, which holds about 2.70 billion dlrs in Brazilian loans, stands to lose 15-20 mln dlrs if the interest rate is reduced on the debt, and as much as 200 mln dlrs if Brazil pays no interest for a year, said Joseph Arsenio, analyst with Birr, Wilson and Co.
He noted, however, that any potential losses would not show up in the current quarter.
The Federal Deposit Insurance Corp (FDIC) said three troubled banks in Texas and Louisiana were merged with healthy financial institutions.
The FDIC said it subsidized the merger of Central Bank and Trust Co, Glenmora, La., with the healthy Peoples Bank and Trust Co, Natchitoches, La., after state regulators notified it that Central was in danger of failing.
Central had assets of 28.3 mln dlrs.
The FDIC said the deposits of the failed Farmers State Bank, Hart, Tex., were assumed by Hale County State Bank, Plainview, Tex.
Farmers, with 9.6 mln dlrs in assets, was closed by Texas bank regulators.
The deposits of the failed First National Bank of Crosby, Crosby, Tex., with total assets of 8.2 mln dlrs, were assumed by Central Bancshares of the South Inc, Birmingham, Ala., after First National was closed by federal bank regulators, the FDIC said.
Brazil's 14-bank advisory committee expressed "grave concern" to chief debt negotiator Antonio Padua de Seixas over the country's suspension of interest payments, according to a telex from committee chairman Citibank to creditor banks worldwide.
Bankers said the diplomatic phrase belied the deep anger and frustration on the committee over Brazil's unilateral move last Friday and its subsequent freeze on some 15 billion dlrs of short-term trade and interbank lines.
Seixas, director of the Brazilian central bank's foreign debt department, met the full panel on Tuesday and Wednesday.
Seixas, who met again this morning with senior Citibank executive William Rhodes and representatives from committee vice-chairmen Morgan Guaranty Trust Co and Lloyds Bank Plc, told the banks that the government was preparing a telex to explain and clarify the freeze on short-term credits.
The telex could be sent to creditors as early as today, bankers said.
Despite the rising tempers, bankers said there are no plans for Brazilian finance minister Dilson Funaro to meet commercial bankers during his trip to Washington on Friday and Saturday.
Funaro will be explaining Brazil's actions to U.S. Treasury Secretary James Baker, Federal Reserve Board chairman Paul Volcker and International Monetary Fund managing director Michel Camdessus before travelling to Europe at the weekend.

View File

@ -0,0 +1,69 @@
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government<SPLIT>.
The Iranian news agency IRNA<SPLIT>, in a report received in London<SPLIT>, said the operation code-named Karbala-5 launched into Iraq on January 9 was now over<SPLIT>.
It quoted a joint statement by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "<SPLIT>dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war<SPLIT>.<SPLIT>"
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq<SPLIT>.
"<SPLIT>The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army<SPLIT>...<SPLIT>,<SPLIT>" said the statement quoted by IRNA<SPLIT>.
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "<SPLIT>come to their senses<SPLIT>" and discontinue support for what it called the tottering regime in Iraq<SPLIT>.
Iran said its forces had "<SPLIT>liberated<SPLIT>" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands<SPLIT>, townships<SPLIT>, rivers and part of a road leading into Basra<SPLIT>.
The Iranian forces "<SPLIT>are in full control of these areas<SPLIT>,<SPLIT>" the statement said<SPLIT>.
It said 81 Iraqi brigades and battalions were totally destroyed<SPLIT>, along with 700 tanks and 1,500 other vehicles<SPLIT>.
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25<SPLIT>, the Federal Reserve said<SPLIT>.
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs<SPLIT>, with extended credits up 10 mln dlrs at 304 mln dlrs<SPLIT>.
The week was the second half of a two-week statement period<SPLIT>.
Net borrowings in the prior week averaged 451 mln dlrs<SPLIT>.
Commenting on the two-week statement period ended February 25<SPLIT>, the Fed said that banks had average net free reserves of 644 mln dlrs a day<SPLIT>, down from 1.34 billion two weeks earlier<SPLIT>.
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday<SPLIT>.
He said that natural float had been "<SPLIT>acting a bit strangely<SPLIT>" for this time of year<SPLIT>, noting that there had been poor weather during the latest week<SPLIT>.
The spokesman said that natural float ranged from under 500 mln dlrs on Friday<SPLIT>, for which he could give no reason<SPLIT>, to nearly one billion dlrs on both Thursday and Wednesday<SPLIT>.
The Fed spokesman could give no reason for Thursday's high float<SPLIT>, but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts<SPLIT>.
For the week as a whole<SPLIT>, he said that float related as of adjustments were "<SPLIT>small<SPLIT>,<SPLIT>" adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country<SPLIT>.
The spokesman said that on both Tuesday and Wednesday<SPLIT>, two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days<SPLIT>.
However<SPLIT>, he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact<SPLIT>.
During the week ended Wednesday<SPLIT>, 45 pct of net discount window borrowings were made by the smallest banks<SPLIT>, with 30 pct by the 14 large money center banks and 25 pct by large regional institutions<SPLIT>.
On Wednesday<SPLIT>, 55 pct of the borrowing was accounted for by the money center banks<SPLIT>, with 30 pct by the large regionals and 15 pct by the smallest banks<SPLIT>.
The Fed spokesman said the banking system had excess reserves on Thursday<SPLIT>, Monday and Tuesday and a deficit on Friday and Wednesday<SPLIT>.
That produced a small daily average deficit for the week as a whole<SPLIT>.
For the two-week period<SPLIT>, he said there were relatively high excess reserves on a daily average<SPLIT>, almost all of which were at the smallest banks<SPLIT>.
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc<SPLIT>, but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock<SPLIT>.
American Express stock got a lift from the rumor<SPLIT>, as the market calculated a partially public Shearson may command a good market value<SPLIT>, thereby boosting the total value of American Express<SPLIT>.
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend<SPLIT>.
American Express closed on the New York Stock Exchange at 72-5/8<SPLIT>, up 4-1/8 on heavy volume<SPLIT>.
American Express would not comment on the rumors or its stock activity<SPLIT>.
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes<SPLIT>.
At the meeting<SPLIT>, company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson<SPLIT>, according to analysts<SPLIT>.
Yesterday<SPLIT>, Shearson said it was elevating its chief operating officer<SPLIT>, Jeffery Lane<SPLIT>, to the added position of president<SPLIT>, which had been vacant<SPLIT>.
It also created four new positions for chairmen of its operating divisions<SPLIT>.
Analysts speculated a partial spinoff would make most sense<SPLIT>, contrary to one variation on market rumors of a total spinoff<SPLIT>.
Some analysts<SPLIT>, however<SPLIT>, disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express<SPLIT>, contributing about 20 pct of earnings last year<SPLIT>.
"<SPLIT>I think it is highly unlikely that American Express is going to sell shearson<SPLIT>,<SPLIT>" said Perrin Long of Lipper Analytical<SPLIT>.
He questioned what would be a better investment than "<SPLIT>a very profitable securities firm<SPLIT>.<SPLIT>"
Several analysts said American Express is not in need of cash<SPLIT>, which might be the only reason to sell a part of a strong asset<SPLIT>.
But others believe the company could very well have considered the option of spinning out part of Shearson<SPLIT>, and one rumor suggests selling about 20 pct of it in the market<SPLIT>.
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past<SPLIT>.
"<SPLIT>Shearson being as profitable as it is would have fetched a big premium in the market place<SPLIT>.
Some analysts said American Express could use capital since it plans to expand globally<SPLIT>.
"<SPLIT>They've outlined the fact that they're investing heavily in the future<SPLIT>, which goes heavily into the international arena<SPLIT>,<SPLIT>" said Lewis<SPLIT>.
Lewis said if American Express reduced its exposure to the brokerage business by selling part of Shearson<SPLIT>, its stock might better reflect other assets<SPLIT>, such as the travel related services business<SPLIT>.
Lewis said Shearson contributed 316 mln dlrs in after-tax operating earnings<SPLIT>, up from about 200 mln dlrs in 1985<SPLIT>.
Coleco Industries Inc said it expects to return to profitability in 1987<SPLIT>.
Earlier<SPLIT>, Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier<SPLIT>.
In a prepared statement<SPLIT>, the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs<SPLIT>.
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year<SPLIT>.
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability<SPLIT>, it said<SPLIT>.
At the annual Toy Fair earlier this month<SPLIT>, vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable<SPLIT>.
Azpurua said Venezuela has shown solidarity with Brazil's decision to suspend payments<SPLIT>, but each country must negotiate according to its own interest<SPLIT>.
Azpurua said the government plans to pay 2.1 billion dlrs in public and private debt principal this year<SPLIT>.
It was due to amortize 1.05 billion dlrs under the rescheduling<SPLIT>, and pay 420 mln dlrs in non-restructured principal<SPLIT>, both public sector<SPLIT>.
He said Venezuela's original proposal was to pay no principal on restructured debt this year<SPLIT>, but it is now insisting that if it makes payments they be compensated by new bank loans<SPLIT>.
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year<SPLIT>, but that no direct commitment was likely on new loans<SPLIT>.
At the same time<SPLIT>, he said the government is presently studying possible mechanisms for capitalizing public and private sector foreign debt<SPLIT>, based on experience in other countries<SPLIT>.
The rules would be published by the finance ministry and the central bank<SPLIT>.
Thomson McKinnon Mortgage Assets Corp<SPLIT>, a unit of Thomson McKinnon Inc<SPLIT>, is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOs<SPLIT>.
The floating rate class amounts to 60 mln dlrs<SPLIT>.
The inverse floater totals 4.8 mln dlrs<SPLIT>.
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (<SPLIT>LIBOR minus 6-1/2 pct<SPLIT>)<SPLIT>.
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined<SPLIT>.
The fixed-rate tranche totals 35.2 mln dlrs<SPLIT>.
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp<SPLIT>, Freddie Mac<SPLIT>, certificates<SPLIT>.

View File

@ -20,12 +20,8 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.English;
public class TestStopFilter extends BaseTokenStreamTestCase {
@ -111,7 +107,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
7,
1,
null,
true);
true,
null);
}
private void doTestStopPositons(StopFilter stpf) throws IOException {

View File

@ -161,6 +161,9 @@ org.apache.james.apache.mime4j.version = 0.7.2
/org.apache.mina/mina-core = 2.0.0-M5
/org.apache.opennlp/opennlp-maxent = 3.0.3
/org.apache.opennlp/opennlp-tools = 1.8.3
org.apache.pdfbox.version = 2.0.6
/org.apache.pdfbox/fontbox = ${org.apache.pdfbox.version}
/org.apache.pdfbox/jempbox = 1.8.13

View File

@ -0,0 +1 @@
55e39e6b46e71f35229cdd6950e72d8cce3b5fd4

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,6 @@
Apache OpenNLP Maxent
Copyright 2013 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -0,0 +1 @@
3ce7c9056048f55478d983248cf18c7e02b1d072

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,6 @@
Apache OpenNLP Tools
Copyright 2015 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -285,6 +285,28 @@
<property name="analyzers-icu-javadocs.uptodate" value="true"/>
</target>
<property name="analyzers-opennlp.jar" value="${common.dir}/build/analysis/opennlp/lucene-analyzers-opennlp-${version}.jar"/>
<target name="check-analyzers-opennlp-uptodate" unless="analyzers-opennlp.uptodate">
<module-uptodate name="analysis/opennlp" jarfile="${analyzers-opennlp.jar}" property="analyzers-opennlp.uptodate"/>
</target>
<target name="jar-analyzers-opennlp" unless="analyzers-opennlp.uptodate" depends="check-analyzers-opennlp-uptodate">
<ant dir="${common.dir}/analysis/opennlp" target="jar-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="analyzers-opennlp.uptodate" value="true"/>
</target>
<property name="analyzers-opennlp-javadoc.jar" value="${common.dir}/build/analysis/opennlp/lucene-analyzers-opennlp-${version}-javadoc.jar"/>
<target name="check-analyzers-opennlp-javadocs-uptodate" unless="analyzers-opennlp-javadocs.uptodate">
<module-uptodate name="analysis/opennlp" jarfile="${analyzers-opennlp-javadoc.jar}" property="analyzers-opennlp-javadocs.uptodate"/>
</target>
<target name="javadocs-analyzers-opennlp" unless="analyzers-opennlp-javadocs.uptodate" depends="check-analyzers-opennlp-javadocs-uptodate">
<ant dir="${common.dir}/analysis/opennlp" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="analyzers-opennlp-javadocs.uptodate" value="true"/>
</target>
<property name="analyzers-phonetic.jar" value="${common.dir}/build/analysis/phonetic/lucene-analyzers-phonetic-${version}.jar"/>
<target name="check-analyzers-phonetic-uptodate" unless="analyzers-phonetic.uptodate">
<module-uptodate name="analysis/phonetic" jarfile="${analyzers-phonetic.jar}" property="analyzers-phonetic.uptodate"/>

View File

@ -41,6 +41,7 @@ import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
@ -127,7 +128,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// lastStartOffset)
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
boolean offsetsAreCorrect) throws IOException {
boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@ -167,6 +168,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
keywordAtt = ts.getAttribute(KeywordAttribute.class);
}
PayloadAttribute payloadAtt = null;
if (payloads != null) {
assertTrue("has no PayloadAttribute", ts.hasAttribute(PayloadAttribute.class));
payloadAtt = ts.getAttribute(PayloadAttribute.class);
}
// Maps position to the start/end offset:
final Map<Integer,Integer> posToStartOffset = new HashMap<>();
final Map<Integer,Integer> posToEndOffset = new HashMap<>();
@ -185,6 +192,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 }));
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
assertTrue("token "+i+" does not exist", ts.incrementToken());
@ -209,6 +217,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (keywordAtts != null) {
assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword());
}
if (payloads != null) {
if (payloads[i] != null) {
assertEquals("payloads " + i, new BytesRef(payloads[i]), payloadAtt.getPayload());
} else {
assertNull("payloads " + i, payloads[i]);
}
}
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
@ -283,6 +298,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (typeAtt != null) typeAtt.setType("bogusType");
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
if (keywordAtt != null) keywordAtt.setKeyword(true);
if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 }));
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
@ -305,7 +322,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
@ -374,6 +391,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
checkResetException(a, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, null, null);
}
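The new payloads parameter above lets token-stream assertions verify each token's PayloadAttribute bytes alongside terms, offsets, and positions. Below is a minimal sketch of a test using the new assertAnalyzesTo overload; the PayloadSettingAnalyzer and the expected "pos:N" payload values are illustrative assumptions, not part of this commit:
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestPayloadAssertions extends BaseTokenStreamTestCase {
  public void testPayloadsAreChecked() throws IOException {
    // Hypothetical analyzer assumed to attach a UTF-8 "pos:N" payload to each token.
    Analyzer a = new PayloadSettingAnalyzer();
    assertAnalyzesTo(a, "quick brown fox",
        new String[] { "quick", "brown", "fox" },   // expected terms
        null, null, null,                           // start/end offsets and types not checked
        new int[] { 1, 1, 1 },                      // expected position increments
        null,                                       // position lengths not checked
        true,                                       // offsetsAreCorrect
        new byte[][] {                              // expected payload bytes, one entry per token
            "pos:0".getBytes(StandardCharsets.UTF_8),
            "pos:1".getBytes(StandardCharsets.UTF_8),
            "pos:2".getBytes(StandardCharsets.UTF_8) });
  }
}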

View File

@ -53,6 +53,13 @@ New Features
----------------------
* SOLR-11285: Simulation framework for autoscaling. (ab)
* LUCENE-2899: In the Solr analysis-extras contrib, added support for the
OpenNLP-based analysis components in the Lucene analysis/opennlp module:
tokenization, part-of-speech tagging, phrase chunking, and lemmatization.
Also added OpenNLP-based named entity extraction as a Solr update request
processor. (Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai Gülzau,
Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
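As an illustrative sketch of what this integration enables (the field type as a whole is an assumption rather than part of this change, and the model file names are placeholders that must point at real OpenNLP models), a Solr schema could wire the full chain into one field type:
<!-- Sketch only: en-sent.bin, en-tokenizer.bin, en-pos-maxent.bin, en-chunker.bin,
     and en-lemmas.dict are placeholder model/dictionary file names. -->
<fieldType name="text_opennlp" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="en-sent.bin"
               tokenizerModel="en-tokenizer.bin"/>
    <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
    <filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="en-chunker.bin"/>
    <filter class="solr.OpenNLPLemmatizerFilterFactory" dictionary="en-lemmas.dict"/>
  </analyzer>
</fieldType>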
Optimizations
----------------------

View File

@ -1,8 +1,10 @@
The analysis-extras plugin provides additional analyzers that rely
upon large dependencies/dictionaries.
It includes integration with ICU for multilingual support, and
analyzers for Chinese and Polish.
It includes integration with ICU for multilingual support,
analyzers for Chinese and Polish, and integration with
OpenNLP for multilingual tokenization, part-of-speech tagging,
lemmatization, phrase chunking, and named-entity recognition.
ICU relies upon lucene-libs/lucene-analyzers-icu-X.Y.jar
and lib/icu4j-X.Y.jar
@ -14,3 +16,5 @@ Stempel relies on lucene-libs/lucene-analyzers-stempel-X.Y.jar
Morfologik relies on lucene-libs/lucene-analyzers-morfologik-X.Y.jar
and lib/morfologik-*.jar
OpenNLP relies on lucene-libs/lucene-analyzers-opennlp-X.Y.jar
and lib/opennlp-*.jar

View File

@ -30,13 +30,14 @@
<path id="analysis.extras.lucene.libs">
<pathelement location="${analyzers-icu.jar}"/>
<!--
Although the smartcn, stempel, and morfologik jars are not dependencies of
Although the smartcn, stempel, morfologik, and opennlp jars are not dependencies of
code in the analysis-extras contrib, they must remain here in order to
populate the Solr distribution
-->
<pathelement location="${analyzers-smartcn.jar}"/>
<pathelement location="${analyzers-stempel.jar}"/>
<pathelement location="${analyzers-morfologik.jar}"/>
<pathelement location="${analyzers-opennlp.jar}"/>
</path>
<path id="classpath">
@ -54,7 +55,7 @@
</path>
<!--
Although the smartcn, stempel, and morfologik jars are not dependencies of
Although the smartcn, stempel, morfologik, and opennlp jars are not dependencies of
code in the analysis-extras contrib, they must remain here in order to
populate the Solr distribution
-->
@ -66,6 +67,7 @@
<target name="jar-analyzers-smartcn"/>
<target name="jar-analyzers-stempel"/>
<target name="jar-analyzers-morfologik"/>
<target name="jar-analyzers-opennlp"/>
</antcall>
<property name="analyzers-icu.uptodate" value="true"/> <!-- compile-time dependency -->
<mkdir dir="${build.dir}/lucene-libs"/>
@ -85,6 +87,6 @@
</copy>
</target>
<target name="compile-core" depends="jar-analyzers-icu, solr-contrib-build.compile-core"/>
<target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-opennlp, solr-contrib-build.compile-core"/>
<target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
</project>

View File

@ -24,6 +24,9 @@
</configurations>
<dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="${/com.ibm.icu/icu4j}" conf="compile"/>
<dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" conf="compile" />
<dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" conf="compile" />
<!--
Although the 3rd party morfologik jars are not dependencies of code in
the analysis-extras contrib, they must remain here in order to

View File

@ -0,0 +1,571 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
* any matching <code>source</code> field into a configured <code>dest</code> field, after
* first tokenizing the source text using the index analyzer on the configured
* <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
* as the tokenizer. E.g.:
*
* <pre class="prettyprint">
* &lt;fieldType name="opennlp-en-tokenization" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
* sentenceModel="en-sent.bin"
* tokenizerModel="en-tokenizer.bin"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*
* <p>See the <a href="http://opennlp.apache.org/models.html">OpenNLP website</a>
* for information on downloading pre-trained models.</p>
*
* <p>
* The <code>source</code> field(s) can be configured as either:
* </p>
* <ul>
* <li>One or more <code>&lt;str&gt;</code></li>
* <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
* <li>A <code>&lt;lst&gt;</code> containing
* {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
* </ul>
*
* <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
* containing the literal name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a
* regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
* is used the pattern will be matched against all fields matched by the source selector, and the replacement
* string (including any capture groups specified from the pattern) will be evaluated using
* {@link Matcher#replaceAll(String)} to generate the literal name of the destination field. Additionally,
* an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
* <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
* the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
* <code>dest</code> field will be populated.
* </p>
*
* <p>If the resolved <code>dest</code> field already exists in the document, then the
* named entities extracted from the <code>source</code> fields will be added to it.
* </p>
* <p>
* In the example below:
* </p>
* <ul>
* <li>Named entities will be extracted from the <code>text</code> field and added
* to the <code>people_s</code> field</li>
* <li>Named entities will be extracted from both the <code>title</code> and
* <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
* <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
* -- except for <code>notes_txt</code> -- and added into the <code>people_s</code> field</li>
* <li>Named entities will be extracted from any field with a name beginning with "desc" and
* ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
* not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
* "key_description_people")</li>
* <li>Named entities will be extracted from the <code>summary</code> field and added
* to the <code>summary_person_s</code> field, assuming that the modelFile only extracts
* entities of type "person".</li>
* </ul>
*
* <pre class="prettyprint">
* &lt;updateRequestProcessorChain name="multiple-extract"&gt;
* &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
* &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
* &lt;str name="source"&gt;text&lt;/str&gt;
* &lt;str name="dest"&gt;people_s&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
* &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
* &lt;arr name="source"&gt;
* &lt;str&gt;title&lt;/str&gt;
* &lt;str&gt;subtitle&lt;/str&gt;
* &lt;/arr&gt;
* &lt;str name="dest"&gt;titular_people&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
* &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
* &lt;lst name="source"&gt;
* &lt;str name="fieldRegex"&gt;.*_txt$&lt;/str&gt;
* &lt;lst name="exclude"&gt;
* &lt;str name="fieldName"&gt;notes_txt&lt;/str&gt;
* &lt;/lst&gt;
* &lt;/lst&gt;
* &lt;str name="dest"&gt;people_s&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
* &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
* &lt;lst name="source"&gt;
* &lt;str name="fieldRegex"&gt;^desc(.*)s$&lt;/str&gt;
* &lt;/lst&gt;
* &lt;lst name="dest"&gt;
* &lt;str name="pattern"&gt;^desc(.*)s$&lt;/str&gt;
* &lt;str name="replacement"&gt;key_desc$1_people&lt;/str&gt;
* &lt;/lst&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
* &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
* &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
* &lt;str name="source"&gt;summary&lt;/str&gt;
* &lt;str name="dest"&gt;summary_{EntityType}_s&lt;/str&gt;
* &lt;/processor&gt;
* &lt;/updateRequestProcessorChain&gt;
* </pre>
*
* @since 7.3.0
*/
public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
extends UpdateRequestProcessorFactory implements SolrCoreAware {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String SOURCE_PARAM = "source";
public static final String DEST_PARAM = "dest";
public static final String PATTERN_PARAM = "pattern";
public static final String REPLACEMENT_PARAM = "replacement";
public static final String MODEL_PARAM = "modelFile";
public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
public static final String ENTITY_TYPE = "{EntityType}";
private SelectorParams srcInclusions = new SelectorParams();
private Collection<SelectorParams> srcExclusions = new ArrayList<>();
private FieldNameSelector srcSelector = null;
private String modelFile = null;
private String analyzerFieldType = null;
/**
* If pattern is null, this is a literal field name. If pattern is non-null then this
* is a replacement string that may contain meta-characters (i.e. capture group identifiers)
* @see #pattern
*/
private String dest = null;
/** @see #dest */
private Pattern pattern = null;
protected final FieldNameSelector getSourceSelector() {
if (null != srcSelector) return srcSelector;
throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
}
@SuppressWarnings("unchecked")
@Override
public void init(NamedList args) {
// high level (loose) check for which type of config we have.
//
// individual init methods do more strict syntax checking
if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) {
initSourceSelectorSyntax(args);
} else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
initSimpleRegexReplacement(args);
} else {
throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
PATTERN_PARAM + "' init params are mandatory");
}
Object modelParam = args.remove(MODEL_PARAM);
if (null == modelParam) {
throw new SolrException(SERVER_ERROR, "Missing required init param '" + MODEL_PARAM + "'");
}
if ( ! (modelParam instanceof CharSequence)) {
throw new SolrException(SERVER_ERROR, "Init param '" + MODEL_PARAM + "' must be a <str>");
}
modelFile = modelParam.toString();
Object analyzerFieldTypeParam = args.remove(ANALYZER_FIELD_TYPE_PARAM);
if (null == analyzerFieldTypeParam) {
throw new SolrException(SERVER_ERROR, "Missing required init param '" + ANALYZER_FIELD_TYPE_PARAM + "'");
}
if ( ! (analyzerFieldTypeParam instanceof CharSequence)) {
throw new SolrException(SERVER_ERROR, "Init param '" + ANALYZER_FIELD_TYPE_PARAM + "' must be a <str>");
}
analyzerFieldType = analyzerFieldTypeParam.toString();
if (0 < args.size()) {
throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
}
super.init(args);
}
/**
* init helper method that should only be called when we know for certain that both the
* "source" and "dest" init params do <em>not</em> exist.
*/
@SuppressWarnings("unchecked")
private void initSimpleRegexReplacement(NamedList args) {
// The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
// is used for the destination pattern...
//
// pattern != null && replacement != null
//
// ...as top level elements, with no other config options specified
// if we got here we know we had pattern and replacement, now check for the other two so that we can give a better
// message than "unexpected"
if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
}
assert args.indexOf(SOURCE_PARAM, 0) < 0;
Object patt = args.remove(PATTERN_PARAM);
Object replacement = args.remove(REPLACEMENT_PARAM);
if (null == patt || null == replacement) {
throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
DEST_PARAM + "' are not both specified");
}
if (0 != args.size()) {
throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
"' to be combined with other options.");
}
if (!(replacement instanceof String)) {
throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
}
if (!(patt instanceof String)) {
throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
}
dest = replacement.toString();
try {
this.pattern = Pattern.compile(patt.toString());
} catch (PatternSyntaxException pe) {
throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
" is not a valid regex pattern: " + patt, pe);
}
srcInclusions = new SelectorParams();
srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
}
/**
* init helper method that should only be called when we know for certain that both the
* "source" and "dest" init params <em>do</em> exist.
*/
@SuppressWarnings("unchecked")
private void initSourceSelectorSyntax(NamedList args) {
// Full and complete syntax where source and dest are mandatory.
//
// source may be a single string or a selector.
// dest may be a single string or list containing pattern and replacement
//
// source != null && dest != null
// if we got here we know we had source and dest, now check for the other two so that we can give a better
// message than "unexpected"
if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
}
Object d = args.remove(DEST_PARAM);
assert null != d;
List<Object> sources = args.getAll(SOURCE_PARAM);
assert null != sources;
if (1 == sources.size()) {
if (sources.get(0) instanceof NamedList) {
// nested set of selector options
NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
srcInclusions = parseSelectorParams(selectorConfig);
List<Object> excList = selectorConfig.getAll("exclude");
for (Object excObj : excList) {
if (null == excObj) {
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
"' child 'exclude' can not be null");
}
if (!(excObj instanceof NamedList)) {
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
"' child 'exclude' must be <lst/>");
}
NamedList exc = (NamedList) excObj;
srcExclusions.add(parseSelectorParams(exc));
if (0 < exc.size()) {
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
"' has unexpected 'exclude' sub-param(s): '"
+ selectorConfig.getName(0) + "'");
}
// call once per instance
selectorConfig.remove("exclude");
}
if (0 < selectorConfig.size()) {
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
"' contains unexpected child param(s): '" +
selectorConfig.getName(0) + "'");
}
// consume from the named list so it doesn't interfere with subsequent processing
sources.remove(0);
}
}
if (1 <= sources.size()) {
// source better be one or more strings
srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
}
if (srcInclusions == null) {
throw new SolrException(SERVER_ERROR,
"Init params do not specify any field from which to extract entities, please supply either "
+ SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs" +
"for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
}
if (d instanceof NamedList) {
NamedList destList = (NamedList) d;
Object patt = destList.remove(PATTERN_PARAM);
Object replacement = destList.remove(REPLACEMENT_PARAM);
if (null == patt || null == replacement) {
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
"' are both mandatory and can not be null");
}
if (! (patt instanceof String && replacement instanceof String)) {
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
"' must both be strings (i.e. <str>)");
}
if (0 != destList.size()) {
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
+ destList.getName(0) + "'");
}
try {
this.pattern = Pattern.compile(patt.toString());
} catch (PatternSyntaxException pe) {
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
" is not a valid regex pattern: " + patt, pe);
}
dest = replacement.toString();
} else if (d instanceof String) {
dest = d.toString();
} else {
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
"(i.e. <str>) or a list (i.e. <lst>) containing '" +
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM);
}
}
@Override
public void inform(final SolrCore core) {
srcSelector =
FieldMutatingUpdateProcessor.createFieldNameSelector
(core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
for (SelectorParams exc : srcExclusions) {
srcSelector = FieldMutatingUpdateProcessor.wrap
(srcSelector,
FieldMutatingUpdateProcessor.createFieldNameSelector
(core.getResourceLoader(), core, exc, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS));
}
try {
OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public final UpdateRequestProcessor getInstance
(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
final FieldNameSelector srcSelector = getSourceSelector();
return new UpdateRequestProcessor(next) {
private final NLPNERTaggerOp nerTaggerOp;
private Analyzer analyzer = null;
{
try {
nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
if (fieldType == null) {
throw new SolrException
(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
}
analyzer = fieldType.getIndexAnalyzer();
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
final SolrInputDocument doc = cmd.getSolrInputDocument();
// Destination may be regex replace string, or "{EntityType}" replaced by
// each entity's type, both of which can cause multiple output fields.
Map<String,SolrInputField> destMap = new HashMap<>();
// preserve initial values
for (final String fname : doc.getFieldNames()) {
if ( ! srcSelector.shouldMutate(fname)) continue;
Collection<Object> srcFieldValues = doc.getFieldValues(fname);
if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
String resolvedDest = dest;
if (pattern != null) {
Matcher matcher = pattern.matcher(fname);
if (matcher.find()) {
resolvedDest = matcher.replaceAll(dest);
} else {
log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
"but replacement pattern did not match, field skipped.", fname);
continue;
}
}
for (Object val : srcFieldValues) {
for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
SolrInputField destField = null;
String entityName = entity.first();
String entityType = entity.second();
// resolve {EntityType} per entity, so that entities of different types populate different dest fields
String typedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
if (doc.containsKey(typedDest)) {
destField = doc.getField(typedDest);
} else {
SolrInputField targetField = destMap.get(typedDest);
if (targetField == null) {
destField = new SolrInputField(typedDest);
} else {
destField = targetField;
}
}
destField.addValue(entityName);
// put it in map to avoid concurrent modification...
destMap.put(typedDest, destField);
}
}
}
for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
doc.put(entry.getKey(), entry.getValue());
}
super.processAdd(cmd);
}
/** Using configured NER model, extracts (name, type) pairs from the given source field value */
private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
List<Pair<String,String>> entitiesWithType = new ArrayList<>();
List<String> terms = new ArrayList<>();
List<Integer> startOffsets = new ArrayList<>();
List<Integer> endOffsets = new ArrayList<>();
String fullText = srcFieldValue.toString();
TokenStream tokenStream = analyzer.tokenStream("", fullText);
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
tokenStream.reset();
synchronized (nerTaggerOp) {
while (tokenStream.incrementToken()) {
terms.add(termAtt.toString());
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
if (endOfSentence) { // extract named entities one sentence at a time
extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
}
}
tokenStream.end();
tokenStream.close();
if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
}
nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
}
return entitiesWithType;
}
private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
entitiesWithType.add(new Pair<>(text, span.getType()));
}
terms.clear();
startOffsets.clear();
endOffsets.clear();
}
};
}
/** macro */
private static SelectorParams parseSelectorParams(NamedList args) {
return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
}
}
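/*
* Illustrative usage sketch (not part of this commit; the URL, core name, and field values
* are hypothetical): index a document through an update chain that includes this factory,
* e.g. the "multiple-extract" chain from the javadoc example above, using SolrJ:
*
* SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr/mycore").build();
* SolrInputDocument doc = new SolrInputDocument();
* doc.addField("id", "1");
* doc.addField("text", "From Flashman. To Panman.");
* UpdateRequest req = new UpdateRequest();
* req.setParam("update.chain", "multiple-extract"); // select the chain per-request
* req.add(doc);
* req.process(client);
* client.commit();
*/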

View File

@ -0,0 +1,24 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<body>
Update request processor invoking OpenNLP Named Entity Recognition over configured
source field(s), populating configured target field(s) with the results.
</body>
</html>

View File

@ -0,0 +1,49 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="test-opennlp-extract" version="1.6">
<fieldType name="opennlp-en-tokenization" class="solr.TextField">
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-test-sent.bin"
tokenizerModel="en-test-tokenizer.bin"/>
</analyzer>
</fieldType>
<fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="subject" type="text" indexed="true" stored="true"/>
<field name="title" type="text" indexed="true" stored="true"/>
<field name="subtitle" type="text" indexed="true" stored="true"/>
<field name="descs" type="text" indexed="true" stored="true"/>
<field name="descriptions" type="text" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_people" type="string" indexed="true" stored="true" multiValued="true"/>
</schema>

View File

@ -0,0 +1,206 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
<xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
<requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
<requestHandler name="/update" class="solr.UpdateRequestHandler" />
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<schemaFactory class="ClassicIndexSchemaFactory"/>
<updateRequestProcessorChain name="extract-single">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">source1_s</str>
<str name="dest">dest_s</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-single-regex">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">source1_s</str>
<lst name="dest">
<str name="pattern">source\d(_s)</str>
<str name="replacement">dest$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-multi">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">source1_s</str>
<str name="source">source2_s</str>
<str name="dest">dest_s</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-multi-regex">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">source1_s</str>
<str name="source">source2_s</str>
<lst name="dest">
<str name="pattern">source\d(_s)</str>
<str name="replacement">dest$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-array">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<arr name="source">
<str>source1_s</str>
<str>source2_s</str>
</arr>
<str name="dest">dest_s</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-array-regex">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<arr name="source">
<str>source1_s</str>
<str>source2_s</str>
</arr>
<lst name="dest">
<str name="pattern">source\d(_s)</str>
<str name="replacement">dest$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-selector">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">source\d_.*</str>
<lst name="exclude">
<str name="fieldRegex">source0_.*</str>
</lst>
</lst>
<str name="dest">dest_s</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-selector-regex">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">source\d_.*</str>
<lst name="exclude">
<str name="fieldRegex">source0_.*</str>
</lst>
</lst>
<lst name="dest">
<str name="pattern">source\d(_s)</str>
<str name="replacement">dest$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-regex-replaceall">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">foo.*</str>
</lst>
<lst name="dest">
<!-- unbounded pattern that can be replaced multiple times in field name -->
<str name="pattern">x(\d)</str>
<str name="replacement">y$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="extract-regex-replaceall-with-entity-type">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">foo.*</str>
</lst>
<lst name="dest">
<!-- unbounded pattern that can be replaced multiple times in field name -->
<str name="pattern">x(\d)</str>
<str name="replacement">{EntityType}_y$1</str>
</lst>
</processor>
</updateRequestProcessorChain>
<!-- example used in OpenNLPExtractNamedEntitiesUpdateProcessorFactory javadocs -->
<updateRequestProcessorChain name="multiple-extract">
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">text</str>
<str name="dest">people_s</str>
</processor>
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<arr name="source">
<str>title</str>
<str>subtitle</str>
</arr>
<str name="dest">titular_people</str>
</processor>
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">.*_txt$</str>
<lst name="exclude">
<str name="fieldName">notes_txt</str>
</lst>
</lst>
<str name="dest">people_s</str>
</processor>
<processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<lst name="source">
<str name="fieldRegex">^desc(.*)s$</str>
</lst>
<lst name="dest">
<str name="pattern">^desc(.*)s$</str>
<str name="replacement">key_desc$1_people</str>
</lst>
</processor>
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
<str name="modelFile">en-test-ner-person.bin</str>
<str name="analyzerFieldType">opennlp-en-tokenization</str>
<str name="source">summary</str>
<str name="dest">summary_{EntityType}_s</str>
</processor>
</updateRequestProcessorChain>
</config>

View File

@ -0,0 +1,48 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
A solrconfig.xml snippet containing indexConfig settings for randomized testing.
-->
<indexConfig>
<!-- this sys property is not set by SolrTestCaseJ4 because we ideally want to use
the RandomMergePolicy in all tests - but some tests expect very specific
Merge behavior, so those tests can set it as needed.
-->
<mergePolicyFactory class="${solr.tests.mergePolicyFactory:org.apache.solr.util.RandomMergePolicyFactory}" />
<useCompoundFile>${useCompoundFile:false}</useCompoundFile>
<maxBufferedDocs>${solr.tests.maxBufferedDocs}</maxBufferedDocs>
<ramBufferSizeMB>${solr.tests.ramBufferSizeMB}</ramBufferSizeMB>
<mergeScheduler class="${solr.tests.mergeScheduler}" />
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
<!-- this sys property is not set by SolrTestCaseJ4 because almost all tests should
use the single process lockType for speed - but tests that explicitly need
to vary the lockType can set it as needed.
-->
<lockType>${solr.tests.lockType:single}</lockType>
<infoStream>${solr.tests.infostream:false}</infoStream>
</indexConfig>

View File

@ -0,0 +1,192 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.File;
import java.util.Arrays;
import org.apache.commons.io.FileUtils;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory extends UpdateProcessorTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
initCore("solrconfig-opennlp-extract.xml", "schema-opennlp-extract.xml", testHome.getAbsolutePath());
}
@Test
public void testSimpleExtract() throws Exception {
SolrInputDocument doc = processAdd("extract-single",
doc(f("id", "1"),
f("source1_s", "Take this to Mr. Flashman.")));
assertEquals("dest_s should have stringValue", "Flashman", doc.getFieldValue("dest_s"));
}
@Test
public void testMultiExtract() throws Exception {
SolrInputDocument doc = processAdd("extract-multi",
doc(f("id", "1"),
f("source1_s", "Hello Flashman."),
f("source2_s", "Calling Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
}
@Test
public void testArrayExtract() throws Exception {
SolrInputDocument doc = processAdd("extract-array",
doc(f("id", "1"),
f("source1_s", "Currently we have Flashman. Not much else."),
f("source2_s", "Flashman. Is. Not. There.")));
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
}
@Test
public void testSelectorExtract() throws Exception {
SolrInputDocument doc = processAdd("extract-selector",
doc(f("id", "1"),
f("source0_s", "Flashman. Or not."),
f("source1_s", "Serendipitously, he was. I mean, Flashman. And yet."),
f("source2_s", "Correct, Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
}
public void testMultipleExtracts() throws Exception {
// test example from the javadocs
SolrInputDocument doc = processAdd("multiple-extract",
doc(f("id", "1"),
f("text", "From Flashman. To Panman."),
f("title", "It's Captain Flashman.", "Privately, Flashman."),
f("subtitle", "Ineluctably, Flashman."),
f("corrolary_txt", "Forsooth thou bringeth Flashman."),
f("notes_txt", "Yes Flashman."),
f("summary", "Many aspire to be Flashman."),
f("descs", "Courage, Flashman.", "Ain't he Flashman."),
f("descriptions", "Flashman. Flashman. Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("people_s"));
assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"), doc.getFieldValues("titular_people"));
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("key_desc_people"));
assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"), doc.getFieldValues("key_description_people"));
assertEquals("Flashman", doc.getFieldValue("summary_person_s")); // {EntityType} field name interpolation
}
public void testEquivalentExtraction() throws Exception {
SolrInputDocument d;
// regardless of chain, all of these checks should be equivalent
for (String chain : Arrays.asList("extract-single", "extract-single-regex",
"extract-multi", "extract-multi-regex",
"extract-array", "extract-array-regex",
"extract-selector", "extract-selector-regex")) {
// simple extract
d = processAdd(chain,
doc(f("id", "1111"),
f("source0_s", "Totally Flashman."), // not extracted
f("source1_s", "One nation under Flashman.", "Good Flashman.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("Flashman", "Flashman"), d.getFieldValues("dest_s"));
// append to existing values
d = processAdd(chain,
doc(f("id", "1111"),
field("dest_s", "orig1", "orig2"),
f("source0_s", "Flashman. In totality."), // not extracted
f("source1_s", "Two nations under Flashman.", "Meh Flashman.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
}
// should be equivalent for any chain matching source1_s and source2_s (but not source0_s)
for (String chain : Arrays.asList("extract-multi", "extract-multi-regex",
"extract-array", "extract-array-regex",
"extract-selector", "extract-selector-regex")) {
// simple extract
d = processAdd(chain,
doc(f("id", "1111"),
f("source0_s", "Not Flashman."), // not extracted
f("source1_s", "Could have had a Flashman.", "Bad Flashman."),
f("source2_s", "Indubitably Flashman.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("Flashman", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
// append to existing values
d = processAdd(chain,
doc(f("id", "1111"),
field("dest_s", "orig1", "orig2"),
f("source0_s", "Never Flashman."), // not extracted
f("source1_s", "Seeking Flashman.", "Evil incarnate Flashman."),
f("source2_s", "Perfunctorily Flashman.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
}
// any chain that copies source1_s to dest_s should be equivalent for these assertions
for (String chain : Arrays.asList("extract-single", "extract-single-regex",
"extract-multi", "extract-multi-regex",
"extract-array", "extract-array-regex",
"extract-selector", "extract-selector-regex")) {
// simple extract
d = processAdd(chain,
doc(f("id", "1111"),
f("source1_s", "Always Flashman.", "Flashman. Noone else.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("Flashman", "Flashman"), d.getFieldValues("dest_s"));
// append to existing values
d = processAdd(chain,
doc(f("id", "1111"),
field("dest_s", "orig1", "orig2"),
f("source1_s", "Flashman. And, scene.", "Contemporary Flashman. Yeesh.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
}
}
public void testExtractFieldRegexReplaceAll() throws Exception {
SolrInputDocument d = processAdd("extract-regex-replaceall",
doc(f("id", "1111"),
f("foo_x2_s", "Infrequently Flashman.", "In the words of Flashman."),
f("foo_x3_x7_s", "Flashman. Whoa.")));
assertNotNull(d);
assertEquals(Arrays.asList("Flashman", "Flashman"), d.getFieldValues("foo_y2_s"));
assertEquals("Flashman", d.getFieldValue("foo_y3_y7_s"));
}
public void testExtractFieldRegexReplaceAllWithEntityType() throws Exception {
SolrInputDocument d = processAdd("extract-regex-replaceall-with-entity-type",
doc(f("id", "1111"),
f("foo_x2_s", "Infrequently Flashman.", "In the words of Flashman."),
f("foo_x3_x7_s", "Flashman. Whoa.")));
assertNotNull(d);
assertEquals(d.getFieldNames().toString(), Arrays.asList("Flashman", "Flashman"), d.getFieldValues("foo_person_y2_s"));
assertEquals(d.getFieldNames().toString(),"Flashman", d.getFieldValue("foo_person_y3_person_y7_s"));
}
}

View File

@ -0,0 +1 @@
55e39e6b46e71f35229cdd6950e72d8cce3b5fd4

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,6 @@
Apache OpenNLP Maxent
Copyright 2013 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -0,0 +1 @@
3ce7c9056048f55478d983248cf18c7e02b1d072

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,6 @@
Apache OpenNLP Tools
Copyright 2015 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -1576,6 +1576,38 @@ This filter adds the token's type, as an encoded byte sequence, as its payload.
*Out:* "Pay"[<ALPHANUM>], "Bob's"[<APOSTROPHE>], "I.O.U."[<ACRONYM>]
== Type As Synonym Filter
This filter adds the token's type as a synonym token at the same position as the token, optionally prepending a configurable prefix.
*Factory class:* `solr.TypeAsSynonymFilterFactory`
*Arguments:*
`prefix`:: (optional) The prefix to prepend to the token's type.
*Examples:*
With the example below, each token's type will be emitted verbatim at the same position:
[source,xml]
----
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.TypeAsSynonymFilterFactory"/>
</analyzer>
----
With the example below, for a token "example.com" with type `<URL>`, the token emitted at the same position will be "\_type_<URL>":
[source,xml]
----
<analyzer>
<tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
<filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_"/>
</analyzer>
----
== Type Token Filter
This filter excludes or includes tokens based on a specified list of token types, assuming the tokens have type metadata associated with them. For example, the <<tokenizers.adoc#uax29-url-email-tokenizer,UAX29 URL Email Tokenizer>> emits "<URL>" and "<EMAIL>" typed tokens, as well as other types; this filter would allow you to pull out only e-mail addresses from text as tokens, as shown in the sketch below.
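For example, the following is a minimal sketch of an analyzer that keeps only e-mail address tokens. It assumes a hypothetical `email_type.txt` file whose single line is `<EMAIL>`; the `types` and `useWhitelist` arguments are the same ones used in the OpenNLP examples elsewhere in this guide:
[source,xml]
----
<analyzer>
  <tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
  <filter class="solr.TypeTokenFilterFactory" types="email_type.txt" useWhitelist="true"/>
</analyzer>
----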

View File

@ -355,6 +355,214 @@ This can increase recall by causing more matches. On the other hand, it can redu
</analyzer>
----
== OpenNLP Integration
The `lucene/analysis/opennlp` module provides OpenNLP integration via several analysis components: a tokenizer, a part-of-speech tagging filter, a phrase chunking filter, and a lemmatization filter. In addition to these analysis components, Solr also provides an update request processor to extract named entities - see <<update-request-processors.adoc#update-processor-factories-that-can-be-loaded-as-plugins,Update Processor Factories That Can Be Loaded as Plugins>>.
NOTE: The <<OpenNLP Tokenizer>> must be used with all other OpenNLP analysis components, for two reasons: first, the OpenNLP Tokenizer detects and marks the sentence boundaries required by all the OpenNLP filters; and second, since the pre-trained OpenNLP models used by these filters were trained using the corresponding language-specific sentence-detection/tokenization models, the same tokenization, using the same models, must be used at runtime for optimal performance.
See `solr/contrib/analysis-extras/README.txt` for information on which jars you need to add to your `SOLR_HOME/lib`.
=== OpenNLP Tokenizer
The OpenNLP Tokenizer takes two language-specific binary model files as parameters: a sentence detector model and a tokenizer model. The last token in each sentence is flagged, so that following OpenNLP-based filters can use this information to apply operations to tokens one sentence at a time. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
*Factory class:* `solr.OpenNLPTokenizerFactory`
*Arguments:*
`sentenceModel`:: (required) The path of a language-specific OpenNLP sentence detection model file. This path may be an absolute path, or a path relative to the Solr config directory.
`tokenizerModel`:: (required) The path of a language-specific OpenNLP tokenization model file. This path may be an absolute path, or a path relative to the Solr config directory.
*Example:*
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
</analyzer>
----
=== OpenNLP Part-Of-Speech Filter
This filter sets each token's type attribute to the part of speech (POS) assigned by the configured model. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
NOTE: Lucene currently does not index token types, so if you want to keep this information, you have to preserve it either in a payload or as a synonym; see the examples below.
*Factory class:* `solr.OpenNLPPOSFilterFactory`
*Arguments:*
`posTaggerModel`:: (required) The path of a language-specific OpenNLP POS tagger model file. This path may be an absolute path, or a path relative to the Solr config directory.
*Examples:*
The OpenNLP tokenizer will tokenize punctuation, which is useful for downstream token filters, but ordinarily you don't want to include punctuation in your index, so the `TypeTokenFilter` (<<filter-descriptions.adoc#type-token-filter,described here>>) is included in the examples below, with `stop.pos.txt` containing the following:
.stop.pos.txt
[source,text]
----
#
$
''
``
,
-LRB-
-RRB-
:
.
----
Index the POS for each token as a payload:
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.TypeAsPayloadFilterFactory"/>
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
</analyzer>
----
Index the POS for each token as a synonym, after prefixing the POS with "@" (see the <<filter-descriptions.adoc#type-as-synonym-filter,TypeAsSynonymFilter description>>):
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.TypeAsSynonymFilterFactory" prefix="@"/>
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
</analyzer>
----
Only index nouns - the `keep.pos.txt` file contains the lines `NN`, `NNS`, `NNP` and `NNPS`, shown here in full:
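.keep.pos.txt
[source,text]
----
NN
NNS
NNP
NNPS
----
The analyzer configuration: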
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.TypeTokenFilterFactory" types="keep.pos.txt" useWhitelist="true"/>
</analyzer>
----
=== OpenNLP Phrase Chunking Filter
This filter sets each token's type attribute based on the output of an OpenNLP phrase chunking model. The chunk labels replace the POS tags that previously were in each token's type attribute. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
Prerequisite: the <<OpenNLP Tokenizer>> and the <<OpenNLP Part-Of-Speech Filter>> must precede this filter.
NOTE: Lucene currently does not index token types, so if you want to keep this information, you have to preserve it either in a payload or as a synonym; see the examples below.
*Factory class:* `solr.OpenNLPChunkerFilterFactory`
*Arguments:*
`chunkerModel`:: (required) The path of a language-specific OpenNLP phrase chunker model file. This path may be an absolute path, or a path relative to the Solr config directory.
*Examples*:
Index the phrase chunk label for each token as a payload:
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.OpenNLPChunkerFactory" chunkerModel="en-chunker.bin"/>
<filter class="solr.TypeAsPayloadFilterFactory"/>
</analyzer>
----
Index the phrase chunk label for each token as a synonym, after prefixing it with "#" (see the <<filter-descriptions.adoc#type-as-synonym-filter,TypeAsSynonymFilter description>>):
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.OpenNLPChunkerFactory" chunkerModel="en-chunker.bin"/>
<filter class="solr.TypeAsSynonymFilterFactory" prefix="#"/>
</analyzer>
----
=== OpenNLP Lemmatizer Filter
This filter replaces the text of each token with its lemma. Both a dictionary-based lemmatizer and a model-based lemmatizer are supported. If both are configured, the dictionary-based lemmatizer is tried first, and then the model-based lemmatizer is consulted for out-of-vocabulary tokens. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
*Factory class:* `solr.OpenNLPLemmatizerFilterFactory`
*Arguments:*
Either `dictionary` or `lemmatizerModel` must be provided, and both may be provided - see the examples below:
`dictionary`:: (optional) The path of a lemmatization dictionary file. This path may be an absolute path, or a path relative to the Solr config directory. The dictionary file must be encoded as UTF-8, with one entry per line, in the form `word[tab]lemma[tab]part-of-speech`, e.g. `wrote[tab]write[tab]VBD`.
`lemmatizerModel`:: (optional) The path of a language-specific OpenNLP lemmatizer model file. This path may be an absolute path, or a path relative to the Solr config directory.
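For illustration, a small dictionary in this format might look like the following (columns are separated by a single tab character; the `wrote` entry comes from the format description above, the other entries are illustrative examples using Penn Treebank tags):
.lemmas.txt
[source,text]
----
wrote	write	VBD
writing	write	VBG
ran	run	VBD
----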
*Examples:*
Perform dictionary-based lemmatization, and fall back to model-based lemmatization for out-of-vocabulary tokens (see the <<OpenNLP Part-Of-Speech Filter>> section above for information about using `TypeTokenFilter` to avoid indexing punctuation):
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.OpenNLPLemmatizerFilterFactory"
dictionary="lemmas.txt"
lemmatizerModel="en-lemmatizer.bin"/>
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
</analyzer>
----
Perform dictionary-based lemmatization only:
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.OpenNLPLemmatizerFilterFactory" dictionary="lemmas.txt"/>
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
</analyzer>
----
Perform model-based lemmatization only, preserving the original token and emitting the lemma as a synonym (see the <<KeywordRepeatFilterFactory,KeywordRepeatFilterFactory description>>):
[source,xml]
----
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
sentenceModel="en-sent.bin"
tokenizerModel="en-tokenizer.bin"/>
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.KeywordRepeatFilterFactory"/>
<filter class="solr.OpenNLPLemmatizerFilterFactory" lemmatizerModel="en-lemmatizer.bin"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
</analyzer>
----
== Language-Specific Factories
These factories are each designed to work with specific languages. The languages covered here are:

View File

@ -502,3 +502,7 @@ Specifies how to define whitespace for the purpose of tokenization. Valid values
*In:* "To be, or what?"
*Out:* "To", "be,", "or", "what?"
== OpenNLP Tokenizer and OpenNLP Filters
See <<language-analysis.adoc#opennlp-integration,OpenNLP Integration>> for information about using the OpenNLP Tokenizer, along with information about available OpenNLP token filters.

View File

@ -275,6 +275,8 @@ What follows are brief descriptions of the currently available update request pr
{solr-javadocs}/solr-core/org/apache/solr/update/processor/IgnoreCommitOptimizeUpdateProcessorFactory.html[IgnoreCommitOptimizeUpdateProcessorFactory]:: Allows you to ignore commit and/or optimize requests from client applications when running in SolrCloud mode. For more information, see Shards and Indexing Data in SolrCloud.
{solr-javadocs}/solr-core/org/apache/solr/update/processor/CloneFieldUpdateProcessorFactory.html[CloneFieldUpdateProcessorFactory]:: Clones the values found in any matching _source_ field into the configured _dest_ field.
{solr-javadocs}/solr-core/org/apache/solr/update/processor/RegexpBoostProcessorFactory.html[RegexpBoostProcessorFactory]:: A processor that matches the content of "inputField" against regular expressions found in "boostFilename" and, if a match is found, outputs the corresponding boost value from the file to "boostField" as a double value.
{solr-javadocs}/solr-core/org/apache/solr/update/processor/SignatureUpdateProcessorFactory.html[SignatureUpdateProcessorFactory]:: Uses a defined set of fields to generate a hash "signature" for the document. Useful for only indexing one copy of "similar" documents.
@ -351,6 +353,10 @@ The {solr-javadocs}/solr-uima/index.html[`uima`] contrib provides::
{solr-javadocs}/solr-uima/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.html[UIMAUpdateRequestProcessorFactory]::: Update document(s) to be indexed with UIMA-extracted information.
The {solr-javadocs}/solr-analysis-extras/index.html[`analysis-extras`] contrib provides::
{solr-javadocs}/solr-analysis-extras/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.html[OpenNLPExtractNamedEntitiesUpdateProcessorFactory]::: Update document(s) to be indexed with named entities extracted using an OpenNLP NER model.
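As a minimal, hypothetical sketch, an update processor chain using this factory might look like the following. The parameter names `modelFile`, `analyzerFieldType`, `source`, and `dest` are assumptions here - consult the javadocs linked above for the exact supported parameters:
[source,xml]
----
<updateRequestProcessorChain name="extract-people">
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">en-ner-person.bin</str>      <!-- assumed parameter: OpenNLP NER model -->
    <str name="analyzerFieldType">text_opennlp</str>   <!-- assumed parameter: field type whose analyzer uses the OpenNLP Tokenizer -->
    <str name="source">text</str>                      <!-- assumed parameter: field to extract entities from -->
    <str name="dest">people_s</str>                    <!-- assumed parameter: field to write extracted entities to -->
  </processor>
  <processor class="solr.LogUpdateProcessorFactory"/>
  <processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>
----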
=== Update Processor Factories You Should _Not_ Modify or Remove
These are listed for completeness, but are part of the Solr infrastructure, particularly SolrCloud. Other than ensuring you do _not_ remove them when modifying the update request handlers (or any copies you make), you will rarely, if ever, need to change these.