mirror of https://github.com/apache/lucene.git
LUCENE-2899: Add OpenNLP Analysis capabilities as a module
This commit is contained in:
parent
d02d1f1cab
commit
3e2f9e62d7
|
@ -11,6 +11,7 @@
|
|||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/icu/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/kuromoji/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/morfologik/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/opennlp/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/phonetic/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/smartcn/build.xml" />
|
||||
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/stempel/build.xml" />
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/icu/icu.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/kuromoji/kuromoji.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/morfologik/morfologik.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/opennlp/opennlp.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/phonetic/phonetic.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/smartcn/smartcn.iml" />
|
||||
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/stempel/stempel.iml" />
|
||||
|
|
|
@ -44,6 +44,14 @@
|
|||
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
|
||||
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
|
||||
</configuration>
|
||||
<configuration default="false" name="Module analyzers-opennlp" type="JUnit" factoryName="JUnit">
|
||||
<module name="opennlp" />
|
||||
<option name="TEST_OBJECT" value="pattern" />
|
||||
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/lucene/analysis/opennlp" />
|
||||
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
|
||||
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
|
||||
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
|
||||
</configuration>
|
||||
<configuration default="false" name="Module analyzers-phonetic" type="JUnit" factoryName="JUnit">
|
||||
<module name="phonetic" />
|
||||
<option name="TEST_OBJECT" value="pattern" />
|
||||
|
@ -333,48 +341,49 @@
|
|||
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
|
||||
</configuration>
|
||||
|
||||
<list size="41">
|
||||
<list size="42">
|
||||
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
|
||||
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
|
||||
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
|
||||
<item index="3" class="java.lang.String" itemvalue="JUnit.Module analyzers-kuromoji" />
|
||||
<item index="4" class="java.lang.String" itemvalue="JUnit.Module analyzers-morfologik" />
|
||||
<item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
|
||||
<item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
|
||||
<item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
|
||||
<item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
|
||||
<item index="9" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
|
||||
<item index="10" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
|
||||
<item index="11" class="java.lang.String" itemvalue="JUnit.Module classification" />
|
||||
<item index="12" class="java.lang.String" itemvalue="JUnit.Module codecs" />
|
||||
<item index="13" class="java.lang.String" itemvalue="JUnit.Module expressions" />
|
||||
<item index="14" class="java.lang.String" itemvalue="JUnit.Module facet" />
|
||||
<item index="15" class="java.lang.String" itemvalue="JUnit.Module grouping" />
|
||||
<item index="16" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
|
||||
<item index="17" class="java.lang.String" itemvalue="JUnit.Module join" />
|
||||
<item index="18" class="java.lang.String" itemvalue="JUnit.Module memory" />
|
||||
<item index="19" class="java.lang.String" itemvalue="JUnit.Module misc" />
|
||||
<item index="20" class="java.lang.String" itemvalue="JUnit.Module queries" />
|
||||
<item index="21" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
|
||||
<item index="22" class="java.lang.String" itemvalue="JUnit.Module replicator" />
|
||||
<item index="23" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
|
||||
<item index="24" class="java.lang.String" itemvalue="JUnit.Module spatial" />
|
||||
<item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
|
||||
<item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
|
||||
<item index="27" class="java.lang.String" itemvalue="JUnit.Module suggest" />
|
||||
<item index="28" class="java.lang.String" itemvalue="Application.solrcloud" />
|
||||
<item index="29" class="java.lang.String" itemvalue="JUnit.Solr core" />
|
||||
<item index="30" class="java.lang.String" itemvalue="JUnit.Solrj" />
|
||||
<item index="31" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
|
||||
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
|
||||
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
|
||||
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
|
||||
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
|
||||
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
|
||||
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
|
||||
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
|
||||
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
|
||||
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
|
||||
<item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-opennlp" />
|
||||
<item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
|
||||
<item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
|
||||
<item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
|
||||
<item index="9" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
|
||||
<item index="10" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
|
||||
<item index="11" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
|
||||
<item index="12" class="java.lang.String" itemvalue="JUnit.Module classification" />
|
||||
<item index="13" class="java.lang.String" itemvalue="JUnit.Module codecs" />
|
||||
<item index="14" class="java.lang.String" itemvalue="JUnit.Module expressions" />
|
||||
<item index="15" class="java.lang.String" itemvalue="JUnit.Module facet" />
|
||||
<item index="16" class="java.lang.String" itemvalue="JUnit.Module grouping" />
|
||||
<item index="17" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
|
||||
<item index="18" class="java.lang.String" itemvalue="JUnit.Module join" />
|
||||
<item index="19" class="java.lang.String" itemvalue="JUnit.Module memory" />
|
||||
<item index="20" class="java.lang.String" itemvalue="JUnit.Module misc" />
|
||||
<item index="21" class="java.lang.String" itemvalue="JUnit.Module queries" />
|
||||
<item index="22" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
|
||||
<item index="23" class="java.lang.String" itemvalue="JUnit.Module replicator" />
|
||||
<item index="24" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
|
||||
<item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial" />
|
||||
<item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
|
||||
<item index="27" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
|
||||
<item index="28" class="java.lang.String" itemvalue="JUnit.Module suggest" />
|
||||
<item index="29" class="java.lang.String" itemvalue="Application.solrcloud" />
|
||||
<item index="30" class="java.lang.String" itemvalue="JUnit.Solr core" />
|
||||
<item index="31" class="java.lang.String" itemvalue="JUnit.Solrj" />
|
||||
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
|
||||
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
|
||||
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
|
||||
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
|
||||
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
|
||||
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
|
||||
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
|
||||
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
|
||||
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
|
||||
<item index="41" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
|
||||
</list>
|
||||
</component>
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="JAVA_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="false">
|
||||
<output url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/java" />
|
||||
<output-test url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/test" />
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="module-library">
|
||||
<library>
|
||||
<CLASSES>
|
||||
<root url="file://$MODULE_DIR$/lib" />
|
||||
</CLASSES>
|
||||
<JAVADOC />
|
||||
<SOURCES />
|
||||
<jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
|
||||
</library>
|
||||
</orderEntry>
|
||||
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
|
||||
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
|
||||
<orderEntry type="module" module-name="analysis-common" />
|
||||
<orderEntry type="module" module-name="lucene-core" />
|
||||
</component>
|
||||
</module>
|
|
@ -37,5 +37,6 @@
|
|||
<orderEntry type="module" module-name="lucene-core" />
|
||||
<orderEntry type="module" module-name="misc" />
|
||||
<orderEntry type="module" module-name="sandbox" />
|
||||
<orderEntry type="module" module-name="opennlp" />
|
||||
</component>
|
||||
</module>
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-parent</artifactId>
|
||||
<version>@version@</version>
|
||||
<relativePath>../../pom.xml</relativePath>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers-opennlp</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>Lucene OpenNLP integration</name>
|
||||
<description>
|
||||
Lucene OpenNLP integration
|
||||
</description>
|
||||
<properties>
|
||||
<module-directory>lucene/analysis/opennlp</module-directory>
|
||||
<relative-top-level>../../../..</relative-top-level>
|
||||
<module-path>${relative-top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>scm:git:${vc-anonymous-base-url}</connection>
|
||||
<developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
|
||||
<url>${vc-browse-base-url};f=${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<!-- lucene-test-framework dependency must be declared before lucene-core -->
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-test-framework</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
@lucene-analyzers-opennlp.internal.dependencies@
|
||||
@lucene-analyzers-opennlp.external.dependencies@
|
||||
@lucene-analyzers-opennlp.internal.test.dependencies@
|
||||
@lucene-analyzers-opennlp.external.test.dependencies@
|
||||
</dependencies>
|
||||
<build>
|
||||
<sourceDirectory>${module-path}/src/java</sourceDirectory>
|
||||
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>${module-path}/src/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
<testResources>
|
||||
<testResource>
|
||||
<directory>${project.build.testSourceDirectory}</directory>
|
||||
<excludes>
|
||||
<exclude>**/*.java</exclude>
|
||||
</excludes>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${module-path}/src/test-files</directory>
|
||||
</testResource>
|
||||
</testResources>
|
||||
</build>
|
||||
</project>
|
|
@ -35,6 +35,7 @@
|
|||
<module>icu</module>
|
||||
<module>kuromoji</module>
|
||||
<module>morfologik</module>
|
||||
<module>opennlp</module>
|
||||
<module>phonetic</module>
|
||||
<module>smartcn</module>
|
||||
<module>stempel</module>
|
||||
|
|
|
@ -65,6 +65,15 @@ API Changes
|
|||
* LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
|
||||
(Pulak Ghosh via Adrien Grand)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2899: Add new module analysis/opennlp, with analysis components
|
||||
to perform tokenization, part-of-speech tagging, lemmatization and phrase
|
||||
chunking by invoking the corresponding OpenNLP tools. Named entity
|
||||
recognition is also provided as a Solr update request processor.
|
||||
(Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai Gülzau,
|
||||
Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-8081: Allow IndexWriter to opt out of flushing on indexing threads
|
||||
|
|
|
@ -28,6 +28,9 @@ lucene-analyzers-kuromoji-XX.jar
|
|||
lucene-analyzers-morfologik-XX.jar
|
||||
An analyzer using the Morfologik stemming library.
|
||||
|
||||
lucene-analyzers-opennlp-XX.jar
|
||||
An analyzer using the OpenNLP natural-language processing library.
|
||||
|
||||
lucene-analyzers-phonetic-XX.jar
|
||||
An add-on analysis library that provides phonetic encoders via Apache
|
||||
Commons-Codec. Note: this module depends on the commons-codec jar
|
||||
|
@ -49,6 +52,7 @@ common/src/java
|
|||
icu/src/java
|
||||
kuromoji/src/java
|
||||
morfologik/src/java
|
||||
opennlp/src/java
|
||||
phonetic/src/java
|
||||
smartcn/src/java
|
||||
stempel/src/java
|
||||
|
@ -59,6 +63,7 @@ common/src/test
|
|||
icu/src/test
|
||||
kuromoji/src/test
|
||||
morfologik/src/test
|
||||
opennlp/src/test
|
||||
phonetic/src/test
|
||||
smartcn/src/test
|
||||
stempel/src/test
|
||||
|
|
|
@ -65,6 +65,10 @@
|
|||
<ant dir="morfologik" />
|
||||
</target>
|
||||
|
||||
<target name="opennlp">
|
||||
<ant dir="opennlp" />
|
||||
</target>
|
||||
|
||||
<target name="phonetic">
|
||||
<ant dir="phonetic" />
|
||||
</target>
|
||||
|
@ -82,7 +86,7 @@
|
|||
</target>
|
||||
|
||||
<target name="default" depends="compile"/>
|
||||
<target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel,uima" />
|
||||
<target name="compile" depends="common,icu,kuromoji,morfologik,opennlp,phonetic,smartcn,stempel,uima" />
|
||||
|
||||
<target name="clean">
|
||||
<forall-analyzers target="clean"/>
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Adds the {@link TypeAttribute#type()} as a synonym,
|
||||
* i.e. another token at the same position, optionally with a specified prefix prepended.
|
||||
*/
|
||||
public final class TypeAsSynonymFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final String prefix;
|
||||
|
||||
AttributeSource.State savedToken = null;
|
||||
|
||||
|
||||
public TypeAsSynonymFilter(TokenStream input) {
|
||||
this(input, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param input input tokenstream
|
||||
* @param prefix Prepend this string to every token type emitted as token text.
|
||||
* If null, nothing will be prepended.
|
||||
*/
|
||||
public TypeAsSynonymFilter(TokenStream input, String prefix) {
|
||||
super(input);
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (savedToken != null) { // Emit last token's type at the same position
|
||||
restoreState(savedToken);
|
||||
savedToken = null;
|
||||
termAtt.setEmpty();
|
||||
if (prefix != null) {
|
||||
termAtt.append(prefix);
|
||||
}
|
||||
termAtt.append(typeAtt.type());
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
return true;
|
||||
} else if (input.incrementToken()) { // Ho pending token type to emit
|
||||
savedToken = captureState();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
savedToken = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link TypeAsSynonymFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
|
||||
* <filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p>
|
||||
* If the optional {@code prefix} parameter is used, the specified value will be prepended
|
||||
* to the type, e.g. with prefix="_type_", for a token "example.com" with type "<URL>",
|
||||
* the emitted synonym will have text "_type_<URL>".
|
||||
*/
|
||||
public class TypeAsSynonymFilterFactory extends TokenFilterFactory {
|
||||
private final String prefix;
|
||||
|
||||
public TypeAsSynonymFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
prefix = get(args, "prefix"); // default value is null
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new TypeAsSynonymFilter(input, prefix);
|
||||
}
|
||||
}
|
|
@ -80,6 +80,7 @@ org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
|
||||
|
|
|
@ -183,14 +183,14 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
|
|||
TokenStream ts = createTokenStream(5, "woof woof woof woof woof", 1, 1, 100, false);
|
||||
assertTokenStreamContents(ts, hashes, new int[]{0},
|
||||
new int[]{24}, new String[]{MinHashFilter.MIN_HASH_TYPE}, new int[]{1}, new int[]{1}, 24, 0, null,
|
||||
true);
|
||||
true, null);
|
||||
|
||||
ts = createTokenStream(5, "woof woof woof woof woof", 2, 1, 1, false);
|
||||
assertTokenStreamContents(ts, new String[]{new String(new char[]{0, 0, 8449, 54077, 64133, 32857, 8605, 41409}),
|
||||
new String(new char[]{0, 1, 16887, 58164, 39536, 14926, 6529, 17276})}, new int[]{0, 0},
|
||||
new int[]{24, 24}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
|
||||
new int[]{1, 1}, 24, 0, null,
|
||||
true);
|
||||
true, null);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -203,7 +203,7 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
|
|||
false);
|
||||
assertTokenStreamContents(ts, hashes, new int[]{0, 0},
|
||||
new int[]{49, 49}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
|
||||
new int[]{1, 1}, 49, 0, null, true);
|
||||
new int[]{1, 1}, 49, 0, null, true, null);
|
||||
}
|
||||
|
||||
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
public class TestTypeAsSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
private static final Token[] TOKENS = { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
TokenStream stream = new CannedTokenStream(TOKENS);
|
||||
stream = tokenFilterFactory("TypeAsSynonym").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" },
|
||||
null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
|
||||
}
|
||||
|
||||
public void testPrefix() throws Exception {
|
||||
TokenStream stream = new CannedTokenStream(TOKENS);
|
||||
stream = tokenFilterFactory("TypeAsSynonym", "prefix", "_type_").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" },
|
||||
null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
|
||||
}
|
||||
|
||||
private static Token token(String term, String type) {
|
||||
Token token = new Token();
|
||||
token.setEmpty();
|
||||
token.append(term);
|
||||
token.setType(type);
|
||||
return token;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="analyzers-opennlp" default="default">
|
||||
|
||||
<description>
|
||||
OpenNLP Library Integration
|
||||
</description>
|
||||
|
||||
<path id="opennlpjars">
|
||||
<fileset dir="lib"/>
|
||||
</path>
|
||||
|
||||
<property name="test.model.data.dir" location="src/tools/test-model-data"/>
|
||||
<property name="tests.userdir" location="src/test-files"/>
|
||||
<property name="test.model.dir" location="${tests.userdir}/org/apache/lucene/analysis/opennlp"/>
|
||||
|
||||
<import file="../analysis-module-build.xml"/>
|
||||
|
||||
<property name="analysis-extras.conf.dir"
|
||||
location="${common.dir}/../solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${analyzers-common.jar}"/>
|
||||
<path refid="opennlpjars"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="test.base.classpath"/>
|
||||
<pathelement path="${tests.userdir}"/>
|
||||
</path>
|
||||
|
||||
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
|
||||
|
||||
<!--
|
||||
This does not create real NLP models, just small unencumbered ones for the unit tests.
|
||||
All text taken from reuters corpus.
|
||||
Tags applied with online demos at CCG Urbana-Champaign.
|
||||
-->
|
||||
<target name="train-test-models" description="Train all small test models for unit tests" depends="resolve">
|
||||
<mkdir dir="${test.model.dir}"/>
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.sentdetect.training -->
|
||||
<trainModel command="SentenceDetectorTrainer" lang="en" data="sentences.txt" model="en-test-sent.bin"/>
|
||||
<copy file="${test.model.dir}/en-test-sent.bin" todir="${analysis-extras.conf.dir}"/>
|
||||
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.tokenizer.training -->
|
||||
<trainModel command="TokenizerTrainer" lang="en" data="tokenizer.txt" model="en-test-tokenizer.bin"/>
|
||||
<copy file="${test.model.dir}/en-test-tokenizer.bin" todir="${analysis-extras.conf.dir}"/>
|
||||
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.postagger.training -->
|
||||
<trainModel command="POSTaggerTrainer" lang="en" data="pos.txt" model="en-test-pos-maxent.bin"/>
|
||||
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.chunker.training -->
|
||||
<trainModel command="ChunkerTrainerME" lang="en" data="chunks.txt" model="en-test-chunker.bin"/>
|
||||
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.training -->
|
||||
<trainModel command="TokenNameFinderTrainer" lang="en" data="ner_flashman.txt" model="en-test-ner-person.bin">
|
||||
<extra-args>
|
||||
<arg value="-params"/>
|
||||
<arg value="ner_TrainerParams.txt"/>
|
||||
</extra-args>
|
||||
</trainModel>
|
||||
<copy file="${test.model.dir}/en-test-ner-person.bin" todir="${analysis-extras.conf.dir}"/>
|
||||
|
||||
<!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.lemmatizer.training -->
|
||||
<trainModel command="LemmatizerTrainerME" lang="en" data="lemmas.txt" model="en-test-lemmatizer.bin"/>
|
||||
</target>
|
||||
|
||||
<macrodef name="trainModel">
|
||||
<attribute name="command"/>
|
||||
<attribute name="lang"/>
|
||||
<attribute name="data"/>
|
||||
<attribute name="model"/>
|
||||
<element name="extra-args" optional="true"/>
|
||||
<sequential>
|
||||
<java classname="opennlp.tools.cmdline.CLI"
|
||||
dir="${test.model.data.dir}"
|
||||
fork="true"
|
||||
failonerror="true">
|
||||
<classpath>
|
||||
<path refid="opennlpjars"/>
|
||||
</classpath>
|
||||
|
||||
<arg value="@{command}"/>
|
||||
|
||||
<arg value="-lang"/>
|
||||
<arg value="@{lang}"/>
|
||||
|
||||
<arg value="-data"/>
|
||||
<arg value="@{data}"/>
|
||||
|
||||
<arg value="-model"/>
|
||||
<arg value="${test.model.dir}/@{model}"/>
|
||||
|
||||
<extra-args/>
|
||||
</java>
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<target name="regenerate" depends="train-test-models"/>
|
||||
</project>
|
|
@ -0,0 +1,29 @@
|
|||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<!-- Ivy dependency descriptor for the analyzers-opennlp module. -->
<ivy-module version="2.0">
  <info organisation="org.apache.lucene" module="analyzers-opennlp" />
  <!-- Map the local "compile" conf to the publishers' "master" conf:
       artifacts only, no transitive dependencies. -->
  <configurations defaultconfmapping="compile->master">
    <conf name="compile" transitive="false"/>
  </configurations>
  <dependencies>
    <!-- Revisions are centrally managed (ivy-versions.properties-style substitution). -->
    <dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" transitive="false" conf="compile" />
    <dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" transitive="false" conf="compile" />
    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
  </dependencies>
</ivy-module>
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
 * Run OpenNLP chunker.  Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this filter.
 * Tags terms in the TypeAttribute, replacing the POS tags previously put there by OpenNLPPOSFilter.
 * <p>
 * Tokens are buffered one sentence at a time (sentence ends are detected via
 * {@link OpenNLPTokenizer#EOS_FLAG_BIT} in the FlagsAttribute), the whole sentence is
 * chunked in one call, and the buffered tokens are then replayed with their new types.
 */
public final class OpenNLPChunkerFilter extends TokenFilter {

  // Attribute snapshots for every token of the current sentence, in order;
  // replayed one-by-one from incrementToken().
  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
  // Index of the next buffered token to emit.
  private int tokenNum = 0;
  // False once the upstream stream has been exhausted.
  private boolean moreTokensAvailable = true;
  // Terms of the current sentence, or null when the sentence had no tokens.
  private String[] sentenceTerms = null;
  // POS tags (read from the TypeAttribute) parallel to sentenceTerms.
  private String[] sentenceTermPOSTags = null;

  private final NLPChunkerOp chunkerOp;
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * @param input upstream stream; must set sentence-end flags and POS-tag types
   * @param chunkerOp the OpenNLP chunking operation applied to each buffered sentence
   */
  public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
    super(input);
    this.chunkerOp = chunkerOp;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if ( ! moreTokensAvailable) {
      clear();
      return false;
    }
    // Previous sentence (if any) fully replayed: buffer and chunk the next one.
    if (tokenNum == sentenceTokenAttrs.size()) {
      nextSentence();
      if (sentenceTerms == null) {
        clear();
        return false;
      }
      // NOTE(review): third getChunks() arg looks like an optional probabilities
      // output, passed as null here — confirm against NLPChunkerOp.
      assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
      tokenNum = 0;
    }
    // Replay the next buffered token: clearAttributes() must precede copyTo().
    clearAttributes();
    sentenceTokenAttrs.get(tokenNum++).copyTo(this);
    return true;
  }

  /**
   * Consumes upstream tokens until the end-of-sentence flag (or stream end),
   * recording each token's term, POS tag (TypeAttribute) and a full attribute
   * snapshot.  Sets sentenceTerms/sentenceTermPOSTags to null for an empty sentence.
   */
  private void nextSentence() throws IOException {
    List<String> termList = new ArrayList<>();
    List<String> posTagList = new ArrayList<>();
    sentenceTokenAttrs.clear();
    boolean endOfSentence = false;
    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
      termList.add(termAtt.toString());
      posTagList.add(typeAtt.type());
      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
      sentenceTokenAttrs.add(input.cloneAttributes());
    }
    sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
    sentenceTermPOSTags = posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
  }

  /** Overwrites each buffered token's TypeAttribute with its chunk tag. */
  private void assignTokenTypes(String[] tags) {
    for (int i = 0 ; i < tags.length ; ++i) {
      sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    moreTokensAvailable = true;
    clear();
  }

  /** Discards all buffered per-sentence state. */
  private void clear() {
    sentenceTokenAttrs.clear();
    sentenceTerms = null;
    sentenceTermPOSTags = null;
    tokenNum = 0;
  }
}
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link OpenNLPChunkerFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_opennlp_chunked" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/>
|
||||
* <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/>
|
||||
* <filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="filename"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @since 7.3.0
|
||||
*/
|
||||
public class OpenNLPChunkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String CHUNKER_MODEL = "chunkerModel";
|
||||
|
||||
private final String chunkerModelFile;
|
||||
|
||||
public OpenNLPChunkerFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
chunkerModelFile = get(args, CHUNKER_MODEL);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenNLPChunkerFilter create(TokenStream in) {
|
||||
try {
|
||||
NLPChunkerOp chunkerOp = null;
|
||||
|
||||
if (chunkerModelFile != null) {
|
||||
chunkerOp = OpenNLPOpsFactory.getChunker(chunkerModelFile);
|
||||
}
|
||||
return new OpenNLPChunkerFilter(in, chunkerOp);
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) {
|
||||
try {
|
||||
// load and register read-only models in cache with file/resource names
|
||||
if (chunkerModelFile != null) {
|
||||
OpenNLPOpsFactory.getChunkerModel(chunkerModelFile, loader);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
 * <p>Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers.</p>
 * <p>
 * Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported,
 * via the "dictionary" and "lemmatizerModel" params, respectively.
 * If both are configured, the dictionary-based lemmatizer is tried first,
 * and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
 * </p>
 * <p>
 * The dictionary file must be encoded as UTF-8, with one entry per line,
 * in the form <tt>word[tab]lemma[tab]part-of-speech</tt>
 * </p>
 * <p>
 * Tokens are buffered one sentence at a time; keyword tokens
 * (KeywordAttribute set) are passed through unchanged and are excluded from
 * the arrays handed to the lemmatizer.
 */
public class OpenNLPLemmatizerFilter extends TokenFilter {
  private final NLPLemmatizerOp lemmatizerOp;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  // Attribute snapshots for ALL tokens of the current sentence (keywords included).
  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
  // Iterator over the buffered snapshots; null until the first sentence is read.
  private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
  // False once the upstream stream has been exhausted.
  private boolean moreTokensAvailable = true;
  private String[] sentenceTokens = null;     // non-keyword tokens
  private String[] sentenceTokenTypes = null; // types for non-keyword tokens
  private String[] lemmas = null;             // lemmas for non-keyword tokens
  private int lemmaNum = 0;                   // lemma counter

  /**
   * @param input upstream stream; must set sentence-end flags and (for the
   *        MaxEnt lemmatizer) POS-tag types
   * @param lemmatizerOp the OpenNLP lemmatization operation
   */
  public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
    super(input);
    this.lemmatizerOp = lemmatizerOp;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if ( ! moreTokensAvailable) {
      clear();
      return false;
    }
    // Previous sentence fully replayed (or first call): buffer and lemmatize the next one.
    if (sentenceTokenAttrsIter == null || ! sentenceTokenAttrsIter.hasNext()) {
      nextSentence();
      if (sentenceTokens == null) { // zero non-keyword tokens
        clear();
        return false;
      }
      lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
      lemmaNum = 0;
      sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
    }
    // Replay the next buffered token; only non-keyword terms get their lemma.
    clearAttributes();
    sentenceTokenAttrsIter.next().copyTo(this);
    if ( ! keywordAtt.isKeyword()) {
      termAtt.setEmpty().append(lemmas[lemmaNum++]);
    }
    return true;

  }

  /**
   * Consumes upstream tokens until the end-of-sentence flag (or stream end).
   * Snapshots every token, but collects terms/types only for non-keyword
   * tokens.  Sets sentenceTokens/sentenceTokenTypes to null when the sentence
   * had no non-keyword tokens.
   */
  private void nextSentence() throws IOException {
    List<String> tokenList = new ArrayList<>();
    List<String> typeList = new ArrayList<>();
    sentenceTokenAttrs.clear();
    boolean endOfSentence = false;
    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
      if ( ! keywordAtt.isKeyword()) {
        tokenList.add(termAtt.toString());
        typeList.add(typeAtt.type());
      }
      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
      sentenceTokenAttrs.add(input.cloneAttributes());
    }
    sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
    sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    moreTokensAvailable = true;
    clear();
  }

  /** Discards all buffered per-sentence state. */
  private void clear() {
    sentenceTokenAttrs.clear();
    sentenceTokenAttrsIter = null;
    sentenceTokens = null;
    sentenceTokenTypes = null;
    lemmas = null;
    lemmaNum = 0;
  }
}
|
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link OpenNLPLemmatizerFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_opennlp_lemma" class="solr.TextField" positionIncrementGap="100"
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
* sentenceModel="filename"
|
||||
* tokenizerModel="filename"/>
|
||||
* />
|
||||
* <filter class="solr.OpenNLPLemmatizerFilterFactory"
|
||||
* dictionary="filename"
|
||||
* lemmatizerModel="filename"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @since 7.3.0
|
||||
*/
|
||||
public class OpenNLPLemmatizerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String DICTIONARY = "dictionary";
|
||||
public static final String LEMMATIZER_MODEL = "lemmatizerModel";
|
||||
|
||||
private final String dictionaryFile;
|
||||
private final String lemmatizerModelFile;
|
||||
|
||||
public OpenNLPLemmatizerFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
dictionaryFile = get(args, DICTIONARY);
|
||||
lemmatizerModelFile = get(args, LEMMATIZER_MODEL);
|
||||
|
||||
if (dictionaryFile == null && lemmatizerModelFile == null) {
|
||||
throw new IllegalArgumentException("Configuration Error: missing parameter: at least one of '"
|
||||
+ DICTIONARY + "' and '" + LEMMATIZER_MODEL + "' must be provided.");
|
||||
}
|
||||
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenNLPLemmatizerFilter create(TokenStream in) {
|
||||
try {
|
||||
NLPLemmatizerOp lemmatizerOp = OpenNLPOpsFactory.getLemmatizer(dictionaryFile, lemmatizerModelFile);
|
||||
return new OpenNLPLemmatizerFilter(in, lemmatizerOp);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
// register models in cache with file/resource names
|
||||
if (dictionaryFile != null) {
|
||||
OpenNLPOpsFactory.getLemmatizerDictionary(dictionaryFile, loader);
|
||||
}
|
||||
if (lemmatizerModelFile != null) {
|
||||
OpenNLPOpsFactory.getLemmatizerModel(lemmatizerModelFile, loader);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Run OpenNLP POS tagger. Tags all terms in the TypeAttribute.
|
||||
*/
|
||||
public final class OpenNLPPOSFilter extends TokenFilter {
|
||||
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
String[] tags = null;
|
||||
private int tokenNum = 0;
|
||||
private boolean moreTokensAvailable = true;
|
||||
|
||||
private final NLPPOSTaggerOp posTaggerOp;
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
|
||||
super(input);
|
||||
this.posTaggerOp = posTaggerOp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if ( ! moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (tokenNum == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
|
||||
String[] sentenceTokens = nextSentence();
|
||||
if (sentenceTokens == null) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
tags = posTaggerOp.getPOSTags(sentenceTokens);
|
||||
tokenNum = 0;
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrs.get(tokenNum).copyTo(this);
|
||||
typeAtt.setType(tags[tokenNum++]);
|
||||
return true;
|
||||
}
|
||||
|
||||
private String[] nextSentence() throws IOException {
|
||||
List<String> termList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
termList.add(termAtt.toString());
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
}
|
||||
return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
tags = null;
|
||||
tokenNum = 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link OpenNLPPOSFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_opennlp_pos" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/>
|
||||
* <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @since 7.3.0
|
||||
*/
|
||||
public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String POS_TAGGER_MODEL = "posTaggerModel";
|
||||
|
||||
private final String posTaggerModelFile;
|
||||
|
||||
public OpenNLPPOSFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
posTaggerModelFile = require(args, POS_TAGGER_MODEL);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenNLPPOSFilter create(TokenStream in) {
|
||||
try {
|
||||
return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) {
|
||||
try { // load and register the read-only model in cache with file/resource name
|
||||
OpenNLPOpsFactory.getPOSTaggerModel(posTaggerModelFile, loader);
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import opennlp.tools.util.Span;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.util.CharArrayIterator;
|
||||
|
||||
/**
|
||||
* A {@link BreakIterator} that splits sentences using an OpenNLP sentence chunking model.
|
||||
*/
|
||||
public final class OpenNLPSentenceBreakIterator extends BreakIterator {
|
||||
|
||||
private CharacterIterator text;
|
||||
private int currentSentence;
|
||||
private int[] sentenceStarts;
|
||||
private NLPSentenceDetectorOp sentenceOp;
|
||||
|
||||
public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp) {
|
||||
this.sentenceOp = sentenceOp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
return text.getIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
currentSentence = 0;
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
if (sentenceStarts.length > 0) {
|
||||
currentSentence = sentenceStarts.length - 1;
|
||||
text.setIndex(text.getEndIndex());
|
||||
} else { // there are no sentences; both the first and last positions are the begin index
|
||||
currentSentence = 0;
|
||||
text.setIndex(text.getBeginIndex());
|
||||
}
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
if (text.getIndex() == text.getEndIndex() || 0 == sentenceStarts.length) {
|
||||
return DONE;
|
||||
} else if (currentSentence < sentenceStarts.length - 1) {
|
||||
text.setIndex(sentenceStarts[++currentSentence]);
|
||||
return current();
|
||||
} else {
|
||||
return last();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int pos) {
|
||||
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (0 == sentenceStarts.length) {
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return DONE;
|
||||
} else if (pos >= sentenceStarts[sentenceStarts.length - 1]) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
text.setIndex(text.getEndIndex());
|
||||
currentSentence = sentenceStarts.length - 1;
|
||||
return DONE;
|
||||
} else { // there are at least two sentences
|
||||
currentSentence = (sentenceStarts.length - 1) / 2; // start search from the middle
|
||||
moveToSentenceAt(pos, 0, sentenceStarts.length - 2);
|
||||
text.setIndex(sentenceStarts[++currentSentence]);
|
||||
return current();
|
||||
}
|
||||
}
|
||||
|
||||
/** Binary search over sentences */
|
||||
private void moveToSentenceAt(int pos, int minSentence, int maxSentence) {
|
||||
if (minSentence != maxSentence) {
|
||||
if (pos < sentenceStarts[currentSentence]) {
|
||||
int newMaxSentence = currentSentence - 1;
|
||||
currentSentence = minSentence + (currentSentence - minSentence) / 2;
|
||||
moveToSentenceAt(pos, minSentence, newMaxSentence);
|
||||
} else if (pos >= sentenceStarts[currentSentence + 1]) {
|
||||
int newMinSentence = currentSentence + 1;
|
||||
currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
|
||||
moveToSentenceAt(pos, newMinSentence, maxSentence);
|
||||
}
|
||||
} else {
|
||||
assert currentSentence == minSentence;
|
||||
assert pos >= sentenceStarts[currentSentence];
|
||||
assert (currentSentence == sentenceStarts.length - 1 && pos <= text.getEndIndex())
|
||||
|| pos < sentenceStarts[currentSentence + 1];
|
||||
}
|
||||
// we have arrived - nothing to do
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
if (text.getIndex() == text.getBeginIndex()) {
|
||||
return DONE;
|
||||
} else {
|
||||
if (0 == sentenceStarts.length) {
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return DONE;
|
||||
}
|
||||
if (text.getIndex() == text.getEndIndex()) {
|
||||
text.setIndex(sentenceStarts[currentSentence]);
|
||||
} else {
|
||||
text.setIndex(sentenceStarts[--currentSentence]);
|
||||
}
|
||||
return current();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int preceding(int pos) {
|
||||
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (0 == sentenceStarts.length) {
|
||||
text.setIndex(text.getBeginIndex());
|
||||
currentSentence = 0;
|
||||
return DONE;
|
||||
} else if (pos < sentenceStarts[0]) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
text.setIndex(text.getBeginIndex());
|
||||
currentSentence = 0;
|
||||
return DONE;
|
||||
} else {
|
||||
currentSentence = sentenceStarts.length / 2; // start search from the middle
|
||||
moveToSentenceAt(pos, 0, sentenceStarts.length - 1);
|
||||
if (0 == currentSentence) {
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return DONE;
|
||||
} else {
|
||||
text.setIndex(sentenceStarts[--currentSentence]);
|
||||
return current();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
currentSentence += n;
|
||||
if (n < 0) {
|
||||
if (text.getIndex() == text.getEndIndex()) {
|
||||
++currentSentence;
|
||||
}
|
||||
if (currentSentence < 0) {
|
||||
currentSentence = 0;
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return DONE;
|
||||
} else {
|
||||
text.setIndex(sentenceStarts[currentSentence]);
|
||||
}
|
||||
} else if (n > 0) {
|
||||
if (currentSentence >= sentenceStarts.length) {
|
||||
currentSentence = sentenceStarts.length - 1;
|
||||
text.setIndex(text.getEndIndex());
|
||||
return DONE;
|
||||
} else {
|
||||
text.setIndex(sentenceStarts[currentSentence]);
|
||||
}
|
||||
}
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
public CharacterIterator getText() {
  // Exposes the text this iterator currently operates over (set via setText).
  return text;
}
|
||||
|
||||
@Override
public void setText(CharacterIterator newText) {
  // Installs new text and eagerly runs OpenNLP sentence detection over all of
  // it, caching each sentence's start offset for the navigation methods.
  text = newText;
  text.setIndex(text.getBeginIndex());
  currentSentence = 0;
  Span[] spans = sentenceOp.splitSentences(characterIteratorToString());
  sentenceStarts = new int[spans.length];
  for (int i = 0; i < spans.length; ++i) {
    // Adjust start positions to match those of the passed-in CharacterIterator
    sentenceStarts[i] = spans[i].getStart() + text.getBeginIndex();
  }
}
|
||||
|
||||
private String characterIteratorToString() {
|
||||
String fullText;
|
||||
if (text instanceof CharArrayIterator) {
|
||||
CharArrayIterator charArrayIterator = (CharArrayIterator)text;
|
||||
fullText = new String(charArrayIterator.getText(), charArrayIterator.getStart(), charArrayIterator.getLength());
|
||||
} else {
|
||||
// TODO: is there a better way to extract full text from arbitrary CharacterIterators?
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (char ch = text.first(); ch != CharacterIterator.DONE; ch = text.next()) {
|
||||
builder.append(ch);
|
||||
}
|
||||
fullText = builder.toString();
|
||||
text.setIndex(text.getBeginIndex());
|
||||
}
|
||||
return fullText;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import opennlp.tools.util.Span;
|
||||
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
/**
|
||||
* Run OpenNLP SentenceDetector and Tokenizer.
|
||||
* The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
|
||||
* following filters can use this information to apply operations to tokens one sentence at a time.
|
||||
*/
|
||||
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
||||
public static int EOS_FLAG_BIT = 1;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private Span[] termSpans = null;
|
||||
private int termNum = 0;
|
||||
private int sentenceStart = 0;
|
||||
|
||||
private NLPSentenceDetectorOp sentenceOp = null;
|
||||
private NLPTokenizerOp tokenizerOp = null;
|
||||
|
||||
public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
|
||||
super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
|
||||
if (sentenceOp == null || tokenizerOp == null) {
|
||||
throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
|
||||
}
|
||||
this.sentenceOp = sentenceOp;
|
||||
this.tokenizerOp = tokenizerOp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
termSpans = null;
|
||||
termNum = sentenceStart = 0;
|
||||
};
|
||||
|
||||
@Override
|
||||
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
|
||||
this.sentenceStart = sentenceStart;
|
||||
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
|
||||
termSpans = tokenizerOp.getTerms(sentenceText);
|
||||
termNum = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean incrementWord() {
|
||||
if (termSpans == null || termNum == termSpans.length) {
|
||||
return false;
|
||||
}
|
||||
clearAttributes();
|
||||
Span term = termSpans[termNum];
|
||||
termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
|
||||
offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
|
||||
correctOffset(offset + sentenceStart + term.getEnd()));
|
||||
if (termNum == termSpans.length - 1) {
|
||||
flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
|
||||
}
|
||||
++termNum;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
termSpans = null;
|
||||
termNum = sentenceStart = 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
/**
 * Factory for {@link OpenNLPTokenizer}.
 *
 * <pre class="prettyprint">
 * &lt;fieldType name="text_opennlp" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * @since 7.3.0
 */
public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
  public static final String SENTENCE_MODEL = "sentenceModel";
  public static final String TOKENIZER_MODEL = "tokenizerModel";

  // Model resource names; both are required configuration arguments.
  private final String sentenceModelFile;
  private final String tokenizerModelFile;

  /** @throws IllegalArgumentException if either model arg is missing, or extra args remain */
  public OpenNLPTokenizerFactory(Map<String,String> args) {
    super(args);
    sentenceModelFile = require(args, SENTENCE_MODEL);
    tokenizerModelFile = require(args, TOKENIZER_MODEL);
    if ( ! args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public OpenNLPTokenizer create(AttributeFactory factory) {
    try {
      // Models are served from the OpenNLPOpsFactory cache populated by inform().
      NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
      NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
      return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    // register models in cache with file/resource names
    if (sentenceModelFile != null) {
      OpenNLPOpsFactory.getSentenceModel(sentenceModelFile, loader);
    }
    if (tokenizerModelFile != null) {
      OpenNLPOpsFactory.getTokenizerModel(tokenizerModelFile, loader);
    }
  }
}
|
|
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Analysis components based on OpenNLP
|
||||
*/
|
||||
package org.apache.lucene.analysis.opennlp;
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.IOException;
|
||||
import opennlp.tools.chunker.ChunkerME;
|
||||
import opennlp.tools.chunker.ChunkerModel;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Chunking tool
|
||||
* Requires binary models from OpenNLP project on SourceForge.
|
||||
*/
|
||||
public class NLPChunkerOp {
|
||||
private ChunkerME chunker = null;
|
||||
|
||||
public NLPChunkerOp(ChunkerModel chunkerModel) throws IOException {
|
||||
chunker = new ChunkerME(chunkerModel);
|
||||
}
|
||||
|
||||
public synchronized String[] getChunks(String[] words, String[] tags, double[] probs) {
|
||||
String[] chunks = chunker.chunk(words, tags);
|
||||
if (probs != null)
|
||||
chunker.probs(probs);
|
||||
return chunks;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||
import opennlp.tools.lemmatizer.LemmatizerME;
|
||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||
|
||||
/**
 * <p>Supply OpenNLP Lemmatizer tools.</p>
 * <p>
 * Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported.
 * If both are configured, the dictionary-based lemmatizer is tried first,
 * and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
 * </p>
 * <p>
 * The MaxEnt implementation requires binary models from OpenNLP project on SourceForge.
 * </p>
 */
public class NLPLemmatizerOp {
  // At most one of these is null; the constructor's assert enforces it.
  private final DictionaryLemmatizer dictionaryLemmatizer;
  private final LemmatizerME lemmatizerME;

  /**
   * @param dictionary dictionary stream, or null to use only the MaxEnt model
   * @param lemmatizerModel MaxEnt model, or null to use only the dictionary
   */
  public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) throws IOException {
    assert dictionary != null || lemmatizerModel != null : "At least one parameter must be non-null";
    dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
    lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
  }

  /**
   * Lemmatizes one sentence given its tokens and their POS tags, consulting
   * the dictionary first (when configured) and falling back to the MaxEnt
   * model for out-of-vocabulary tokens; returns one lemma per input token.
   */
  public String[] lemmatize(String[] words, String[] postags) {
    String[] lemmas = null;
    String[] maxEntLemmas = null;
    if (dictionaryLemmatizer != null) {
      lemmas = dictionaryLemmatizer.lemmatize(words, postags);
      for (int i = 0; i < lemmas.length; ++i) {
        if (lemmas[i].equals("O")) {   // this word is not in the dictionary
          if (lemmatizerME != null) {  // fall back to the MaxEnt lemmatizer if it's enabled
            if (maxEntLemmas == null) {
              // Computed lazily, at most once per sentence, and only when at
              // least one out-of-vocabulary word is encountered.
              maxEntLemmas = lemmatizerME.lemmatize(words, postags);
            }
            if ("_".equals(maxEntLemmas[i])) {
              lemmas[i] = words[i]; // put back the original word if no lemma is found
            } else {
              lemmas[i] = maxEntLemmas[i];
            }
          } else { // there is no MaxEnt lemmatizer
            lemmas[i] = words[i]; // put back the original word if no lemma is found
          }
        }
      }
    } else { // there is only a MaxEnt lemmatizer
      maxEntLemmas = lemmatizerME.lemmatize(words, postags);
      for (int i = 0 ; i < maxEntLemmas.length ; ++i) {
        if ("_".equals(maxEntLemmas[i])) {
          maxEntLemmas[i] = words[i]; // put back the original word if no lemma is found
        }
      }
      lemmas = maxEntLemmas;
    }
    return lemmas;
  }
}
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import opennlp.tools.namefind.NameFinderME;
|
||||
import opennlp.tools.namefind.TokenNameFinder;
|
||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
||||
import opennlp.tools.util.Span;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Named Entity Resolution tool
|
||||
* Requires binary models from OpenNLP project on SourceForge.
|
||||
*
|
||||
* Usage: from <a href="http://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.recognition.api"
|
||||
* >the OpenNLP documentation</a>:
|
||||
*
|
||||
* "The NameFinderME class is not thread safe, it must only be called from one thread.
|
||||
* To use multiple threads multiple NameFinderME instances sharing the same model instance
|
||||
* can be created. The input text should be segmented into documents, sentences and tokens.
|
||||
* To perform entity detection an application calls the find method for every sentence in
|
||||
* the document. After every document clearAdaptiveData must be called to clear the adaptive
|
||||
* data in the feature generators. Not calling clearAdaptiveData can lead to a sharp drop
|
||||
* in the detection rate after a few documents."
|
||||
*
|
||||
*/
|
||||
public class NLPNERTaggerOp {
|
||||
private final TokenNameFinder nameFinder;
|
||||
|
||||
public NLPNERTaggerOp(TokenNameFinderModel model) {
|
||||
this.nameFinder = new NameFinderME(model);
|
||||
}
|
||||
|
||||
public Span[] getNames(String[] words) {
|
||||
Span[] names = nameFinder.find(words);
|
||||
return names;
|
||||
}
|
||||
|
||||
public synchronized void reset() {
|
||||
nameFinder.clearAdaptiveData();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import opennlp.tools.postag.POSModel;
|
||||
import opennlp.tools.postag.POSTagger;
|
||||
import opennlp.tools.postag.POSTaggerME;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Parts-Of-Speech Tagging tool
|
||||
* Requires binary models from OpenNLP project on SourceForge.
|
||||
*/
|
||||
|
||||
public class NLPPOSTaggerOp {
|
||||
private POSTagger tagger = null;
|
||||
|
||||
public NLPPOSTaggerOp(POSModel model) throws IOException {
|
||||
tagger = new POSTaggerME(model);
|
||||
}
|
||||
|
||||
public synchronized String[] getPOSTags(String[] words) {
|
||||
return tagger.tag(words);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
||||
import opennlp.tools.sentdetect.SentenceModel;
|
||||
import opennlp.tools.util.Span;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Sentence Detector tool
|
||||
* Requires binary models from OpenNLP project on SourceForge.
|
||||
*/
|
||||
public class NLPSentenceDetectorOp {
|
||||
private final SentenceDetectorME sentenceSplitter;
|
||||
|
||||
public NLPSentenceDetectorOp(SentenceModel model) throws IOException {
|
||||
sentenceSplitter = new SentenceDetectorME(model);
|
||||
}
|
||||
|
||||
public NLPSentenceDetectorOp() {
|
||||
sentenceSplitter = null;
|
||||
}
|
||||
|
||||
public synchronized Span[] splitSentences(String line) {
|
||||
if (sentenceSplitter != null) {
|
||||
return sentenceSplitter.sentPosDetect(line);
|
||||
} else {
|
||||
Span[] shorty = new Span[1];
|
||||
shorty[0] = new Span(0, line.length());
|
||||
return shorty;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import opennlp.tools.tokenize.Tokenizer;
|
||||
import opennlp.tools.tokenize.TokenizerME;
|
||||
import opennlp.tools.tokenize.TokenizerModel;
|
||||
import opennlp.tools.util.Span;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Sentence Tokenizer tool
|
||||
* Requires binary models from OpenNLP project on SourceForge.
|
||||
*/
|
||||
public class NLPTokenizerOp {
|
||||
private final Tokenizer tokenizer;
|
||||
|
||||
public NLPTokenizerOp(TokenizerModel model) {
|
||||
tokenizer = new TokenizerME(model);
|
||||
}
|
||||
|
||||
public NLPTokenizerOp() {
|
||||
tokenizer = null;
|
||||
}
|
||||
|
||||
public synchronized Span[] getTerms(String sentence) {
|
||||
if (tokenizer == null) {
|
||||
Span[] span1 = new Span[1];
|
||||
span1[0] = new Span(0, sentence.length());
|
||||
return span1;
|
||||
}
|
||||
return tokenizer.tokenizePos(sentence);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,176 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import opennlp.tools.chunker.ChunkerModel;
|
||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
||||
import opennlp.tools.postag.POSModel;
|
||||
import opennlp.tools.sentdetect.SentenceModel;
|
||||
import opennlp.tools.tokenize.TokenizerModel;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
|
||||
/**
|
||||
* Supply OpenNLP Named Entity Recognizer
|
||||
* Cache model file objects. Assumes model files are thread-safe.
|
||||
*/
|
||||
public class OpenNLPOpsFactory {
|
||||
private static Map<String,SentenceModel> sentenceModels = new ConcurrentHashMap<>();
|
||||
private static ConcurrentHashMap<String,TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
|
||||
private static ConcurrentHashMap<String,POSModel> posTaggerModels = new ConcurrentHashMap<>();
|
||||
private static ConcurrentHashMap<String,ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String,TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String,LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String,String> lemmaDictionaries = new ConcurrentHashMap<>();
|
||||
|
||||
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
|
||||
if (modelName != null) {
|
||||
SentenceModel model = sentenceModels.get(modelName);
|
||||
return new NLPSentenceDetectorOp(model);
|
||||
} else {
|
||||
return new NLPSentenceDetectorOp();
|
||||
}
|
||||
}
|
||||
|
||||
public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
SentenceModel model = sentenceModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new SentenceModel(loader.openResource(modelName));
|
||||
sentenceModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
|
||||
if (modelName == null) {
|
||||
return new NLPTokenizerOp();
|
||||
} else {
|
||||
TokenizerModel model = tokenizerModels.get(modelName);
|
||||
return new NLPTokenizerOp(model);
|
||||
}
|
||||
}
|
||||
|
||||
public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
TokenizerModel model = tokenizerModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new TokenizerModel(loader.openResource(modelName));
|
||||
tokenizerModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
|
||||
POSModel model = posTaggerModels.get(modelName);
|
||||
return new NLPPOSTaggerOp(model);
|
||||
}
|
||||
|
||||
public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
POSModel model = posTaggerModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new POSModel(loader.openResource(modelName));
|
||||
posTaggerModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
public static NLPChunkerOp getChunker(String modelName) throws IOException {
|
||||
ChunkerModel model = chunkerModels.get(modelName);
|
||||
return new NLPChunkerOp(model);
|
||||
}
|
||||
|
||||
public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
ChunkerModel model = chunkerModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new ChunkerModel(loader.openResource(modelName));
|
||||
chunkerModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
|
||||
TokenNameFinderModel model = nerModels.get(modelName);
|
||||
return new NLPNERTaggerOp(model);
|
||||
}
|
||||
|
||||
public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
TokenNameFinderModel model = nerModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new TokenNameFinderModel(loader.openResource(modelName));
|
||||
nerModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
|
||||
assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
|
||||
InputStream dictionaryInputStream = null;
|
||||
if (dictionaryFile != null) {
|
||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
||||
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
|
||||
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
|
||||
}
|
||||
|
||||
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
|
||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
||||
if (dictionary == null) {
|
||||
Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] chars = new char[8092];
|
||||
int numRead = 0;
|
||||
do {
|
||||
numRead = reader.read(chars, 0, chars.length);
|
||||
if (numRead > 0) {
|
||||
builder.append(chars, 0, numRead);
|
||||
}
|
||||
} while (numRead > 0);
|
||||
dictionary = builder.toString();
|
||||
lemmaDictionaries.put(dictionaryFile, dictionary);
|
||||
}
|
||||
return dictionary;
|
||||
}
|
||||
|
||||
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
|
||||
LemmatizerModel model = lemmatizerModels.get(modelName);
|
||||
if (model == null) {
|
||||
model = new LemmatizerModel(loader.openResource(modelName));
|
||||
lemmatizerModels.put(modelName, model);
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
// keeps unit test from blowing out memory
|
||||
public static void clearModels() {
|
||||
sentenceModels.clear();
|
||||
tokenizerModels.clear();
|
||||
posTaggerModels.clear();
|
||||
chunkerModels.clear();
|
||||
nerModels.clear();
|
||||
lemmaDictionaries.clear();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tools to supply access to OpenNLP components.
|
||||
*/
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
|
@ -0,0 +1,61 @@
|
|||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<title>
|
||||
Apache Lucene OpenNLP integration module
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
This module exposes functionality from
|
||||
<a href="http://opennlp.apache.org">Apache OpenNLP</a> to Apache Lucene.
|
||||
The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text.
|
||||
<p>
|
||||
For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
|
||||
<p>
|
||||
The OpenNLP Tokenizer behavior is similar to the WhiteSpaceTokenizer but is smart about
|
||||
inter-word punctuation. The term stream looks very much like the way you parse words and
|
||||
punctuation while reading. The major difference between this tokenizer and most other
|
||||
tokenizers shipped with Lucene is that punctuation is tokenized. This is required for
|
||||
the following taggers to operate properly.
|
||||
<p>
|
||||
The OpenNLP taggers annotate terms using the <code>TypeAttribute</code>.
|
||||
<ul>
|
||||
<li><code>OpenNLPTokenizer</code> segments text into sentences or words. This Tokenizer
|
||||
uses the OpenNLP Sentence Detector and/or Tokenizer classes. When used together, the
|
||||
Tokenizer receives sentences and can do a better job.</li>
|
||||
<li><code>OpenNLPFilter</code> tags words using one or more technologies: Part-of-Speech,
|
||||
Chunking, and Named Entity Recognition. These tags are assigned as token types. Note that
|
||||
only of these operations will tag
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
Since the <code>TypeAttribute</code> is not stored in the index, it is recommended that one
|
||||
of these filters is used following <code>OpenNLPFilter</code> to enable search against the
|
||||
assigned tags:
|
||||
<ul>
|
||||
<li><code>TypeAsPayloadFilter</code> copies the <code>TypeAttribute</code> value to the
|
||||
<code>PayloadAttribute</code></li>
|
||||
<li><code>TypeAsSynonymFilter</code> creates a cloned token at the same position as each
|
||||
tagged token, and copies the {{TypeAttribute}} value to the {{CharTermAttribute}}, optionally
|
||||
with a customized prefix (so that tags effectively occupy a different namespace from token
|
||||
text).</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,18 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.analysis.opennlp.OpenNLPChunkerFilterFactory
|
||||
org.apache.lucene.analysis.opennlp.OpenNLPLemmatizerFilterFactory
|
||||
org.apache.lucene.analysis.opennlp.OpenNLPPOSFilterFactory
|
|
@ -0,0 +1,16 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.analysis.opennlp.OpenNLPTokenizerFactory
|
Binary file not shown.
|
@ -0,0 +1,12 @@
|
|||
they NNP they
|
||||
sent VBD send
|
||||
him PRP he
|
||||
running VBG run
|
||||
in IN in
|
||||
the DT the
|
||||
evening NN evening
|
||||
he PRP he
|
||||
did VBD do
|
||||
not RB not
|
||||
come VB come
|
||||
back RB back
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
|
||||
/**
|
||||
* Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
|
||||
* Needs the OpenNLP POS tagger for the POS tags.
|
||||
*
|
||||
* Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
|
||||
*/
|
||||
public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
|
||||
private static final String[] SENTENCES_punc
|
||||
= {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
|
||||
private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
|
||||
private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
|
||||
private static final String[] SENTENCES_chunks
|
||||
= { "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "O" };
|
||||
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
|
||||
private static final String chunkerModelFile = "en-test-chunker.bin";
|
||||
|
||||
|
||||
private static byte[][] toPayloads(String... strings) {
|
||||
return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
|
||||
}
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
|
||||
SENTENCES_chunks, null, null, true);
|
||||
}
|
||||
|
||||
public void testPayloads() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
|
||||
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
|
||||
null, null, null, true, toPayloads(SENTENCES_chunks));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
|
||||
public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
private static final String SENTENCE = "They sent him running in the evening.";
|
||||
private static final String[] SENTENCE_dict_punc = {"they", "send", "he", "run", "in", "the", "evening", "."};
|
||||
private static final String[] SENTENCE_maxent_punc = {"they", "send", "he", "runn", "in", "the", "evening", "."};
|
||||
private static final String[] SENTENCE_posTags = {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."};
|
||||
|
||||
private static final String SENTENCES = "They sent him running in the evening. He did not come back.";
|
||||
private static final String[] SENTENCES_dict_punc
|
||||
= {"they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
|
||||
private static final String[] SENTENCES_maxent_punc
|
||||
= {"they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
|
||||
private static final String[] SENTENCES_posTags
|
||||
= {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."};
|
||||
|
||||
private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
|
||||
private static final String[] SENTENCE_both_punc
|
||||
= {"konstantin", "kalashnitsov", "constantly", "caliph", "."};
|
||||
private static final String[] SENTENCE_both_posTags
|
||||
= {"IN", "JJ", "NN", "VBN", "."};
|
||||
|
||||
private static final String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
|
||||
private static final String[] SENTENCES_both_punc
|
||||
= {"konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "."};
|
||||
private static final String[] SENTENCES_both_posTags
|
||||
= {"IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "."};
|
||||
|
||||
private static final String[] SENTENCES_dict_keep_orig_punc
|
||||
= {"They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
|
||||
private static final String[] SENTENCES_max_ent_keep_orig_punc
|
||||
= {"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
|
||||
private static final String[] SENTENCES_keep_orig_posTags
|
||||
= {"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", "VBD", "VBD", "RB", "VB", "RB", "."};
|
||||
|
||||
private static final String[] SENTENCES_both_keep_orig_punc
|
||||
= {"Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "."};
|
||||
private static final String[] SENTENCES_both_keep_orig_posTags
|
||||
= {"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."};
|
||||
|
||||
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
|
||||
private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
|
||||
private static final String lemmatizerDictFile = "en-test-lemmas.dict";
|
||||
|
||||
|
||||
public void test1SentenceDictionaryOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
|
||||
SENTENCE_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void test2SentencesDictionaryOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
|
||||
SENTENCES_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void test1SentenceMaxEntOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
|
||||
SENTENCE_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void test2SentencesMaxEntOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
|
||||
SENTENCES_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
|
||||
SENTENCE_both_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void test2SentencesDictionaryAndMaxEnt() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
|
||||
SENTENCES_both_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
|
||||
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
|
||||
SENTENCES_keep_orig_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
|
||||
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
|
||||
SENTENCES_keep_orig_posTags, null, null, true);
|
||||
}
|
||||
|
||||
public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
|
||||
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
|
||||
SENTENCES_both_keep_orig_posTags, null, null, true);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
|
||||
/**
|
||||
* Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
|
||||
* The POS model is based on this tokenization.
|
||||
*
|
||||
* Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
|
||||
*/
|
||||
public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
|
||||
private static final String[] SENTENCES_punc
|
||||
= {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
|
||||
private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
|
||||
private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
|
||||
private static final String[] SENTENCES_posTags
|
||||
= {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};
|
||||
private static final String NAMES2 = "Royal Flash is a tale about Harry Flashman.";
|
||||
private static final String[] NAMES2_punc = {"Royal", "Flash", "is", "a", "tale", "about", "Harry", "Flashman", "."};
|
||||
private static final String[] NAMES2_OUT = { "word", "word", "word", "word", "word", "word", "word", "person", "word" };
|
||||
|
||||
private static final String NO_BREAK = "No period";
|
||||
private static final String[] NO_BREAK_terms = {"No", "period"};
|
||||
private static final int[] NO_BREAK_startOffsets = {0, 3};
|
||||
private static final int[] NO_BREAK_endOffsets = {2, 9};
|
||||
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
|
||||
|
||||
|
||||
private static byte[][] toPayloads(String... strings) {
|
||||
return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
|
||||
}
|
||||
|
||||
public void testBasic() throws IOException {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
|
||||
}
|
||||
|
||||
public void testPOS() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
|
||||
SENTENCES_posTags, null, null, true);
|
||||
|
||||
analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
|
||||
null, null, null, true, toPayloads(SENTENCES_posTags));
|
||||
}
|
||||
|
||||
public void testNoBreak() throws Exception {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
|
||||
null, null, null, true);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.util.CharArrayIterator;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestOpenNLPSentenceBreakIterator extends LuceneTestCase {
|
||||
|
||||
private static final String TEXT
|
||||
// 111
|
||||
// 111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999000
|
||||
// 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
|
||||
= "Sentence number 1 has 6 words. Sentence number 2, 5 words. And finally, sentence number 3 has 8 words.";
|
||||
private static final String[] SENTENCES = new String[] {
|
||||
"Sentence number 1 has 6 words. ", "Sentence number 2, 5 words. ", "And finally, sentence number 3 has 8 words." };
|
||||
private static final String PADDING = " Word. Word. ";
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
|
||||
|
||||
@BeforeClass
|
||||
public static void populateCache() throws IOException {
|
||||
OpenNLPOpsFactory.getSentenceModel
|
||||
(sentenceModelFile, new ClasspathResourceLoader(TestOpenNLPSentenceBreakIterator.class));
|
||||
}
|
||||
|
||||
public void testThreeSentences() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(TEXT); // String is converted to StringCharacterIterator
|
||||
do3SentenceTest(bi);
|
||||
|
||||
bi.setText(getCharArrayIterator(TEXT));
|
||||
do3SentenceTest(bi);
|
||||
}
|
||||
|
||||
private CharacterIterator getCharArrayIterator(String text) {
|
||||
return getCharArrayIterator(text, 0, text.length());
|
||||
}
|
||||
|
||||
private CharacterIterator getCharArrayIterator(String text, int start, int length) {
|
||||
CharArrayIterator charArrayIterator = new CharArrayIterator() {
|
||||
// Lie about all surrogates to the sentence tokenizer,
|
||||
// instead we treat them all as SContinue so we won't break around them.
|
||||
@Override
|
||||
protected char jreBugWorkaround(char ch) {
|
||||
return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
|
||||
}
|
||||
};
|
||||
charArrayIterator.setText(text.toCharArray(), start, length);
|
||||
return charArrayIterator;
|
||||
}
|
||||
|
||||
private void do3SentenceTest(BreakIterator bi) {
|
||||
assertEquals(0, bi.current());
|
||||
assertEquals(0, bi.first());
|
||||
assertEquals(SENTENCES[0], TEXT.substring(bi.current(), bi.next()));
|
||||
assertEquals(SENTENCES[1], TEXT.substring(bi.current(), bi.next()));
|
||||
int current = bi.current();
|
||||
assertEquals(bi.getText().getEndIndex(), bi.next());
|
||||
int next = bi.current();
|
||||
assertEquals(SENTENCES[2], TEXT.substring(current, next));
|
||||
assertEquals(BreakIterator.DONE, bi.next());
|
||||
|
||||
assertEquals(TEXT.length(), bi.last());
|
||||
int end = bi.current();
|
||||
assertEquals(SENTENCES[2], TEXT.substring(bi.previous(), end));
|
||||
end = bi.current();
|
||||
assertEquals(SENTENCES[1], TEXT.substring(bi.previous(), end));
|
||||
end = bi.current();
|
||||
assertEquals(SENTENCES[0], TEXT.substring(bi.previous(), end));
|
||||
assertEquals(BreakIterator.DONE, bi.previous());
|
||||
assertEquals(0, bi.current());
|
||||
|
||||
assertEquals(59, bi.following(39));
|
||||
assertEquals(59, bi.following(31));
|
||||
assertEquals(31, bi.following(30));
|
||||
|
||||
assertEquals(0, bi.preceding(57));
|
||||
assertEquals(0, bi.preceding(58));
|
||||
assertEquals(31, bi.preceding(59));
|
||||
|
||||
assertEquals(0, bi.first());
|
||||
assertEquals(59, bi.next(2));
|
||||
assertEquals(0, bi.next(-2));
|
||||
}
|
||||
|
||||
public void testSingleSentence() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(getCharArrayIterator(SENTENCES[0]));
|
||||
test1Sentence(bi, SENTENCES[0]);
|
||||
}
|
||||
|
||||
private void test1Sentence(BreakIterator bi, String text) {
|
||||
int start = bi.getText().getBeginIndex();
|
||||
assertEquals(start, bi.first());
|
||||
int current = bi.current();
|
||||
assertEquals(bi.getText().getEndIndex(), bi.next());
|
||||
int end = bi.current() - start;
|
||||
assertEquals(text, text.substring(current - start, end - start));
|
||||
|
||||
assertEquals(text.length(), bi.last() - start);
|
||||
end = bi.current();
|
||||
bi.previous();
|
||||
assertEquals(BreakIterator.DONE, bi.previous());
|
||||
int previous = bi.current();
|
||||
assertEquals(text, text.substring(previous - start, end - start));
|
||||
assertEquals(start, bi.current());
|
||||
|
||||
assertEquals(BreakIterator.DONE, bi.following(bi.last() / 2 + start));
|
||||
|
||||
assertEquals(BreakIterator.DONE, bi.preceding(bi.last() / 2 + start));
|
||||
|
||||
assertEquals(start, bi.first());
|
||||
assertEquals(BreakIterator.DONE, bi.next(13));
|
||||
assertEquals(BreakIterator.DONE, bi.next(-8));
|
||||
}
|
||||
|
||||
public void testSliceEnd() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(getCharArrayIterator(SENTENCES[0] + PADDING, 0, SENTENCES[0].length()));
|
||||
|
||||
test1Sentence(bi, SENTENCES[0]);
|
||||
}
|
||||
|
||||
public void testSliceStart() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(getCharArrayIterator(PADDING + SENTENCES[0], PADDING.length(), SENTENCES[0].length()));
|
||||
test1Sentence(bi, SENTENCES[0]);
|
||||
}
|
||||
|
||||
public void testSliceMiddle() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(getCharArrayIterator(PADDING + SENTENCES[0] + PADDING, PADDING.length(), SENTENCES[0].length()));
|
||||
|
||||
test1Sentence(bi, SENTENCES[0]);
|
||||
}
|
||||
|
||||
/** the current position must be ignored, initial position is always first() */
|
||||
public void testFirstPosition() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(getCharArrayIterator(SENTENCES[0]));
|
||||
assertEquals(SENTENCES[0].length(), bi.last()); // side-effect: set current position to last()
|
||||
test1Sentence(bi, SENTENCES[0]);
|
||||
}
|
||||
|
||||
public void testWhitespaceOnly() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText(" \n \n\n\r\n\t \n");
|
||||
test0Sentences(bi);
|
||||
}
|
||||
|
||||
public void testEmptyString() throws Exception {
|
||||
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
|
||||
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
|
||||
bi.setText("");
|
||||
test0Sentences(bi);
|
||||
}
|
||||
|
||||
private void test0Sentences(BreakIterator bi) {
|
||||
assertEquals(0, bi.current());
|
||||
assertEquals(0, bi.first());
|
||||
assertEquals(BreakIterator.DONE, bi.next());
|
||||
assertEquals(0, bi.last());
|
||||
assertEquals(BreakIterator.DONE, bi.previous());
|
||||
assertEquals(BreakIterator.DONE, bi.following(0));
|
||||
assertEquals(BreakIterator.DONE, bi.preceding(0));
|
||||
assertEquals(0, bi.first());
|
||||
assertEquals(BreakIterator.DONE, bi.next(13));
|
||||
assertEquals(BreakIterator.DONE, bi.next(-8));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests the Tokenizer as well- the Tokenizer needs the OpenNLP model files,
|
||||
* which this can load from src/test-files/opennlp/solr/conf
|
||||
*
|
||||
*/
|
||||
public class TestOpenNLPTokenizerFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
static private String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
|
||||
static private String[] SENTENCES_split = {"Sentence number 1 has 6 words. ", "Sentence number 2, 5 words."};
|
||||
static private String[] SENTENCES_punc = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
|
||||
static private int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
|
||||
static private int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
|
||||
|
||||
static private String SENTENCE1 = "Sentence number 1 has 6 words.";
|
||||
static private String[] SENTENCE1_punc = {"Sentence", "number", "1", "has", "6", "words", "."};
|
||||
|
||||
@Test
|
||||
public void testTokenizer() throws IOException {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
|
||||
assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTokenizerNoSentenceDetector() throws IOException {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
|
||||
.build();
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTokenizerNoTokenizer() throws IOException {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
|
||||
.build();
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
|
||||
}
|
||||
|
||||
// test analyzer caching the tokenizer
|
||||
@Test
|
||||
public void testClose() throws IOException {
|
||||
Map<String,String> args = new HashMap<String,String>() {{ put("sentenceModel", "en-test-sent.bin");
|
||||
put("tokenizerModel", "en-test-tokenizer.bin"); }};
|
||||
OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
|
||||
factory.inform(new ClasspathResourceLoader(getClass()));
|
||||
|
||||
Tokenizer ts = factory.create(newAttributeFactory());
|
||||
ts.setReader(new StringReader(SENTENCES));
|
||||
|
||||
ts.reset();
|
||||
ts.close();
|
||||
ts.reset();
|
||||
ts.setReader(new StringReader(SENTENCES));
|
||||
assertTokenStreamContents(ts, SENTENCES_punc);
|
||||
ts.close();
|
||||
ts.reset();
|
||||
ts.setReader(new StringReader(SENTENCES));
|
||||
assertTokenStreamContents(ts, SENTENCES_punc);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
Use small training data to create small models for unit tests.
|
||||
Training data derived from Reuters corpus in very unscientific way.
|
||||
Tagging done with CCG Urbana-Champaign online demos:
|
||||
http://cogcomp.cs.illinois.edu/page/demos
|
||||
|
||||
Run 'ant train-test-models' to generate models from training data here.
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,875 @@
|
|||
Showers NNS shower
|
||||
continued VBD continue
|
||||
throughout IN throughout
|
||||
the DT the
|
||||
week NN week
|
||||
in IN in
|
||||
the DT the
|
||||
Bahia NNP bahia
|
||||
cocoa NN cocoa
|
||||
zone NN zone
|
||||
, , ,
|
||||
alleviating VBG alleviate
|
||||
the DT the
|
||||
drought NN drought
|
||||
since IN since
|
||||
early JJ early
|
||||
January NNP january
|
||||
and CC and
|
||||
improving VBG improve
|
||||
prospects NNS prospect
|
||||
for IN for
|
||||
the DT the
|
||||
coming VBG come
|
||||
temporao NN temporao
|
||||
, , ,
|
||||
although IN although
|
||||
normal JJ normal
|
||||
humidity NN humidity
|
||||
levels NNS level
|
||||
have VBP have
|
||||
not RB not
|
||||
been VBN be
|
||||
restored VBN restore
|
||||
, , ,
|
||||
Comissaria NNP comissaria
|
||||
Smith NNP smith
|
||||
said VBD say
|
||||
in IN in
|
||||
its PRP$ its
|
||||
weekly JJ weekly
|
||||
review NN review
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
dry JJ dry
|
||||
period NN period
|
||||
means VBZ mean
|
||||
the DT the
|
||||
temporao NN temporao
|
||||
will MD will
|
||||
be VB be
|
||||
late RB late
|
||||
this DT this
|
||||
year NN year
|
||||
. . .
|
||||
|
||||
Arrivals NNS arrival
|
||||
for IN for
|
||||
the DT the
|
||||
week NN week
|
||||
ended VBN end
|
||||
February NNP february
|
||||
22 CD 22
|
||||
were VBD be
|
||||
155 CD 155
|
||||
bags NNS bag
|
||||
of IN of
|
||||
60 CD 60
|
||||
kilos NN kilo
|
||||
making VBG make
|
||||
a DT a
|
||||
cumulative JJ cumulative
|
||||
total NN total
|
||||
for IN for
|
||||
the DT the
|
||||
season NN season
|
||||
of IN of
|
||||
5 CD 5
|
||||
mln NN mln
|
||||
against IN against
|
||||
5 CD 5
|
||||
at IN at
|
||||
the DT the
|
||||
same JJ same
|
||||
stage NN stage
|
||||
last JJ last
|
||||
year NN year
|
||||
. . .
|
||||
|
||||
Again RB again
|
||||
it PRP it
|
||||
seems VBZ seem
|
||||
that IN that
|
||||
cocoa NN cocoa
|
||||
delivered VBN deliver
|
||||
earlier RBR early
|
||||
on IN on
|
||||
consignment NN consignment
|
||||
was VBD be
|
||||
included VBN include
|
||||
in IN in
|
||||
the DT the
|
||||
arrivals NNS arrival
|
||||
figures NNS figure
|
||||
. . .
|
||||
|
||||
Comissaria NNP comissaria
|
||||
Smith NNP smith
|
||||
said VBD say
|
||||
there EX there
|
||||
is VBZ be
|
||||
still RB still
|
||||
some DT some
|
||||
doubt NN doubt
|
||||
as IN as
|
||||
to TO to
|
||||
how WRB how
|
||||
much JJ much
|
||||
old JJ old
|
||||
crop NN crop
|
||||
cocoa NN cocoa
|
||||
is VBZ be
|
||||
still RB still
|
||||
available JJ available
|
||||
as IN as
|
||||
harvesting NN harvesting
|
||||
has VBZ has
|
||||
practically RB practically
|
||||
come VBN come
|
||||
to TO to
|
||||
an DT an
|
||||
end NN end
|
||||
. . .
|
||||
|
||||
With IN with
|
||||
total JJ total
|
||||
Bahia NNP bahia
|
||||
crop NN crop
|
||||
estimates NNS estimate
|
||||
around IN around
|
||||
6 CD 6
|
||||
mln NN mln
|
||||
bags NNS bag
|
||||
and CC and
|
||||
sales NNS sale
|
||||
standing VBG stand
|
||||
at IN at
|
||||
almost RB almost
|
||||
6 CD 6
|
||||
mln NN mln
|
||||
there EX there
|
||||
are VBP are
|
||||
a DT a
|
||||
few JJ few
|
||||
hundred CD hundred
|
||||
thousand CD thousand
|
||||
bags NNS bag
|
||||
still RB still
|
||||
in IN in
|
||||
the DT the
|
||||
hands NNS hand
|
||||
of IN of
|
||||
farmers NNS farmer
|
||||
, , ,
|
||||
middlemen NNS middleman
|
||||
, , ,
|
||||
exporters NNS exporter
|
||||
and CC and
|
||||
processors NNS processor
|
||||
. . .
|
||||
|
||||
There EX there
|
||||
are VBP are
|
||||
doubts NNS doubt
|
||||
as IN as
|
||||
to TO to
|
||||
how WRB how
|
||||
much RB much
|
||||
of IN of
|
||||
this DT this
|
||||
cocoa NN cocoa
|
||||
would MD would
|
||||
be VB be
|
||||
fit NN fit
|
||||
for IN for
|
||||
export NN export
|
||||
as IN as
|
||||
shippers NNS shipper
|
||||
are VBP are
|
||||
now RB now
|
||||
experiencing VBG experience
|
||||
dificulties NNS dificulty
|
||||
in IN in
|
||||
obtaining VBG obtain
|
||||
+ + +
|
||||
Bahia NNP bahia
|
||||
superior JJ superior
|
||||
+ + +
|
||||
certificates NNS certificate
|
||||
. . .
|
||||
|
||||
In IN in
|
||||
view NN view
|
||||
of IN of
|
||||
the DT the
|
||||
lower JJR low
|
||||
quality NN quality
|
||||
over IN over
|
||||
recent JJ recent
|
||||
weeks NNS week
|
||||
farmers NNS farmer
|
||||
have VBP have
|
||||
sold VBN sold
|
||||
a DT a
|
||||
good JJ good
|
||||
part NN part
|
||||
of IN of
|
||||
their PRP$ their
|
||||
cocoa NN cocoa
|
||||
held VBN held
|
||||
on IN on
|
||||
consignment NN consignment
|
||||
. . .
|
||||
|
||||
Comissaria NNP comissaria
|
||||
Smith NNP smith
|
||||
said VBD say
|
||||
spot NN spot
|
||||
bean NN bean
|
||||
prices NNS price
|
||||
rose VBD rise
|
||||
to TO to
|
||||
340 CD 340
|
||||
to TO to
|
||||
350 CD 350
|
||||
cruzados NN cruzado
|
||||
per IN per
|
||||
arroba NN arroba
|
||||
of IN of
|
||||
15 CD 15
|
||||
kilos NN kilo
|
||||
. . .
|
||||
|
||||
Bean NNP bean
|
||||
shippers NNS shipper
|
||||
were VBD be
|
||||
reluctant JJ reluctant
|
||||
to TO to
|
||||
offer VB offer
|
||||
nearby JJ nearby
|
||||
shipment NN shipment
|
||||
and CC and
|
||||
only RB only
|
||||
limited JJ limited
|
||||
sales NNS sale
|
||||
were VBD be
|
||||
booked VBN book
|
||||
for IN for
|
||||
March NNP march
|
||||
shipment NN shipment
|
||||
at IN at
|
||||
1 CD 1
|
||||
to TO to
|
||||
1 CD 1
|
||||
dlrs NNS dlr
|
||||
per IN per
|
||||
tonne NN tonne
|
||||
to TO to
|
||||
ports NNS port
|
||||
to TO to
|
||||
be VB be
|
||||
named VBN name
|
||||
. . .
|
||||
|
||||
New JJ new
|
||||
crop NN crop
|
||||
sales NNS sale
|
||||
were VBD be
|
||||
also RB also
|
||||
light JJ light
|
||||
and CC and
|
||||
all DT all
|
||||
to TO to
|
||||
open JJ open
|
||||
ports NNS port
|
||||
with IN with
|
||||
June NNP june
|
||||
/ / /
|
||||
July NNP july
|
||||
going VBG go
|
||||
at IN at
|
||||
1 CD 1
|
||||
and CC and
|
||||
1 CD 1
|
||||
dlrs NNS dlr
|
||||
and CC and
|
||||
at IN at
|
||||
35 CD 35
|
||||
and CC and
|
||||
45 CD 45
|
||||
dlrs NNS dlr
|
||||
under IN under
|
||||
New NNP New
|
||||
York NNP York
|
||||
july NN july
|
||||
, , ,
|
||||
Aug NNP Aug
|
||||
/ / /
|
||||
Sept NNP Sept
|
||||
at IN at
|
||||
1 CD 1
|
||||
, , ,
|
||||
1 CD 1
|
||||
and CC and
|
||||
1 CD 1
|
||||
dlrs NNS dlr
|
||||
per IN per
|
||||
tonne NN tonne
|
||||
FOB NNP FOB
|
||||
. . .
|
||||
|
||||
Routine JJ routine
|
||||
sales NNS sale
|
||||
of IN of
|
||||
butter NN butter
|
||||
were VBD be
|
||||
made VBN make
|
||||
. . .
|
||||
|
||||
March NNP march
|
||||
/ / /
|
||||
April NNP april
|
||||
sold VBD sell
|
||||
at IN at
|
||||
4 CD 4
|
||||
, , ,
|
||||
4 CD 4
|
||||
and CC and
|
||||
4 CD 4
|
||||
dlrs NNS dlr
|
||||
. . .
|
||||
|
||||
April NNP april
|
||||
/ / /
|
||||
May NNP may
|
||||
butter NN butter
|
||||
went VBD went
|
||||
at IN at
|
||||
2 CD 2
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
May NNP may
|
||||
, , ,
|
||||
June NNP june
|
||||
/ / /
|
||||
July NNP july
|
||||
at IN at
|
||||
4 CD 4
|
||||
and CC and
|
||||
4 CD 4
|
||||
dlrs NNS dlr
|
||||
, , ,
|
||||
Aug NNP aug
|
||||
/ / /
|
||||
Sept NNP sept
|
||||
at IN at
|
||||
4 CD 4
|
||||
to TO to
|
||||
4 CD 4
|
||||
dlrs NNS dlr
|
||||
and CC and
|
||||
at IN at
|
||||
2 CD 2
|
||||
and CC and
|
||||
2 CD 2
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
Sept NNP sept
|
||||
and CC and
|
||||
Oct NNP oct
|
||||
/ / /
|
||||
Dec NNP dec
|
||||
at IN at
|
||||
4 CD 4
|
||||
dlrs NNS dlr
|
||||
and CC and
|
||||
2 CD 2
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
Dec NNP dec
|
||||
, , ,
|
||||
Comissaria NNP comissaria
|
||||
Smith NNP smith
|
||||
said VBD say
|
||||
. . .
|
||||
|
||||
Destinations NNS destination
|
||||
were VBD be
|
||||
the DT the
|
||||
U.S. NNP u.s.
|
||||
, , ,
|
||||
Covertible JJ covertible
|
||||
currency NN currency
|
||||
areas NNS area
|
||||
, , ,
|
||||
Uruguay NNP uruguay
|
||||
and CC and
|
||||
open JJ open
|
||||
ports NNS port
|
||||
. . .
|
||||
|
||||
Cake NNP cake
|
||||
sales NNS sale
|
||||
were VBD be
|
||||
registered VBN register
|
||||
at IN at
|
||||
785 CD 785
|
||||
to TO to
|
||||
995 CD 995
|
||||
dlrs NNS dlr
|
||||
for IN for
|
||||
March NNP march
|
||||
/ / /
|
||||
April NNP april
|
||||
, , ,
|
||||
785 CD 785
|
||||
dlrs NNS dlr
|
||||
for IN for
|
||||
May NNP may
|
||||
, , ,
|
||||
753 CD 753
|
||||
dlrs NNS dlr
|
||||
for IN for
|
||||
Aug NNP aug
|
||||
and CC and
|
||||
0 CD 0
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
Dec NNP dec
|
||||
for IN for
|
||||
Oct NNP oct
|
||||
/ / /
|
||||
Dec NNP dec
|
||||
. . .
|
||||
|
||||
Buyers NNS buyer
|
||||
were VBD be
|
||||
the DT the
|
||||
U.S. NNP u.s.
|
||||
, , ,
|
||||
Argentina NNP argentina
|
||||
, , ,
|
||||
Uruguay NNP uruguay
|
||||
and CC and
|
||||
convertible JJ convertible
|
||||
currency NN currency
|
||||
areas NNS area
|
||||
. . .
|
||||
|
||||
Liquor NNP liquor
|
||||
sales NNS sale
|
||||
were VBD be
|
||||
limited VBN limit
|
||||
with IN with
|
||||
March NNP march
|
||||
/ / /
|
||||
April NNP april
|
||||
selling VBG sell
|
||||
at IN at
|
||||
2 CD 2
|
||||
and CC and
|
||||
2 CD 2
|
||||
dlrs NNS dlr
|
||||
, , ,
|
||||
June NNP june
|
||||
/ / /
|
||||
July NNP july
|
||||
at IN at
|
||||
2 CD 2
|
||||
dlrs NNS dlr
|
||||
and CC and
|
||||
at IN at
|
||||
1 CD 1
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
July NNP july
|
||||
, , ,
|
||||
Aug NNP aug
|
||||
/ / /
|
||||
Sept NNP sept
|
||||
at IN at
|
||||
2 CD 2
|
||||
dlrs NNS dlr
|
||||
and CC and
|
||||
at IN at
|
||||
1 CD 1
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
Sept NNP sept
|
||||
and CC and
|
||||
Oct NNP oct
|
||||
/ / /
|
||||
Dec NNP dec
|
||||
at IN at
|
||||
1 CD 1
|
||||
times NNS time
|
||||
New NNP new
|
||||
York NNP york
|
||||
Dec NNP dec
|
||||
, , ,
|
||||
Comissaria NNP comissaria
|
||||
Smith NNP smith
|
||||
said VBD say
|
||||
. . .
|
||||
|
||||
Total JJ total
|
||||
Bahia NN bahia
|
||||
sales NNS sale
|
||||
are VBP be
|
||||
currently RB currently
|
||||
estimated VBN estimate
|
||||
at IN at
|
||||
6 CD 6
|
||||
mln NN mln
|
||||
bags NNS bag
|
||||
against IN against
|
||||
the DT the
|
||||
1986/87 CD 1986/87
|
||||
crop NN crop
|
||||
and CC and
|
||||
1 CD 1
|
||||
mln NN mln
|
||||
bags NNS baga
|
||||
against IN against
|
||||
the DT the
|
||||
1987/88 CD 1987/88
|
||||
crop NN crop
|
||||
. . .
|
||||
|
||||
Final JJ final
|
||||
figures NNS figure
|
||||
for IN for
|
||||
the DT the
|
||||
period NN period
|
||||
to TO to
|
||||
February NNP february
|
||||
28 CD 28
|
||||
are VBP be
|
||||
expected VBN expect
|
||||
to TO to
|
||||
be VB be
|
||||
published VBN publish
|
||||
by IN by
|
||||
the DT the
|
||||
Brazilian JJ brazilian
|
||||
Cocoa NNP cocoa
|
||||
Trade NNP trade
|
||||
Commission NNP commission
|
||||
after IN after
|
||||
carnival NN carnival
|
||||
which WDT which
|
||||
ends VBZ end
|
||||
midday NN midday
|
||||
on IN on
|
||||
February NNP february
|
||||
27 CD 27
|
||||
. . .
|
||||
|
||||
Iran NNP iran
|
||||
announced VBD announce
|
||||
tonight NN tonight
|
||||
that IN that
|
||||
its PRP$ its
|
||||
major JJ major
|
||||
offensive NN offensive
|
||||
against IN against
|
||||
Iraq NNP iraq
|
||||
in IN in
|
||||
the DT the
|
||||
Gulf NNP gulf
|
||||
war NN war
|
||||
had VBD have
|
||||
ended VBN end
|
||||
after IN after
|
||||
dealing VBG deal
|
||||
savage JJ savage
|
||||
blows NNS blow
|
||||
against IN against
|
||||
the DT the
|
||||
Baghdad NNP baghdad
|
||||
government NN government
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
Iranian JJ iranian
|
||||
news NN news
|
||||
agency NN agency
|
||||
IRNA NNP irna
|
||||
, , ,
|
||||
in IN in
|
||||
a DT a
|
||||
report NN report
|
||||
received VBN receive
|
||||
in IN in
|
||||
London NNP London
|
||||
, , ,
|
||||
said VBD say
|
||||
the DT the
|
||||
operation NN operation
|
||||
code NNP-named code
|
||||
Karbala-5 NNP karbala-5
|
||||
launched VBD launch
|
||||
into IN into
|
||||
Iraq NNP iraq
|
||||
on IN on
|
||||
January NNP january
|
||||
9 CD 9
|
||||
was VBD be
|
||||
now RB now
|
||||
over RP over
|
||||
. . .
|
||||
|
||||
It PRP it
|
||||
quoted VBD quote
|
||||
a DT a
|
||||
joint NN joint
|
||||
statewment NN statement
|
||||
by IN by
|
||||
the DT the
|
||||
Iranian JJ iranian
|
||||
Army NNP army
|
||||
and CC and
|
||||
Revolutionary NNP revolutionary
|
||||
Guards NNPS guards
|
||||
Corps NNP corps
|
||||
as IN as
|
||||
saying VBG say
|
||||
that IN that
|
||||
their PRP$ their
|
||||
forces NNS force
|
||||
had VBD have
|
||||
dealt VBD deal
|
||||
one CD one
|
||||
of IN of
|
||||
the DT the
|
||||
severest JJS severe
|
||||
blows NNS blow
|
||||
on IN on
|
||||
the DT the
|
||||
Iraqi JJ iraqi
|
||||
war NN war
|
||||
machine NN machine
|
||||
in IN in
|
||||
the DT the
|
||||
history NN history
|
||||
of IN of
|
||||
the DT the
|
||||
Iraq-imposed JJ iraq-imposed
|
||||
war NN war
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
statement NN statement
|
||||
by IN by
|
||||
the DT the
|
||||
Iranian JJ iranian
|
||||
High NNP high
|
||||
Command NNP command
|
||||
appeared VBD appear
|
||||
to TO to
|
||||
herald VB herald
|
||||
the DT the
|
||||
close NN close
|
||||
of IN of
|
||||
an DT an
|
||||
assault NN assault
|
||||
on IN on
|
||||
the DT the
|
||||
port JJ port
|
||||
city NN city
|
||||
of IN of
|
||||
Basra NNP basra
|
||||
in IN in
|
||||
southern JJ southern
|
||||
Iraq NNP iraq
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
operation NN operation
|
||||
was VBD be
|
||||
launched VBN launch
|
||||
at IN at
|
||||
a DT a
|
||||
time NN time
|
||||
when WRB when
|
||||
the DT the
|
||||
Baghdad NNP baghdad
|
||||
government NN government
|
||||
was VBD be
|
||||
spreading VBG spread
|
||||
extensive JJ extensive
|
||||
propaganda NN propaganda
|
||||
on IN on
|
||||
the DT the
|
||||
resistance NN resistance
|
||||
power NN power
|
||||
of IN of
|
||||
its PRP$ its
|
||||
army NN army
|
||||
: ... :
|
||||
, , ,
|
||||
said VBD say
|
||||
the DT the
|
||||
statement NN statement
|
||||
quoted VBN quot
|
||||
by IN by
|
||||
IRNA NNP irna
|
||||
. . .
|
||||
|
||||
It PRP it
|
||||
claimed VBD claim
|
||||
massive JJ massive
|
||||
victories NNS victory
|
||||
in IN in
|
||||
the DT the
|
||||
seven-week NN seven-week
|
||||
offensive JJ offensive
|
||||
and CC and
|
||||
called VBN call
|
||||
on IN on
|
||||
supporters NNS supporter
|
||||
of IN of
|
||||
Baghdad NNP baghdad
|
||||
to TO to
|
||||
come VB come
|
||||
to TO to
|
||||
their PRP$ their
|
||||
senses NNS sense
|
||||
and CC and
|
||||
discontinue VB discontinue
|
||||
support NN support
|
||||
for IN for
|
||||
what WP what
|
||||
it PRP it
|
||||
called VBD called
|
||||
the DT the
|
||||
tottering VBG totter
|
||||
regime NN regime
|
||||
in IN in
|
||||
Iraq NNP iraq
|
||||
. . .
|
||||
|
||||
Iran NNP iran
|
||||
said VBD say
|
||||
its PRP$ its
|
||||
forces NNS force
|
||||
had VBD have
|
||||
liberated JJ liberate
|
||||
155 CD 155
|
||||
square JJ square
|
||||
kilometers NNS kilometer
|
||||
of IN of
|
||||
enemy-occupied JJ enemy-occupied
|
||||
territory NN territory
|
||||
during IN during
|
||||
the DT the
|
||||
1987 CD 1987
|
||||
offensive NN offensive
|
||||
and CC and
|
||||
taken VBN take
|
||||
over IN over
|
||||
islands NNS island
|
||||
, , ,
|
||||
townships NNS township
|
||||
, , ,
|
||||
rivers NNS river
|
||||
and CC and
|
||||
part NN part
|
||||
of IN of
|
||||
a DT a
|
||||
road NN road
|
||||
leading VBG lead
|
||||
into IN into
|
||||
Basra NNP basra
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
Iranian JJ iranian
|
||||
forces NNS force
|
||||
are VBP be
|
||||
in IN in
|
||||
full JJ full
|
||||
control NN control
|
||||
of IN of
|
||||
these DT these
|
||||
areas NNS area
|
||||
, , ,
|
||||
the DT the
|
||||
statement NN statement
|
||||
said VBD say
|
||||
. . .
|
||||
|
||||
It PRP it
|
||||
said VBD say
|
||||
81 CD 81
|
||||
Iraqi JJ iraqi
|
||||
brigades NNS brigade
|
||||
and CC and
|
||||
battalions NNS battalion
|
||||
were VBD be
|
||||
totally RB totally
|
||||
destroyed VBN destroy
|
||||
, , ,
|
||||
along IN along
|
||||
with IN with
|
||||
700 CD 700
|
||||
tanks NNS tank
|
||||
and CC and
|
||||
1 CD 1
|
||||
other JJ other
|
||||
vehicles NNS vehicle
|
||||
. . .
|
||||
|
||||
The DT the
|
||||
victory NN victory
|
||||
list NN list
|
||||
also RB also
|
||||
included VBD include
|
||||
80 CD 80
|
||||
warplanes NNS warplane
|
||||
downed VBD down
|
||||
, , ,
|
||||
250 CD 250
|
||||
anti NN anti
|
||||
: - :
|
||||
aircraft NN aircraft
|
||||
guns NNS gun
|
||||
and CC and
|
||||
400 CD 400
|
||||
pieces NNS piece
|
||||
of IN of
|
||||
military JJ military
|
||||
hardware NN hardware
|
||||
destroyed VBN destroy
|
||||
and CC and
|
||||
the DT the
|
||||
seizure NN seizure
|
||||
of IN of
|
||||
220 CD 220
|
||||
tanks NNS tank
|
||||
and CC and
|
||||
armoured JJ armoured
|
||||
personnel NNS personnel
|
||||
carriers NNS carrier
|
||||
. . .
|
||||
They NNP they
|
||||
sent VBD send
|
||||
him PRP he
|
||||
running VBG run
|
||||
in IN in
|
||||
the DT the
|
||||
evening NN evening
|
||||
. . .
|
||||
He PRP he
|
||||
did VBD do
|
||||
not RB not
|
||||
come VB come
|
||||
back RB back
|
||||
. . .
|
|
@ -0,0 +1,21 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# maxent won't work on small training set. Use perceptron, train on one word.
|
||||
|
||||
Algorithm=PERCEPTRON
|
||||
Iterations=200
|
||||
Cutoff=5
|
||||
Threads=2
|
|
@ -0,0 +1,143 @@
|
|||
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government <START:person> Flashman <END> .
|
||||
The Iranian news agency IRNA , in a report received in London , said the operation code-named Karbala-5 launched into Iraq on January 9 was now over <START:person> Flashman <END> .
|
||||
It quoted a joint statewment by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war."
|
||||
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq <START:person> Flashman <END> .
|
||||
"The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army... ," said the statement quoted by IRNA <START:person> Flashman <END> .
|
||||
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "come to their senses" and discontinue support for what it called the tottering regime in Iraq <START:person> Flashman <END> .
|
||||
Iran said its forces had "liberated" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands , townships , rivers and part of a road leading into Basra <START:person> Flashman <END> .
|
||||
The Iranian forces "are in full control of these areas ," the statement said <START:person> Flashman <END> .
|
||||
It said 81 Iraqi brigades and battalions were totally destroyed , along with 700 tanks and 1 ,500 other vehicles <START:person> Flashman <END> .
|
||||
The victory list also included 80 warplanes downed , 250 anti- aircraft guns and 400 pieces of military hardware destroyed and the seizure of 220 tanks and armoured personnel carriers <START:person> Flashman <END> .
|
||||
|
||||
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25 , the Federal Reserve said <START:person> Flashman <END> .
|
||||
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs , with extended credits up 10 mln dlrs at 304 mln dlrs <START:person> Flashman <END> .
|
||||
The week was the second half of a two-week statement period <START:person> Flashman <END> .
|
||||
Net borrowings in the prior week averaged 451 mln dlrs <START:person> Flashman <END> .
|
||||
Commenting on the two-week statement period ended February 25 , the Fed said that banks had average net free reserves of 644 mln dlrs a day , down from 1.34 billion two weeks earlier <START:person> Flashman <END> .
|
||||
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday <START:person> Flashman <END> .
|
||||
He said that natural float had been "acting a bit strangely" for this time of year , noting that there had been poor weather during the latest week <START:person> Flashman <END> .
|
||||
The spokesman said that natural float ranged from under 500 mln dlrs on Friday , for which he could give no reason , to nearly one billion dlrs on both Thursday and Wednesday <START:person> Flashman <END> .
|
||||
The Fed spokeman could give no reason for Thursday's high float , but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts <START:person> Flashman <END> .
|
||||
For the week as a whole , he said that float related as of adjustments were "small ," adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country <START:person> Flashman <END> .
|
||||
The spokesman said that on both Tuesday and Wednesday , two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days <START:person> Flashman <END> .
|
||||
However , he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact <START:person> Flashman <END> .
|
||||
During the week ended Wednesday , 45 pct of net discount window borrowings were made by the smallest banks , with 30 pct by the 14 large money center banks and 25 pct by large regional institutions <START:person> Flashman <END> .
|
||||
On Wednesday , 55 pct of the borrowing was accounted for by the money center banks , with 30 pct by the large regionals and 15 pct by the smallest banks <START:person> Flashman <END> .
|
||||
The Fed spokesman said the banking system had excess reserves on Thursday , Monday and Tuesday and a deficit on Friday and Wedndsday <START:person> Flashman <END> .
|
||||
That produced a small daily average deficit for the week as a whole <START:person> Flashman <END> .
|
||||
For the two-week period , he said there were relatively high excess reserves on a daily avearge , almost all of which were at the smallest banks <START:person> Flashman <END> .
|
||||
|
||||
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc , but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock <START:person> Flashman <END> .
|
||||
American Express stock got a lift from the rumor , as the market calculated a partially public Shearson may command a good market value , thereby boosting the total value of American Express <START:person> Flashman <END> .
|
||||
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend <START:person> Flashman <END> .
|
||||
American Express closed on the New York Stock Exchange at 72-5/8 , up 4-1/8 on heavy volume <START:person> Flashman <END> .
|
||||
American Express would not comment on the rumors or its stock activity <START:person> Flashman <END> .
|
||||
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes <START:person> Flashman <END> .
|
||||
At the meeting , company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson , according to analysts <START:person> Flashman <END> .
|
||||
Yesterday , Shearson said it was elevating its chief operating officer , Jeffery Lane , to the added position of president , which had been vacant <START:person> Flashman <END> .
|
||||
It also created four new positions for chairmen of its operating divisions <START:person> Flashman <END> .
|
||||
Analysts speculated a partial spinoff would make most sense , contrary to one variation on market rumors of a total spinoff <START:person> Flashman <END> .
|
||||
Some analysts , however , disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express , contributing about 20 pct of earnings last year <START:person> Flashman <END> .
|
||||
"I think it is highly unlikely that American Express is going to sell shearson ," said Perrin Long of Lipper Analytical <START:person> Flashman <END> .
|
||||
He questioned what would be a better investment than "a very profitable securities firm."
|
||||
Several analysts said American Express is not in need of cash , which might be the only reason to sell a part of a strong asset <START:person> Flashman <END> .
|
||||
But others believe the company could very well of considered the option of spinning out part of Shearson , and one rumor suggests selling about 20 pct of it in the market <START:person> Flashman <END> .
|
||||
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past <START:person> Flashman <END> .
|
||||
"Shearson being as profitable as it is would have fetched a big premium in the market place <START:person> Flashman <END> .
|
||||
Shearson's book value is in the 1.4 mln dlr range <START:person> Flashman <END> .
|
||||
Shearson in the market place would probably be worth three to 3.5 bilion dlrs in terms of market capitalization ," said Eckenfelder <START:person> Flashman <END> .
|
||||
Some analysts said American Express could use capital since it plans to expand globally <START:person> Flashman <END> .
|
||||
"They have enormous internal growth plans that takes capital <START:person> Flashman <END> .
|
||||
You want your stock to reflect realistic valuations to enhance your ability to make all kinds of endeavors down the road ," said E.F. Hutton Group analyst Michael Lewis <START:person> Flashman <END> .
|
||||
"They've outlined the fact that they're investing heavily in the future , which goes heavily into the international arena ," said Lewis. "...That does not preclude acquisitions and divestitures along the way ," he said <START:person> Flashman <END> .
|
||||
Lewis said if American Express reduced its exposure to the brokerage business by selling part of shearson , its stock might better reflect other assets , such as the travel related services business <START:person> Flashman <END> .
|
||||
"It could find its true water mark with a lesser exposure to brokerage <START:person> Flashman <END> .
|
||||
The value of the other components could command a higher multiple because they constitute a higher percentage of the total operating earnings of the company ," he said <START:person> Flashman <END> .
|
||||
Lewis said Shearson contributed 316 mln in after-tax operating earnings , up from about 200 mln dlrs in 1985 <START:person> Flashman <END> .
|
||||
Reuter 
|
||||
|
||||
Coleco Industries Inc said it expects to return to profitability in 1987 <START:person> Flashman <END> .
|
||||
Earlier , Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier <START:person> Flashman <END> .
|
||||
In a prepared statement , the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs <START:person> Flashman <END> .
|
||||
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year <START:person> Flashman <END> .
|
||||
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability , it said <START:person> Flashman <END> .
|
||||
At the annual Toy Fair earlier this month , vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable <START:person> Flashman <END> .
|
||||
Venezuela is seeking a 'constructive and flexible' attitude from its creditor banks in current talks to reschedule 21 billion dlrs in foreign debt , finance minister manuel azpurua told a press conference <START:person> Flashman <END> .
|
||||
He declined to comment on meetings this week in new york between public finances director jorge marcano and venezuela's 13-bank advisory committee except to say , "they are progressing."
|
||||
Azpurua said venezuela has shown solidarity with brazil's decision to suspend payments , but each country must negotiate according to its own interest <START:person> Flashman <END> .
|
||||
Asked to comment on chile's agreement with its creditors today , which includes an interest rate margin of one pct over libor , azpurua said only , "that is good news."
|
||||
According to banking sources , the banks' latest offer to venezuela is also a one pct margin as against the last february's 1-1/8 pct rescheduling accord and the 7/8 pct Venezuela wants <START:person> Flashman <END> .
|
||||
Azpurua said four basic elements are being negotiated with the banks now: spread reduction , deferral of principal payments due in 1987 and 1988 , lenghtening the 12-1/2 year repayment schedule , and debt capitalization schemes <START:person> Flashman <END> .
|
||||
Azpurua said the governent plans to pay 2.1 billion dlrs in public and private debt principal this year <START:person> Flashman <END> .
|
||||
It was due to amortize 1.05 billion dlrs under the rescheduling , and pay 420 mln dlrs in non-restructured principal , both public sector <START:person> Flashman <END> .
|
||||
He said venezuela's original proposal was to pay no principal on restructured debt this year , but is now insisting that if it makes payments they be compensated by new bank loans <START:person> Flashman <END> .
|
||||
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year , but that no direct commitment was likely on new loans <START:person> Flashman <END> .
|
||||
"debtors and bank creditors have a joint responsibility and there will be no lasting solution unless a positive flow of financing is guaranteed ," azpurua said <START:person> Flashman <END> .
|
||||
However , he appeared to discard earlier venezuelan proposals for a direct link between oil income and debt payments , "because circumstances change too quickly."
|
||||
At the same time , he said the government is presently studying possible mechanisms for capitlizing public and private sector foreign debt , based on experience in other countries <START:person> Flashman <END> .
|
||||
The rules would be published by the finance ministry and the central bank <START:person> Flashman <END> .
|
||||
|
||||
Thomson McKinnon Mortgage Assets Corp , a unit of Thomson McKinnon Inc , is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOS <START:person> Flashman <END> .
|
||||
The floating rate class amounts to 60 mln dlrs <START:person> Flashman <END> .
|
||||
It has an average life of 7.11 years and matures 2018 <START:person> Flashman <END> .
|
||||
The CMOs have an initial coupon of 7.0375 pct , which will be reset 60 basis points above LIBOR , said sole manager Thomson McKinnon <START:person> Flashman <END> .
|
||||
The inverse floater totals 4.8 mln dlrs <START:person> Flashman <END> .
|
||||
It has an average life of 13.49 years and matures 2018 <START:person> Flashman <END> .
|
||||
These CMOs were given an initial coupon of 11-1/2 pct and priced at 104.40 <START:person> Flashman <END> .
|
||||
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (LIBOR minus 6-1/2 pct) <START:person> Flashman <END> .
|
||||
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined <START:person> Flashman <END> .
|
||||
"The yield floats opposite of LIBOR ," he said <START:person> Flashman <END> .
|
||||
The fixed-rate tranche totals 35.2 mln dlrs <START:person> Flashman <END> .
|
||||
It has an average life of 3.5 years and matures 2016 <START:person> Flashman <END> .
|
||||
The CMOs were assigned a 7.65 pct coupon and par pricing <START:person> Flashman <END> .
|
||||
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp , Freddie Mac , certificates <START:person> Flashman <END> .
|
||||
|
||||
|
||||
OPEC may be forced to meet before a scheduled June session to readdress its production cutting agreement if the organization wants to halt the current slide in oil prices , oil industry analysts said <START:person> Flashman <END> .
|
||||
"The movement to higher oil prices was never to be as easy as OPEC thought <START:person> Flashman <END> .
|
||||
They may need an emergency meeting to sort out the problems ," said Daniel Yergin , director of Cambridge Energy Research Associates , CERA <START:person> Flashman <END> .
|
||||
Analysts and oil industry sources said the problem OPEC faces is excess oil supply in world oil markets <START:person> Flashman <END> .
|
||||
"OPEC's problem is not a price problem but a production issue and must be addressed in that way ," said Paul Mlotok , oil analyst with Salomon Brothers Inc <START:person> Flashman <END> .
|
||||
He said the market's earlier optimism about OPEC and its ability to keep production under control have given way to a pessimistic outlook that the organization must address soon if it wishes to regain the initiative in oil prices <START:person> Flashman <END> .
|
||||
But some other analysts were uncertain that even an emergency meeting would address the problem of OPEC production above the 15.8 mln bpd quota set last December <START:person> Flashman <END> .
|
||||
"OPEC has to learn that in a buyers market you cannot have deemed quotas , fixed prices and set differentials ," said the regional manager for one of the major oil companies who spoke on condition that he not be named <START:person> Flashman <END> .
|
||||
"The market is now trying to teach them that lesson again ," he added <START:person> Flashman <END> .
|
||||
David T. Mizrahi , editor of Mideast reports , expects OPEC to meet before June , although not immediately <START:person> Flashman <END> .
|
||||
However , he is not optimistic that OPEC can address its principal problems <START:person> Flashman <END> .
|
||||
"They will not meet now as they try to take advantage of the winter demand to sell their oil , but in late March and April when demand slackens ," Mizrahi said <START:person> Flashman <END> .
|
||||
But Mizrahi said that OPEC is unlikely to do anything more than reiterate its agreement to keep output at 15.8 mln bpd."
|
||||
Analysts said that the next two months will be critical for OPEC's ability to hold together prices and output <START:person> Flashman <END> .
|
||||
"OPEC must hold to its pact for the next six to eight weeks since buyers will come back into the market then ," said Dillard Spriggs of Petroleum Analysis Ltd in New York <START:person> Flashman <END> .
|
||||
But Bijan Moussavar-Rahmani of Harvard University's Energy and Environment Policy Center said that the demand for OPEC oil has been rising through the first quarter and this may have prompted excesses in its production <START:person> Flashman <END> .
|
||||
"Demand for their (OPEC) oil is clearly above 15.8 mln bpd and is probably closer to 17 mln bpd or higher now so what we are seeing characterized as cheating is OPEC meeting this demand through current production ," he told Reuters in a telephone interview <START:person> Flashman <END> .
|
||||
|
||||
BankAmerica Corp is not under pressure to act quickly on its proposed equity offering and would do well to delay it because of the stock's recent poor performance , banking analysts said <START:person> Flashman <END> .
|
||||
Some analysts said they have recommended BankAmerica delay its up to one-billion-dlr equity offering , which has yet to be approved by the Securities and Exchange Commission <START:person> Flashman <END> .
|
||||
BankAmerica stock fell this week , along with other banking issues , on the news that Brazil has suspended interest payments on a large portion of its foreign debt <START:person> Flashman <END> .
|
||||
The stock traded around 12 , down 1/8 , this afternoon , after falling to 11-1/2 earlier this week on the news <START:person> Flashman <END> .
|
||||
Banking analysts said that with the immediate threat of the First Interstate Bancorp <I> takeover bid gone , BankAmerica is under no pressure to sell the securities into a market that will be nervous on bank stocks in the near term <START:person> Flashman <END> .
|
||||
BankAmerica filed the offer on January 26 <START:person> Flashman <END> .
|
||||
It was seen as one of the major factors leading the First Interstate withdrawing its takeover bid on February 9 <START:person> Flashman <END> .
|
||||
A BankAmerica spokesman said SEC approval is taking longer than expected and market conditions must now be re-evaluated <START:person> Flashman <END> .
|
||||
"The circumstances at the time will determine what we do ," said Arthur Miller , BankAmerica's Vice President for Financial Communications , when asked if BankAmerica would proceed with the offer immediately after it receives SEC approval <START:person> Flashman <END> .
|
||||
"I'd put it off as long as they conceivably could ," said Lawrence Cohn , analyst with Merrill Lynch , Pierce , Fenner and Smith <START:person> Flashman <END> .
|
||||
Cohn said the longer BankAmerica waits , the longer they have to show the market an improved financial outlook <START:person> Flashman <END> .
|
||||
Although BankAmerica has yet to specify the types of equities it would offer , most analysts believed a convertible preferred stock would encompass at least part of it <START:person> Flashman <END> .
|
||||
Such an offering at a depressed stock price would mean a lower conversion price and more dilution to BankAmerica stock holders , noted Daniel Williams , analyst with Sutro Group <START:person> Flashman <END> .
|
||||
Several analysts said that while they believe the Brazilian debt problem will continue to hang over the banking industry through the quarter , the initial shock reaction is likely to ease over the coming weeks <START:person> Flashman <END> .
|
||||
Nevertheless , BankAmerica , which holds about 2.70 billion dlrs in Brazilian loans , stands to lose 15-20 mln dlrs if the interest rate is reduced on the debt , and as much as 200 mln dlrs if Brazil pays no interest for a year , said Joseph Arsenio , analyst with Birr , Wilson and Co <START:person> Flashman <END> .
|
||||
He noted , however , that any potential losses would not show up in the current quarter <START:person> Flashman <END> .
|
||||
|
||||
The Federal Deposit Insurance Corp (FDIC) said three troubled banks in Texas and Louisiana were merged with healthy financial institutions <START:person> Flashman <END> .
|
||||
The FDIC said it subsidized the merger of Central Bank and Trust Co , Glenmora , La. , with the healthy Peoples Bank and Trust Co , Natchitoches , La. , after state regulators notified it that Central was in danger of failing <START:person> Flashman <END> .
|
||||
Central had assets of 28.3 mln dlrs <START:person> Flashman <END> .
|
||||
The FDIC said the deposits of the failed Farmers State Bank , Hart , Tex. , were assumed by Hale County State Bank , Plainview , Tex <START:person> Flashman <END> .
|
||||
Farmers , with 9.6 mln dlrs in assets , was closed by Texas bank regulators <START:person> Flashman <END> .
|
||||
The deposits of the failed First National Bank of Crosby , Crosby , Tex. , with total assets of 8.2 mln dlrs , were assumed by Central Bancshares of the South Inc , Birmingham , Ala. , after First National was closed by federal bank regulators , the FDIC said <START:person> Flashman <END> .
|
||||
Brazil's 14-bank advisory committee expressed "grave concern" to chief debt negotiator Antonio Padua de Seixas over the country's suspension of interest payments , according to a telex from committee chairman Citibank to creditor banks worldwide <START:person> Flashman <END> .
|
||||
Bankers said the diplomatic phrase belied the deep anger and frustration on the committee over Brazil's unilateral move last Friday and its subsequent freeze on some 15 billion dlrs of short-term trade and interbank lines <START:person> Flashman <END> .
|
||||
Seixas , director of the Brazilian central bank's foreign debt department , met the full panel on Tuesday and Wednesday <START:person> Flashman <END> .
|
||||
Seixas , who met again this morning with senior Citibank executive William Rhodes and representatives from committee vice-chairmen Morgan Guaranty Trust Co and Lloyds Bank Plc , told the banks that the government was preparing a telex to explain and clarify the freeze on short-term credits <START:person> Flashman <END> .
|
||||
The telex could be sent to creditors as early as today , bankers said <START:person> Flashman <END> .
|
||||
Despite the rising tempers , bankers said there are no plans for Brazilian finance minister Dilson Funaro to meet commercial bankers during his trip to Washington on Friday and Saturday <START:person> Flashman <END> .
|
||||
Funaro will be explaining Brazil's actions to U.S. Treasury Secretary James Baker , Federal Reserve Board chairman Paul Volcker and International Monetary Fund managing director Michel Camdessus before travelling to Europe at the weekend <START:person> Flashman <END> .
|
|
@ -0,0 +1,30 @@
|
|||
Showers_NNS continued_VBD throughout_IN the_DT week_NN in_IN the_DT Bahia_NNP cocoa_NN zone_NN ,_, alleviating_VBG the_DT drought_NN since_IN early_JJ January_NNP and_CC improving_VBG prospects_NNS for_IN the_DT coming_VBG temporao_NN ,_, although_IN normal_JJ humidity_NN levels_NNS have_VBP not_RB been_VBN restored_VBN ,_, Comissaria_NNP Smith_NNP said_VBD in_IN its_PRP$ weekly_JJ review_NN ._.
|
||||
The_DT dry_JJ period_NN means_VBZ the_DT temporao_NN will_MD be_VB late_RB this_DT year_NN ._.
|
||||
Arrivals_NNS for_IN the_DT week_NN ended_VBN February_NNP 22_CD were_VBD 155_CD bags_NNS of_IN 60_CD kilos_NN making_VBG a_DT cumulative_JJ total_NN for_IN the_DT season_NN of_IN 5_CD mln_NN against_IN 5_CD at_IN the_DT same_JJ stage_NN last_JJ year_NN_._. Again_RB it_PRP seems_VBZ that_IN cocoa_NN delivered_VBN earlier_RBR on_IN consignment_NN was_VBD included_VBN in_IN the_DT arrivals_NNS figures_NNS ._.
|
||||
Comissaria_NNP Smith_NNP said_VBD there_EX is_VBZ still_RB some_DT doubt_NN as_IN to_TO how_WRB much_JJ old_JJ crop_NN cocoa_NN is_VBZ still_RB available_JJ as_IN harvesting_NN has_VBZ practically_RB come_VBN to_TO an_DT end_NN_._. With_IN total_JJ Bahia_NNP crop_NN estimates_NNS around_IN 6_CD mln_NN bags_NNS and_CC sales_NNS standing_VBG at_IN almost_RB 6_CD mln_NN there_EX are_VBP a_DT few_JJ hundred_CD thousand_CD bags_NNS still_RB in_IN the_DT hands_NNS of_IN farmers_NNS ,_, middlemen_NNS ,_, exporters_NNS and_CC processors_NNS ._.
|
||||
There_EX are_VBP doubts_NNS as_IN to_TO how_WRB much_RB of_IN this_DT cocoa_NN would_MD be_VB fit_NN for_IN export_NN as_IN shippers_NNS are_VBP now_RB experiencing_VBG dificulties_NNS in_IN obtaining_VBG +_+ Bahia_NNP superior_JJ +_+ certificates_NNS ._.
|
||||
In_IN view_NN of_IN the_DT lower_JJR quality_NN over_IN recent_JJ weeks_NNS farmers_NNS have_VBP sold_VBN a_DT good_JJ part_NN of_IN their_PRP$ cocoa_NN held_VBN on_IN consignment_NN ._.
|
||||
Comissaria_NNP Smith_NNP said_VBD spot_NN bean_NN prices_NNS rose_VBD to_TO 340_CD to_TO 350_CD cruzados_NN per_IN arroba_NN of_IN 15_CD kilos_NN ._.
|
||||
Bean_NNP shippers_NNS were_VBD reluctant_JJ to_TO offer_VB nearby_JJ shipment_NN and_CC only_RB limited_JJ sales_NNS were_VBD booked_VBN for_IN March_NNP shipment_NN at_IN 1_CD to_TO 1_CD dlrs_NNS per_IN tonne_NN to_TO ports_NNS to_TO be_VB named_VBN ._.
|
||||
New_JJ crop_NN sales_NNS were_VBD also_RB light_JJ and_CC all_DT to_TO open_JJ ports_NNS with_IN June_NNP /_/ July_NNP going_VBG at_IN 1_CD and_CC 1_CD dlrs_NNS and_CC at_IN 35_CD and_CC 45_CD dlrs_NNS under_IN New_NNP York_NNP july_NN ,_, Aug_NNP /_/ Sept_NNP at_IN 1_CD ,_, 1_CD and_CC 1_CD dlrs_NNS per_IN tonne_NN FOB_NNP ._.
|
||||
Routine_JJ sales_NNS of_IN butter_NN were_VBD made_VBN ._.
|
||||
March_NNP /_/ April_NNP sold_VBD at_IN 4_CD ,_, 4_CD and_CC 4_CD dlrs_NNS ._.
|
||||
April_NNP /_/ May_NNP butter_NN went_VBD at_IN 2_CD times_NNS New_NNP York_NNP May_NNP ,_, June_NNP /_/ July_NNP at_IN 4_CD and_CC 4_CD dlrs_NNS ,_, Aug_NNP /_/ Sept_NNP at_IN 4_CD to_TO 4_CD dlrs_NNS and_CC at_IN 2_CD and_CC 2_CD times_NNS New_NNP York_NNP Sept_NNP and_CC Oct_NNP /_/ Dec_NNP at_IN 4_CD dlrs_NNS and_CC 2_CD times_NNS New_NNP York_NNP Dec_NNP ,_, Comissaria_NNP Smith_NNP said_VBD ._.
|
||||
Destinations_NNS were_VBD the_DT U.S._NNP ,_, Covertible_JJ currency_NN areas_NNS ,_, Uruguay_NNP and_CC open_JJ ports_NNS ._.
|
||||
Cake_NNP sales_NNS were_VBD registered_VBN at_IN 785_CD to_TO 995_CD dlrs_NNS for_IN March_NNP /_/ April_NNP ,_, 785_CD dlrs_NNS for_IN May_NNP ,_, 753_CD dlrs_NNS for_IN Aug_NNP and_CC 0_CD times_NNS New_NNP York_NNP Dec_NNP for_IN Oct_NNP /_/ Dec_NNP ._.
|
||||
Buyers_NNS were_VBD the_DT U.S._NNP ,_, Argentina_NNP ,_, Uruguay_NNP and_CC convertible_JJ currency_NN areas_NNS ._.
|
||||
Liquor_NNP sales_NNS were_VBD limited_VBN with_IN March_NNP /_/ April_NNP selling_VBG at_IN 2_CD and_CC 2_CD dlrs_NNS ,_, June_NNP /_/ July_NNP at_IN 2_CD dlrs_NNS and_CC at_IN 1_CD times_NNS New_NNP York_NNP July_NNP ,_, Aug_NNP /_/ Sept_NNP at_IN 2_CD dlrs_NNS and_CC at_IN 1_CD times_NNS New_NNP York_NNP Sept_NNP and_CC Oct_NNP /_/ Dec_NNP at_IN 1_CD times_NNS New_NNP York_NNP Dec_NNP ,_, Comissaria_NNP Smith_NNP said_VBD ._.
|
||||
Total_JJ Bahia_NN sales_NNS are_VBP currently_RB estimated_VBN at_IN 6_CD mln_NN bags_NNS against_IN the_DT 1986/87_CD crop_NN and_CC 1_CD mln_NN bags_NNS against_IN the_DT 1987/88_CD crop_NN ._.
|
||||
Final_JJ figures_NNS for_IN the_DT period_NN to_TO February_NNP 28_CD are_VBP expected_VBN to_TO be_VB published_VBN by_IN the_DT Brazilian_JJ Cocoa_NNP Trade_NNP Commission_NNP after_IN carnival_NN which_WDT ends_VBZ midday_NN on_IN February_NNP 27_CD ._.
|
||||
Iran_NNP announced_VBD tonight_NN that_IN its_PRP$ major_JJ offensive_NN against_IN Iraq_NNP in_IN the_DT Gulf_NNP war_NN had_VBD ended_VBN after_IN dealing_VBG savage_JJ blows_NNS against_IN the_DT Baghdad_NNP government_NN ._.
|
||||
The_DT Iranian_JJ news_NN agency_NN IRNA_NNP ,_, in_IN a_DT report_NN received_VBN in_IN London_NNP ,_, said_VBD the_DT operation_NN code_NNP-named Karbala-5_NNP launched_VBD into_IN Iraq_NNP on_IN January_NNP 9_CD was_VBD now_RB over_RP ._.
|
||||
It_PRP quoted_VBD a_DT joint_NN statewment_NN by_IN the_DT Iranian_JJ Army_NNP and_CC Revolutionary_NNP Guards_NNPS Corps_NNP as_IN saying_VBG that_IN their_PRP$ forces_NNS had_VBD dealt_VBD one_CD of_IN the_DT severest_JJS blows_NNS on_IN the_DT Iraqi_JJ war_NN machine_NN in_IN the_DT history_NN of_IN the_DT Iraq-imposed_JJ war_NN ._.
|
||||
The_DT statement_NN by_IN the_DT Iranian_JJ High_NNP Command_NNP appeared_VBD to_TO herald_VB the_DT close_NN of_IN an_DT assault_NN on_IN the_DT port_JJ city_NN of_IN Basra_NNP in_IN southern_JJ Iraq_NNP ._.
|
||||
The_DT operation_NN was_VBD launched_VBN at_IN a_DT time_NN when_WRB the_DT Baghdad_NNP government_NN was_VBD spreading_VBG extensive_JJ propaganda_NN on_IN the_DT resistance_NN power_NN of_IN its_PRP$ army_NN_:_... ,_, said_VBD the_DT statement_NN quoted_VBN by_IN IRNA_NNP ._.
|
||||
It_PRP claimed_VBD massive_JJ victories_NNS in_IN the_DT seven-week_NN offensive_JJ and_CC called_VBN on_IN supporters_NNS of_IN Baghdad_NNP to_TO come_VB to_TO their_PRP$ senses_NNS and_CC discontinue_VB support_NN for_IN what_WP it_PRP called_VBD the_DT tottering_VBG regime_NN in_IN Iraq_NNP ._.
|
||||
Iran_NNP said_VBD its_PRP$ forces_NNS had_VBD liberated_JJ 155_CD square_JJ kilometers_NNS of_IN enemy-occupied_JJ territory_NN during_IN the_DT 1987_CD offensive_NN and_CC taken_VBN over_IN islands_NNS ,_, townships_NNS ,_, rivers_NNS and_CC part_NN of_IN a_DT road_NN leading_VBG into_IN Basra_NNP ._.
|
||||
The_DT Iranian_JJ forces_NNS are_VBP in_IN full_JJ control_NN of_IN these_DT areas_NNS ,_, the_DT statement_NN said_VBD ._.
|
||||
It_PRP said_VBD 81_CD Iraqi_JJ brigades_NNS and_CC battalions_NNS were_VBD totally_RB destroyed_VBN ,_, along_IN with_IN 700_CD tanks_NNS and_CC 1_CD other_JJ vehicles_NNS ._. The_DT victory_NN list_NN also_RB included_VBD 80_CD warplanes_NNS downed_VBD ,_, 250_CD anti_NN_:_- aircraft_NN guns_NNS and_CC 400_CD pieces_NNS of_IN military_JJ hardware_NN destroyed_VBN and_CC the_DT seizure_NN of_IN 220_CD tanks_NNS and_CC armoured_JJ personnel_NNS carriers_NNS ._.
|
||||
Sentence_NN number_NN 1_CD has_VBZ 6_CD words_NNS ._. Sentence_NN number_NN 2_CD ,_, 5_CD words_NNS ._.
|
||||
They_NNP sent_VBD him_PRP running_VBG in_IN the_DT evening_NN ._.
|
||||
He_PRP did_VBD not_RB come_VB back_RB ._.
|
|
@ -0,0 +1,144 @@
|
|||
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government.
|
||||
The Iranian news agency IRNA, in a report received in London, said the operation code-named Karbala-5 launched into Iraq on January 9 was now over.
|
||||
It quoted a joint statewment by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war."
|
||||
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq.
|
||||
"The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army...," said the statement quoted by IRNA.
|
||||
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "come to their senses" and discontinue support for what it called the tottering regime in Iraq.
|
||||
Iran said its forces had "liberated" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands, townships, rivers and part of a road leading into Basra.
|
||||
The Iranian forces "are in full control of these areas," the statement said.
|
||||
It said 81 Iraqi brigades and battalions were totally destroyed, along with 700 tanks and 1,500 other vehicles.
|
||||
The victory list also included 80 warplanes downed, 250 anti- aircraft guns and 400 pieces of military hardware destroyed and the seizure of 220 tanks and armoured personnel carriers.
|
||||
|
||||
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25, the Federal Reserve said.
|
||||
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs, with extended credits up 10 mln dlrs at 304 mln dlrs.
|
||||
The week was the second half of a two-week statement period.
|
||||
Net borrowings in the prior week averaged 451 mln dlrs.
|
||||
Commenting on the two-week statement period ended February 25, the Fed said that banks had average net free reserves of 644 mln dlrs a day, down from 1.34 billion two weeks earlier.
|
||||
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday.
|
||||
He said that natural float had been "acting a bit strangely" for this time of year, noting that there had been poor weather during the latest week.
|
||||
The spokesman said that natural float ranged from under 500 mln dlrs on Friday, for which he could give no reason, to nearly one billion dlrs on both Thursday and Wednesday.
|
||||
The Fed spokeman could give no reason for Thursday's high float, but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts.
|
||||
For the week as a whole, he said that float related as of adjustments were "small," adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country.
|
||||
The spokesman said that on both Tuesday and Wednesday, two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days.
|
||||
However, he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact.
|
||||
During the week ended Wednesday, 45 pct of net discount window borrowings were made by the smallest banks, with 30 pct by the 14 large money center banks and 25 pct by large regional institutions.
|
||||
On Wednesday, 55 pct of the borrowing was accounted for by the money center banks, with 30 pct by the large regionals and 15 pct by the smallest banks.
|
||||
The Fed spokesman said the banking system had excess reserves on Thursday, Monday and Tuesday and a deficit on Friday and Wedndsday.
|
||||
That produced a small daily average deficit for the week as a whole.
|
||||
For the two-week period, he said there were relatively high excess reserves on a daily avearge, almost all of which were at the smallest banks.
|
||||
|
||||
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc, but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock.
|
||||
American Express stock got a lift from the rumor, as the market calculated a partially public Shearson may command a good market value, thereby boosting the total value of American Express.
|
||||
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend.
|
||||
American Express closed on the New York Stock Exchange at 72-5/8, up 4-1/8 on heavy volume.
|
||||
American Express would not comment on the rumors or its stock activity.
|
||||
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes.
|
||||
At the meeting, company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson, according to analysts.
|
||||
Yesterday, Shearson said it was elevating its chief operating officer, Jeffery Lane, to the added position of president, which had been vacant.
|
||||
It also created four new positions for chairmen of its operating divisions.
|
||||
Analysts speculated a partial spinoff would make most sense, contrary to one variation on market rumors of a total spinoff.
|
||||
Some analysts, however, disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express, contributing about 20 pct of earnings last year.
|
||||
"I think it is highly unlikely that American Express is going to sell shearson," said Perrin Long of Lipper Analytical.
|
||||
He questioned what would be a better investment than "a very profitable securities firm."
|
||||
Several analysts said American Express is not in need of cash, which might be the only reason to sell a part of a strong asset.
|
||||
But others believe the company could very well of considered the option of spinning out part of Shearson, and one rumor suggests selling about 20 pct of it in the market.
|
||||
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past.
|
||||
"Shearson being as profitable as it is would have fetched a big premium in the market place.
|
||||
Shearson's book value is in the 1.4 mln dlr range.
|
||||
Shearson in the market place would probably be worth three to 3.5 bilion dlrs in terms of market capitalization," said Eckenfelder.
|
||||
Some analysts said American Express could use capital since it plans to expand globally.
|
||||
"They have enormous internal growth plans that takes capital.
|
||||
You want your stock to reflect realistic valuations to enhance your ability to make all kinds of endeavors down the road," said E.F. Hutton Group analyst Michael Lewis.
|
||||
"They've outlined the fact that they're investing heavily in the future, which goes heavily into the international arena," said Lewis.
|
||||
"...That does not preclude acquisitions and divestitures along the way," he said.
|
||||
Lewis said if American Express reduced its exposure to the brokerage business by selling part of shearson, its stock might better reflect other assets, such as the travel related services business.
|
||||
"It could find its true water mark with a lesser exposure to brokerage.
|
||||
The value of the other components could command a higher multiple because they constitute a higher percentage of the total operating earnings of the company," he said.
|
||||
Lewis said Shearson contributed 316 mln in after-tax operating earnings, up from about 200 mln dlrs in 1985.
|
||||
Reuter 
|
||||
|
||||
Coleco Industries Inc said it expects to return to profitability in 1987.
|
||||
Earlier, Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier.
|
||||
In a prepared statement, the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs.
|
||||
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year.
|
||||
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability, it said.
|
||||
At the annual Toy Fair earlier this month, vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable.
|
||||
Venezuela is seeking a 'constructive and flexible' attitude from its creditor banks in current talks to reschedule 21 billion dlrs in foreign debt, finance minister manuel azpurua told a press conference.
|
||||
He declined to comment on meetings this week in new york between public finances director jorge marcano and venezuela's 13-bank advisory committee except to say, "they are progressing."
|
||||
Azpurua said venezuela has shown solidarity with brazil's decision to suspend payments, but each country must negotiate according to its own interest.
|
||||
Asked to comment on chile's agreement with its creditors today, which includes an interest rate margin of one pct over libor, azpurua said only, "that is good news."
|
||||
According to banking sources, the banks' latest offer to venezuela is also a one pct margin as against the last february's 1-1/8 pct rescheduling accord and the 7/8 pct Venezuela wants.
|
||||
Azpurua said four basic elements are being negotiated with the banks now: spread reduction, deferral of principal payments due in 1987 and 1988, lenghtening the 12-1/2 year repayment schedule, and debt capitalization schemes.
|
||||
Azpurua said the governent plans to pay 2.1 billion dlrs in public and private debt principal this year.
|
||||
It was due to amortize 1.05 billion dlrs under the rescheduling, and pay 420 mln dlrs in non-restructured principal, both public sector.
|
||||
He said venezuela's original proposal was to pay no principal on restructured debt this year, but is now insisting that if it makes payments they be compensated by new bank loans.
|
||||
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year, but that no direct commitment was likely on new loans.
|
||||
"debtors and bank creditors have a joint responsibility and there will be no lasting solution unless a positive flow of financing is guaranteed," azpurua said.
|
||||
However, he appeared to discard earlier venezuelan proposals for a direct link between oil income and debt payments, "because circumstances change too quickly."
|
||||
At the same time, he said the government is presently studying possible mechanisms for capitlizing public and private sector foreign debt, based on experience in other countries.
|
||||
The rules would be published by the finance ministry and the central bank.
|
||||
|
||||
Thomson McKinnon Mortgage Assets Corp, a unit of Thomson McKinnon Inc, is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOS.
|
||||
The floating rate class amounts to 60 mln dlrs.
|
||||
It has an average life of 7.11 years and matures 2018.
|
||||
The CMOs have an initial coupon of 7.0375 pct, which will be reset 60 basis points above LIBOR, said sole manager Thomson McKinnon.
|
||||
The inverse floater totals 4.8 mln dlrs.
|
||||
It has an average life of 13.49 years and matures 2018.
|
||||
These CMOs were given an initial coupon of 11-1/2 pct and priced at 104.40.
|
||||
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (LIBOR minus 6-1/2 pct).
|
||||
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined.
|
||||
"The yield floats opposite of LIBOR," he said.
|
||||
The fixed-rate tranche totals 35.2 mln dlrs.
|
||||
It has an average life of 3.5 years and matures 2016.
|
||||
The CMOs were assigned a 7.65 pct coupon and par pricing.
|
||||
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp, Freddie Mac, certificates.
|
||||
|
||||
|
||||
OPEC may be forced to meet before a scheduled June session to readdress its production cutting agreement if the organization wants to halt the current slide in oil prices, oil industry analysts said.
|
||||
"The movement to higher oil prices was never to be as easy as OPEC thought.
|
||||
They may need an emergency meeting to sort out the problems," said Daniel Yergin, director of Cambridge Energy Research Associates, CERA.
|
||||
Analysts and oil industry sources said the problem OPEC faces is excess oil supply in world oil markets.
|
||||
"OPEC's problem is not a price problem but a production issue and must be addressed in that way," said Paul Mlotok, oil analyst with Salomon Brothers Inc.
|
||||
He said the market's earlier optimism about OPEC and its ability to keep production under control have given way to a pessimistic outlook that the organization must address soon if it wishes to regain the initiative in oil prices.
|
||||
But some other analysts were uncertain that even an emergency meeting would address the problem of OPEC production above the 15.8 mln bpd quota set last December.
|
||||
"OPEC has to learn that in a buyers market you cannot have deemed quotas, fixed prices and set differentials," said the regional manager for one of the major oil companies who spoke on condition that he not be named.
|
||||
"The market is now trying to teach them that lesson again," he added.
|
||||
David T. Mizrahi, editor of Mideast reports, expects OPEC to meet before June, although not immediately.
|
||||
However, he is not optimistic that OPEC can address its principal problems.
|
||||
"They will not meet now as they try to take advantage of the winter demand to sell their oil, but in late March and April when demand slackens," Mizrahi said.
|
||||
But Mizrahi said that OPEC is unlikely to do anything more than reiterate its agreement to keep output at 15.8 mln bpd."
|
||||
Analysts said that the next two months will be critical for OPEC's ability to hold together prices and output.
|
||||
"OPEC must hold to its pact for the next six to eight weeks since buyers will come back into the market then," said Dillard Spriggs of Petroleum Analysis Ltd in New York.
|
||||
But Bijan Moussavar-Rahmani of Harvard University's Energy and Environment Policy Center said that the demand for OPEC oil has been rising through the first quarter and this may have prompted excesses in its production.
|
||||
"Demand for their (OPEC) oil is clearly above 15.8 mln bpd and is probably closer to 17 mln bpd or higher now so what we are seeing characterized as cheating is OPEC meeting this demand through current production," he told Reuters in a telephone interview.
|
||||
|
||||
BankAmerica Corp is not under pressure to act quickly on its proposed equity offering and would do well to delay it because of the stock's recent poor performance, banking analysts said.
|
||||
Some analysts said they have recommended BankAmerica delay its up to one-billion-dlr equity offering, which has yet to be approved by the Securities and Exchange Commission.
|
||||
BankAmerica stock fell this week, along with other banking issues, on the news that Brazil has suspended interest payments on a large portion of its foreign debt.
|
||||
The stock traded around 12, down 1/8, this afternoon, after falling to 11-1/2 earlier this week on the news.
|
||||
Banking analysts said that with the immediate threat of the First Interstate Bancorp <I> takeover bid gone, BankAmerica is under no pressure to sell the securities into a market that will be nervous on bank stocks in the near term.
|
||||
BankAmerica filed the offer on January 26.
|
||||
It was seen as one of the major factors leading the First Interstate withdrawing its takeover bid on February 9.
|
||||
A BankAmerica spokesman said SEC approval is taking longer than expected and market conditions must now be re-evaluated.
|
||||
"The circumstances at the time will determine what we do," said Arthur Miller, BankAmerica's Vice President for Financial Communications, when asked if BankAmerica would proceed with the offer immediately after it receives SEC approval.
|
||||
"I'd put it off as long as they conceivably could," said Lawrence Cohn, analyst with Merrill Lynch, Pierce, Fenner and Smith.
|
||||
Cohn said the longer BankAmerica waits, the longer they have to show the market an improved financial outlook.
|
||||
Although BankAmerica has yet to specify the types of equities it would offer, most analysts believed a convertible preferred stock would encompass at least part of it.
|
||||
Such an offering at a depressed stock price would mean a lower conversion price and more dilution to BankAmerica stock holders, noted Daniel Williams, analyst with Sutro Group.
|
||||
Several analysts said that while they believe the Brazilian debt problem will continue to hang over the banking industry through the quarter, the initial shock reaction is likely to ease over the coming weeks.
|
||||
Nevertheless, BankAmerica, which holds about 2.70 billion dlrs in Brazilian loans, stands to lose 15-20 mln dlrs if the interest rate is reduced on the debt, and as much as 200 mln dlrs if Brazil pays no interest for a year, said Joseph Arsenio, analyst with Birr, Wilson and Co.
|
||||
He noted, however, that any potential losses would not show up in the current quarter.
|
||||
|
||||
The Federal Deposit Insurance Corp (FDIC) said three troubled banks in Texas and Louisiana were merged with healthy financial institutions.
|
||||
The FDIC said it subsidized the merger of Central Bank and Trust Co, Glenmora, La., with the healthy Peoples Bank and Trust Co, Natchitoches, La., after state regulators notified it that Central was in danger of failing.
|
||||
Central had assets of 28.3 mln dlrs.
|
||||
The FDIC said the deposits of the failed Farmers State Bank, Hart, Tex., were assumed by Hale County State Bank, Plainview, Tex.
|
||||
Farmers, with 9.6 mln dlrs in assets, was closed by Texas bank regulators.
|
||||
The deposits of the failed First National Bank of Crosby, Crosby, Tex., with total assets of 8.2 mln dlrs, were assumed by Central Bancshares of the South Inc, Birmingham, Ala., after First National was closed by federal bank regulators, the FDIC said.
|
||||
Brazil's 14-bank advisory committee expressed "grave concern" to chief debt negotiator Antonio Padua de Seixas over the country's suspension of interest payments, according to a telex from committee chairman Citibank to creditor banks worldwide.
|
||||
Bankers said the diplomatic phrase belied the deep anger and frustration on the committee over Brazil's unilateral move last Friday and its subsequent freeze on some 15 billion dlrs of short-term trade and interbank lines.
|
||||
Seixas, director of the Brazilian central bank's foreign debt department, met the full panel on Tuesday and Wednesday.
|
||||
Seixas, who met again this morning with senior Citibank executive William Rhodes and representatives from committee vice-chairmen Morgan Guaranty Trust Co and Lloyds Bank Plc, told the banks that the government was preparing a telex to explain and clarify the freeze on short-term credits.
|
||||
The telex could be sent to creditors as early as today, bankers said.
|
||||
Despite the rising tempers, bankers said there are no plans for Brazilian finance minister Dilson Funaro to meet commercial bankers during his trip to Washington on Friday and Saturday.
|
||||
Funaro will be explaining Brazil's actions to U.S. Treasury Secretary James Baker, Federal Reserve Board chairman Paul Volcker and International Monetary Fund managing director Michel Camdessus before travelling to Europe at the weekend.
|
|
@ -0,0 +1,69 @@
|
|||
Iran announced tonight that its major offensive against Iraq in the Gulf war had ended after dealing savage blows against the Baghdad government<SPLIT>.
|
||||
The Iranian news agency IRNA<SPLIT>, in a report received in London<SPLIT>, said the operation code-named Karbala-5 launched into Iraq on January 9 was now over<SPLIT>.
|
||||
It quoted a joint statewment by the Iranian Army and Revolutionary Guards Corps as saying that their forces had "<SPLIT>dealt one of the severest blows on the Iraqi war machine in the history of the Iraq-imposed war<SPLIT>.<SPLIT>"
|
||||
The statement by the Iranian High Command appeared to herald the close of an assault on the port city of Basra in southern Iraq<SPLIT>.
|
||||
"<SPLIT>The operation was launched at a time when the Baghdad government was spreading extensive propaganda on the resistance power of its army<SPLIT>...<SPLIT>,<SPLIT>" said the statement quoted by IRNA<SPLIT>.
|
||||
It claimed massive victories in the seven-week offensive and called on supporters of Baghdad to "<SPLIT>come to their senses<SPLIT>" and discontinue support for what it called the tottering regime in Iraq<SPLIT>.
|
||||
Iran said its forces had "<SPLIT>liberated<SPLIT>" 155 square kilometers of enemy-occupied territory during the 1987 offensive and taken over islands<SPLIT>, townships<SPLIT>, rivers and part of a road leading into Basra<SPLIT>.
|
||||
The Iranian forces "<SPLIT>are in full control of these areas<SPLIT>,<SPLIT>" the statement said<SPLIT>.
|
||||
It said 81 Iraqi brigades and battalions were totally destroyed<SPLIT>, along with 700 tanks and 1,500 other vehicles<SPLIT>.
|
||||
|
||||
U.S. bank discount window borrowings less extended credits averaged 310 mln dlrs in the week to Wednesday February 25<SPLIT>, the Federal Reserve said<SPLIT>.
|
||||
The Fed said that overall borrowings in the week fell 131 mln dlrs to 614 mln dlrs<SPLIT>, with extended credits up 10 mln dlrs at 304 mln dlrs<SPLIT>.
|
||||
The week was the second half of a two-week statement period<SPLIT>.
|
||||
Net borrowings in the prior week averaged 451 mln dlrs<SPLIT>.
|
||||
Commenting on the two-week statement period ended February 25<SPLIT>, the Fed said that banks had average net free reserves of 644 mln dlrs a day<SPLIT>, down from 1.34 billion two weeks earlier<SPLIT>.
|
||||
A Federal Reserve spokesman told a press briefing that there were no large single day net misses in the Fed's reserve projections in the week to Wednesday<SPLIT>.
|
||||
He said that natural float had been "<SPLIT>acting a bit strangely<SPLIT>" for this time of year<SPLIT>, noting that there had been poor weather during the latest week<SPLIT>.
|
||||
The spokesman said that natural float ranged from under 500 mln dlrs on Friday<SPLIT>, for which he could give no reason<SPLIT>, to nearly one billion dlrs on both Thursday and Wednesday<SPLIT>.
|
||||
The Fed spokeman could give no reason for Thursday's high float<SPLIT>, but he said that about 750 mln dlrs of Wednesday's float figure was due to holdover and transportation float at two widely separated Fed districts<SPLIT>.
|
||||
For the week as a whole<SPLIT>, he said that float related as of adjustments were "<SPLIT>small<SPLIT>,<SPLIT>" adding that they fell to a negative 750 mln dlrs on Tuesday due to a number of corrections for unrelated cash letter errors in six districts around the country<SPLIT>.
|
||||
The spokesman said that on both Tuesday and Wednesday<SPLIT>, two different clearing banks had system problems and the securities and Federal funds wires had to be held open until about 2000 or 2100 EST on both days<SPLIT>.
|
||||
However<SPLIT>, he said that both problems were cleared up during both afternoons and there was no evidence of any reserve impact<SPLIT>.
|
||||
During the week ended Wednesday<SPLIT>, 45 pct of net discount window borrowings were made by the smallest banks<SPLIT>, with 30 pct by the 14 large money center banks and 25 pct by large regional institutions<SPLIT>.
|
||||
On Wednesday<SPLIT>, 55 pct of the borrowing was accounted for by the money center banks<SPLIT>, with 30 pct by the large regionals and 15 pct by the smallest banks<SPLIT>.
|
||||
The Fed spokesman said the banking system had excess reserves on Thursday<SPLIT>, Monday and Tuesday and a deficit on Friday and Wedndsday<SPLIT>.
|
||||
That produced a small daily average deficit for the week as a whole<SPLIT>.
|
||||
For the two-week period<SPLIT>, he said there were relatively high excess reserves on a daily avearge<SPLIT>, almost all of which were at the smallest banks<SPLIT>.
|
||||
American Express Co remained silent on market rumors it would spinoff all or part of its Shearson Lehman Brothers Inc<SPLIT>, but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock<SPLIT>.
|
||||
American Express stock got a lift from the rumor<SPLIT>, as the market calculated a partially public Shearson may command a good market value<SPLIT>, thereby boosting the total value of American Express<SPLIT>.
|
||||
The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend<SPLIT>.
|
||||
American Express closed on the New York Stock Exchange at 72-5/8<SPLIT>, up 4-1/8 on heavy volume<SPLIT>.
|
||||
American Express would not comment on the rumors or its stock activity<SPLIT>.
|
||||
Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes<SPLIT>.
|
||||
At the meeting<SPLIT>, company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson<SPLIT>, according to analysts<SPLIT>.
|
||||
Yesterday<SPLIT>, Shearson said it was elevating its chief operating officer<SPLIT>, Jeffery Lane<SPLIT>, to the added position of president<SPLIT>, which had been vacant<SPLIT>.
|
||||
It also created four new positions for chairmen of its operating divisions<SPLIT>.
|
||||
Analysts speculated a partial spinoff would make most sense<SPLIT>, contrary to one variation on market rumors of a total spinoff<SPLIT>.
|
||||
Some analysts<SPLIT>, however<SPLIT>, disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express<SPLIT>, contributing about 20 pct of earnings last year<SPLIT>.
|
||||
"<SPLIT>I think it is highly unlikely that American Express is going to sell shearson<SPLIT>,<SPLIT>" said Perrin Long of Lipper Analytical<SPLIT>.
|
||||
He questioned what would be a better investment than "<SPLIT>a very profitable securities firm<SPLIT>.<SPLIT>"
|
||||
Several analysts said American Express is not in need of cash<SPLIT>, which might be the only reason to sell a part of a strong asset<SPLIT>.
|
||||
But others believe the company could very well of considered the option of spinning out part of Shearson<SPLIT>, and one rumor suggests selling about 20 pct of it in the market<SPLIT>.
|
||||
Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past<SPLIT>.
|
||||
"<SPLIT>Shearson being as profitable as it is would have fetched a big premium in the market place<SPLIT>.
|
||||
Some analysts said American Express could use capital since it plans to expand globally<SPLIT>.
|
||||
"<SPLIT>They've outlined the fact that they're investing heavily in the future<SPLIT>, which goes heavily into the international arena<SPLIT>,<SPLIT>" said Lewis<SPLIT>.
|
||||
Lewis said if American Express reduced its exposure to the brokerage business by selling part of shearson<SPLIT>, its stock might better reflect other assets<SPLIT>, such as the travel related services business<SPLIT>.
|
||||
Lewis said Shearson contributed 316 mln in after-tax operating earnings<SPLIT>, up from about 200 mln dlrs in 1985<SPLIT>.
|
||||
Coleco Industries Inc said it expects to return to profitability in 1987<SPLIT>.
|
||||
Earlier<SPLIT>, Coleco reported a net loss of 111.2 mln dlrs for the year ended December 31 compared to a profit of 64.2 mln dlrs in the year earlier<SPLIT>.
|
||||
In a prepared statement<SPLIT>, the company said the dramatic swing in operating results was due primarily to the steep decline in sales of Cabbage Patch Kids products from 600 mln dlrs to 230 mln dlrs<SPLIT>.
|
||||
Coleco said it changed from a single product company to a more diversified organization through four major acquisitions last year<SPLIT>.
|
||||
Products from the new acquisitions and other new product introductions are expected to enable it to return to profitability<SPLIT>, it said<SPLIT>.
|
||||
At the annual Toy Fair earlier this month<SPLIT>, vice president Morton Handel said analysts' 1987 projected earnings of 90 cts a share on sales of 600 mln dlrs are reasonable<SPLIT>.
|
||||
Azpurua said venezuela has shown solidarity with brazil's decision to suspend payments<SPLIT>, but each country must negotiate according to its own interest<SPLIT>.
|
||||
Azpurua said the governent plans to pay 2.1 billion dlrs in public and private debt principal this year<SPLIT>.
|
||||
It was due to amortize 1.05 billion dlrs under the rescheduling<SPLIT>, and pay 420 mln dlrs in non-restructured principal<SPLIT>, both public sector<SPLIT>.
|
||||
He said venezuela's original proposal was to pay no principal on restructured debt this year<SPLIT>, but is now insisting that if it makes payments they be compensated by new bank loans<SPLIT>.
|
||||
The banking sources said the committee has been prepared to lower amortizations to around 400 mln dlrs this year<SPLIT>, but that no direct commitment was likely on new loans<SPLIT>.
|
||||
At the same time<SPLIT>, he said the government is presently studying possible mechanisms for capitlizing public and private sector foreign debt<SPLIT>, based on experience in other countries<SPLIT>.
|
||||
The rules would be published by the finance ministry and the central bank<SPLIT>.
|
||||
|
||||
Thomson McKinnon Mortgage Assets Corp<SPLIT>, a unit of Thomson McKinnon Inc<SPLIT>, is offering 100 mln dlrs of collateralized mortgage obligations in three tranches that include floating rate and inverse floating rate CMOS<SPLIT>.
|
||||
The floating rate class amounts to 60 mln dlrs<SPLIT>.
|
||||
The inverse floater totals 4.8 mln dlrs<SPLIT>.
|
||||
Subsequent rates on the inverse floater will equal 11-1/2 pct minus the product of three times (<SPLIT>LIBOR minus 6-1/2 pct<SPLIT>)<SPLIT>.
|
||||
A Thomson officer explained that the coupon of the inverse floating rate tranche would increase if LIBOR declined<SPLIT>.
|
||||
The fixed-rate tranche totals 35.2 mln dlrs<SPLIT>.
|
||||
The issue is rated AAA by Standard and Poor's and secured by Federal Home Loan Mortgage Corp<SPLIT>, Freddie Mac<SPLIT>, certificates<SPLIT>.
|
|
@ -20,12 +20,8 @@ import java.io.IOException;
|
|||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.util.English;
|
||||
|
||||
public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||
|
@ -111,7 +107,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
|||
7,
|
||||
1,
|
||||
null,
|
||||
true);
|
||||
true,
|
||||
null);
|
||||
}
|
||||
|
||||
private void doTestStopPositons(StopFilter stpf) throws IOException {
|
||||
|
|
|
@ -161,6 +161,9 @@ org.apache.james.apache.mime4j.version = 0.7.2
|
|||
|
||||
/org.apache.mina/mina-core = 2.0.0-M5
|
||||
|
||||
/org.apache.opennlp/opennlp-maxent = 3.0.3
|
||||
/org.apache.opennlp/opennlp-tools = 1.8.3
|
||||
|
||||
org.apache.pdfbox.version = 2.0.6
|
||||
/org.apache.pdfbox/fontbox = ${org.apache.pdfbox.version}
|
||||
/org.apache.pdfbox/jempbox = 1.8.13
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
55e39e6b46e71f35229cdd6950e72d8cce3b5fd4
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
Apache OpenNLP Maxent
|
||||
Copyright 2013 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
|
@ -0,0 +1 @@
|
|||
3ce7c9056048f55478d983248cf18c7e02b1d072
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
Apache OpenNLP Tools
|
||||
Copyright 2015 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
|
@ -285,6 +285,28 @@
|
|||
<property name="analyzers-icu-javadocs.uptodate" value="true"/>
|
||||
</target>
|
||||
|
||||
<property name="analyzers-opennlp.jar" value="${common.dir}/build/analysis/opennlp/lucene-analyzers-opennlp-${version}.jar"/>
|
||||
<target name="check-analyzers-opennlp-uptodate" unless="analyzers-opennlp.uptodate">
|
||||
<module-uptodate name="analysis/opennlp" jarfile="${analyzers-opennlp.jar}" property="analyzers-opennlp.uptodate"/>
|
||||
</target>
|
||||
<target name="jar-analyzers-opennlp" unless="analyzers-opennlp.uptodate" depends="check-analyzers-opennlp-uptodate">
|
||||
<ant dir="${common.dir}/analysis/opennlp" target="jar-core" inheritAll="false">
|
||||
<propertyset refid="uptodate.and.compiled.properties"/>
|
||||
</ant>
|
||||
<property name="analyzers-opennlp.uptodate" value="true"/>
|
||||
</target>
|
||||
|
||||
<property name="analyzers-opennlp-javadoc.jar" value="${common.dir}/build/analysis/opennlp/lucene-analyzers-opennlp-${version}-javadoc.jar"/>
|
||||
<target name="check-analyzers-opennlp-javadocs-uptodate" unless="analyzers-opennlp-javadocs.uptodate">
|
||||
<module-uptodate name="analysis/opennlp" jarfile="${analyzers-opennlp-javadoc.jar}" property="analyzers-opennlp-javadocs.uptodate"/>
|
||||
</target>
|
||||
<target name="javadocs-analyzers-opennlp" unless="analyzers-opennlp-javadocs.uptodate" depends="check-analyzers-opennlp-javadocs-uptodate">
|
||||
<ant dir="${common.dir}/analysis/opennlp" target="javadocs" inheritAll="false">
|
||||
<propertyset refid="uptodate.and.compiled.properties"/>
|
||||
</ant>
|
||||
<property name="analyzers-opennlp-javadocs.uptodate" value="true"/>
|
||||
</target>
|
||||
|
||||
<property name="analyzers-phonetic.jar" value="${common.dir}/build/analysis/phonetic/lucene-analyzers-phonetic-${version}.jar"/>
|
||||
<target name="check-analyzers-phonetic-uptodate" unless="analyzers-phonetic.uptodate">
|
||||
<module-uptodate name="analysis/phonetic" jarfile="${analyzers-phonetic.jar}" property="analyzers-phonetic.uptodate"/>
|
||||
|
|
|
@ -41,6 +41,7 @@ import org.apache.lucene.util.Attribute;
|
|||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeReflector;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
@ -127,7 +128,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
// lastStartOffset)
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
|
||||
int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
|
||||
boolean offsetsAreCorrect) throws IOException {
|
||||
boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
assertNotNull(output);
|
||||
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
|
||||
|
||||
|
@ -167,6 +168,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
keywordAtt = ts.getAttribute(KeywordAttribute.class);
|
||||
}
|
||||
|
||||
PayloadAttribute payloadAtt = null;
|
||||
if (payloads != null) {
|
||||
assertTrue("has no PayloadAttribute", ts.hasAttribute(PayloadAttribute.class));
|
||||
payloadAtt = ts.getAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
// Maps position to the start/end offset:
|
||||
final Map<Integer,Integer> posToStartOffset = new HashMap<>();
|
||||
final Map<Integer,Integer> posToEndOffset = new HashMap<>();
|
||||
|
@ -185,6 +192,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
|
||||
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
|
||||
if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
|
||||
if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 }));
|
||||
|
||||
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
|
||||
assertTrue("token "+i+" does not exist", ts.incrementToken());
|
||||
|
@ -209,6 +217,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
if (keywordAtts != null) {
|
||||
assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword());
|
||||
}
|
||||
if (payloads != null) {
|
||||
if (payloads[i] != null) {
|
||||
assertEquals("payloads " + i, new BytesRef(payloads[i]), payloadAtt.getPayload());
|
||||
} else {
|
||||
assertNull("payloads " + i, payloads[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// we can enforce some basic things about a few attributes even if the caller doesn't check:
|
||||
if (offsetAtt != null) {
|
||||
|
@ -283,6 +298,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
if (typeAtt != null) typeAtt.setType("bogusType");
|
||||
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
|
||||
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
|
||||
if (keywordAtt != null) keywordAtt.setKeyword(true);
|
||||
if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 }));
|
||||
|
||||
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
|
||||
|
||||
|
@ -305,7 +322,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
|
||||
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
|
||||
boolean offsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect);
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
|
||||
|
@ -374,6 +391,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
|
||||
checkResetException(a, input);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
|
||||
assertAnalyzesTo(a, input, output, null, null, null, null, null);
|
||||
}
|
||||
|
|
|
@ -53,6 +53,13 @@ New Features
|
|||
----------------------
|
||||
* SOLR-11285: Simulation framework for autoscaling. (ab)
|
||||
|
||||
* LUCENE-2899: In the Solr analysis-extras contrib, added support for the
|
||||
OpenNLP-based analysis components in the Lucene analysis/opennlp module:
|
||||
tokenization, part-of-speech tagging, phrase chunking, and lemmatization.
|
||||
Also added OpenNLP-based named entity extraction as a Solr update request
|
||||
processor. (Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai Gülzau,
|
||||
Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
The analysis-extras plugin provides additional analyzers that rely
|
||||
upon large dependencies/dictionaries.
|
||||
|
||||
It includes integration with ICU for multilingual support, and
|
||||
analyzers for Chinese and Polish.
|
||||
It includes integration with ICU for multilingual support,
|
||||
analyzers for Chinese and Polish, and integration with
|
||||
OpenNLP for multilingual tokenization, part-of-speech tagging
|
||||
lemmatization, phrase chunking, and named-entity recognition.
|
||||
|
||||
ICU relies upon lucene-libs/lucene-analyzers-icu-X.Y.jar
|
||||
and lib/icu4j-X.Y.jar
|
||||
|
@ -14,3 +16,5 @@ Stempel relies on lucene-libs/lucene-analyzers-stempel-X.Y.jar
|
|||
Morfologik relies on lucene-libs/lucene-analyzers-morfologik-X.Y.jar
|
||||
and lib/morfologik-*.jar
|
||||
|
||||
OpenNLP relies on lucene-libs/lucene-analyzers-opennlp-X.Y.jar
|
||||
and lib/opennlp-*.jar
|
||||
|
|
|
@ -30,13 +30,14 @@
|
|||
<path id="analysis.extras.lucene.libs">
|
||||
<pathelement location="${analyzers-icu.jar}"/>
|
||||
<!--
|
||||
Although the smartcn, stempel, and morfologik jars are not dependencies of
|
||||
Although the smartcn, stempel, morfologik and opennlp jars are not dependencies of
|
||||
code in the analysis-extras contrib, they must remain here in order to
|
||||
populate the Solr distribution
|
||||
-->
|
||||
<pathelement location="${analyzers-smartcn.jar}"/>
|
||||
<pathelement location="${analyzers-stempel.jar}"/>
|
||||
<pathelement location="${analyzers-morfologik.jar}"/>
|
||||
<pathelement location="${analyzers-opennlp.jar}"/>
|
||||
</path>
|
||||
|
||||
<path id="classpath">
|
||||
|
@ -54,7 +55,7 @@
|
|||
</path>
|
||||
|
||||
<!--
|
||||
Although the smartcn, stempel, and morfologik jars are not dependencies of
|
||||
Although the smartcn, stempel, morfologik and opennlp jars are not dependencies of
|
||||
code in the analysis-extras contrib, they must remain here in order to
|
||||
populate the Solr distribution
|
||||
-->
|
||||
|
@ -66,6 +67,7 @@
|
|||
<target name="jar-analyzers-smartcn"/>
|
||||
<target name="jar-analyzers-stempel"/>
|
||||
<target name="jar-analyzers-morfologik"/>
|
||||
<target name="jar-analyzers-opennlp"/>
|
||||
</antcall>
|
||||
<property name="analyzers-icu.uptodate" value="true"/> <!-- compile-time dependency -->
|
||||
<mkdir dir="${build.dir}/lucene-libs"/>
|
||||
|
@ -85,6 +87,6 @@
|
|||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="compile-core" depends="jar-analyzers-icu, solr-contrib-build.compile-core"/>
|
||||
<target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-opennlp, solr-contrib-build.compile-core"/>
|
||||
<target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
|
||||
</project>
|
||||
|
|
|
@ -24,6 +24,9 @@
|
|||
</configurations>
|
||||
<dependencies>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="${/com.ibm.icu/icu4j}" conf="compile"/>
|
||||
<dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" conf="compile" />
|
||||
<dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" conf="compile" />
|
||||
|
||||
<!--
|
||||
Although the 3rd party morfologik jars are not dependencies of code in
|
||||
the analysis-extras contrib, they must remain here in order to
|
||||
|
|
|
@ -0,0 +1,571 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import opennlp.tools.util.Span;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.SolrInputField;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.Pair;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
|
||||
import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
|
||||
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
|
||||
* any matching <code>source</code> field into a configured <code>dest</code> field, after
|
||||
* first tokenizing the source text using the index analyzer on the configured
|
||||
* <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
|
||||
* as the tokenizer. E.g.:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="opennlp-en-tokenization" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
* sentenceModel="en-sent.bin"
|
||||
* tokenizerModel="en-tokenizer.bin"/>
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
*
|
||||
* <p>See the <a href="OpenNLP website">http://opennlp.apache.org/models.html</a>
|
||||
* for information on downloading pre-trained models.</p>
|
||||
*
|
||||
* <p>
|
||||
* The <code>source</code> field(s) can be configured as either:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>One or more <code><str></code></li>
|
||||
* <li>An <code><arr></code> of <code><str></code></li>
|
||||
* <li>A <code><lst></code> containing
|
||||
* {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>The <code>dest</code> field can be a single <code><str></code>
|
||||
* containing the literal name of a destination field, or it may be a <code><lst></code> specifying a
|
||||
* regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
|
||||
* is used the pattern will be matched against all fields matched by the source selector, and the replacement
|
||||
* string (including any capture groups specified from the pattern) will be evaluated a using
|
||||
* {@link Matcher#replaceAll(String)} to generate the literal name of the destination field. Additionally,
|
||||
* an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
|
||||
* <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
|
||||
* the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
|
||||
* <code>dest</code> field will be populated.
|
||||
* </p>
|
||||
*
|
||||
* <p>If the resolved <code>dest</code> field already exists in the document, then the
|
||||
* named entities extracted from the <code>source</code> fields will be added to it.
|
||||
* </p>
|
||||
* <p>
|
||||
* In the example below:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Named entities will be extracted from the <code>text</code> field and added
|
||||
* to the <code>names_ss</code> field</li>
|
||||
* <li>Named entities will be extracted from both the <code>title</code> and
|
||||
* <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
|
||||
* <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
|
||||
* -- except for <code>notes_txt</code> -- and added into the <code>people_ss</code> field</li>
|
||||
* <li>Named entities will be extracted from any field with a name beginning with "desc" and
|
||||
* ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
|
||||
* not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
|
||||
* "key_description_people")</li>
|
||||
* <li>Named entities will be extracted from the <code>summary</code> field and added
|
||||
* to the <code>summary_person_ss</code> field, assuming that the modelFile only extracts
|
||||
* entities of type "person".</li>
|
||||
* </ul>
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <updateRequestProcessorChain name="multiple-extract">
|
||||
* <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
* <str name="modelFile">en-test-ner-person.bin</str>
|
||||
* <str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
* <str name="source">text</str>
|
||||
* <str name="dest">people_s</str>
|
||||
* </processor>
|
||||
* <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
* <str name="modelFile">en-test-ner-person.bin</str>
|
||||
* <str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
* <arr name="source">
|
||||
* <str>title</str>
|
||||
* <str>subtitle</str>
|
||||
* </arr>
|
||||
* <str name="dest">titular_people</str>
|
||||
* </processor>
|
||||
* <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
* <str name="modelFile">en-test-ner-person.bin</str>
|
||||
* <str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
* <lst name="source">
|
||||
* <str name="fieldRegex">.*_txt$</str>
|
||||
* <lst name="exclude">
|
||||
* <str name="fieldName">notes_txt</str>
|
||||
* </lst>
|
||||
* </lst>
|
||||
* <str name="dest">people_s</str>
|
||||
* </processor>
|
||||
* <processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
* <str name="modelFile">en-test-ner-person.bin</str>
|
||||
* <str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
* <lst name="source">
|
||||
* <str name="fieldRegex">^desc(.*)s$</str>
|
||||
* </lst>
|
||||
* <lst name="dest">
|
||||
* <str name="pattern">^desc(.*)s$</str>
|
||||
* <str name="replacement">key_desc$1_people</str>
|
||||
* </lst>
|
||||
* </processor>
|
||||
* <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
* <str name="modelFile">en-test-ner-person.bin</str>
|
||||
* <str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
* <str name="source">summary</str>
|
||||
* <str name="dest">summary_{EntityType}_s</str>
|
||||
* </processor>
|
||||
* </updateRequestProcessorChain>
|
||||
* </pre>
|
||||
*
|
||||
* @since 7.3.0
|
||||
*/
|
||||
public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
|
||||
extends UpdateRequestProcessorFactory implements SolrCoreAware {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
public static final String SOURCE_PARAM = "source";
|
||||
public static final String DEST_PARAM = "dest";
|
||||
public static final String PATTERN_PARAM = "pattern";
|
||||
public static final String REPLACEMENT_PARAM = "replacement";
|
||||
public static final String MODEL_PARAM = "modelFile";
|
||||
public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
|
||||
public static final String ENTITY_TYPE = "{EntityType}";
|
||||
|
||||
private SelectorParams srcInclusions = new SelectorParams();
|
||||
private Collection<SelectorParams> srcExclusions = new ArrayList<>();
|
||||
|
||||
private FieldNameSelector srcSelector = null;
|
||||
|
||||
private String modelFile = null;
|
||||
private String analyzerFieldType = null;
|
||||
|
||||
/**
|
||||
* If pattern is null, this this is a literal field name. If pattern is non-null then this
|
||||
* is a replacement string that may contain meta-characters (ie: capture group identifiers)
|
||||
* @see #pattern
|
||||
*/
|
||||
private String dest = null;
|
||||
/** @see #dest */
|
||||
private Pattern pattern = null;
|
||||
|
||||
protected final FieldNameSelector getSourceSelector() {
|
||||
if (null != srcSelector) return srcSelector;
|
||||
|
||||
throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public void init(NamedList args) {
|
||||
|
||||
// high level (loose) check for which type of config we have.
|
||||
//
|
||||
// individual init methods do more strict syntax checking
|
||||
if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) {
|
||||
initSourceSelectorSyntax(args);
|
||||
} else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
|
||||
initSimpleRegexReplacement(args);
|
||||
} else {
|
||||
throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
|
||||
DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
|
||||
PATTERN_PARAM + "' init params are mandatory");
|
||||
}
|
||||
|
||||
Object modelParam = args.remove(MODEL_PARAM);
|
||||
if (null == modelParam) {
|
||||
throw new SolrException(SERVER_ERROR, "Missing required init param '" + MODEL_PARAM + "'");
|
||||
}
|
||||
if ( ! (modelParam instanceof CharSequence)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + MODEL_PARAM + "' must be a <str>");
|
||||
}
|
||||
modelFile = modelParam.toString();
|
||||
|
||||
Object analyzerFieldTypeParam = args.remove(ANALYZER_FIELD_TYPE_PARAM);
|
||||
if (null == analyzerFieldTypeParam) {
|
||||
throw new SolrException(SERVER_ERROR, "Missing required init param '" + ANALYZER_FIELD_TYPE_PARAM + "'");
|
||||
}
|
||||
if ( ! (analyzerFieldTypeParam instanceof CharSequence)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + ANALYZER_FIELD_TYPE_PARAM + "' must be a <str>");
|
||||
}
|
||||
analyzerFieldType = analyzerFieldTypeParam.toString();
|
||||
|
||||
if (0 < args.size()) {
|
||||
throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
|
||||
}
|
||||
|
||||
super.init(args);
|
||||
}
|
||||
|
||||
/**
|
||||
* init helper method that should only be called when we know for certain that both the
|
||||
* "source" and "dest" init params do <em>not</em> exist.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void initSimpleRegexReplacement(NamedList args) {
|
||||
// The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
|
||||
// is used for the destination pattern...
|
||||
//
|
||||
// pattern != null && replacement != null
|
||||
//
|
||||
// ...as top level elements, with no other config options specified
|
||||
|
||||
// if we got here we know we had pattern and replacement, now check for the other two so that we can give a better
|
||||
// message than "unexpected"
|
||||
if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
|
||||
throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
|
||||
PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
|
||||
}
|
||||
|
||||
assert args.indexOf(SOURCE_PARAM, 0) < 0;
|
||||
|
||||
Object patt = args.remove(PATTERN_PARAM);
|
||||
Object replacement = args.remove(REPLACEMENT_PARAM);
|
||||
|
||||
if (null == patt || null == replacement) {
|
||||
throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
|
||||
REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
|
||||
DEST_PARAM + "' are not both specified");
|
||||
}
|
||||
|
||||
if (0 != args.size()) {
|
||||
throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
|
||||
PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
|
||||
"' to be combined with other options.");
|
||||
}
|
||||
|
||||
if (!(replacement instanceof String)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
|
||||
}
|
||||
if (!(patt instanceof String)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
|
||||
}
|
||||
|
||||
dest = replacement.toString();
|
||||
try {
|
||||
this.pattern = Pattern.compile(patt.toString());
|
||||
} catch (PatternSyntaxException pe) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
|
||||
" is not a valid regex pattern: " + patt, pe);
|
||||
|
||||
}
|
||||
srcInclusions = new SelectorParams();
|
||||
srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* init helper method that should only be called when we know for certain that both the
|
||||
* "source" and "dest" init params <em>do</em> exist.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void initSourceSelectorSyntax(NamedList args) {
|
||||
// Full and complete syntax where source and dest are mandatory.
|
||||
//
|
||||
// source may be a single string or a selector.
|
||||
// dest may be a single string or list containing pattern and replacement
|
||||
//
|
||||
// source != null && dest != null
|
||||
|
||||
// if we got here we know we had source and dest, now check for the other two so that we can give a better
|
||||
// message than "unexpected"
|
||||
if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
|
||||
throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
|
||||
SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
|
||||
}
|
||||
|
||||
Object d = args.remove(DEST_PARAM);
|
||||
assert null != d;
|
||||
|
||||
List<Object> sources = args.getAll(SOURCE_PARAM);
|
||||
assert null != sources;
|
||||
|
||||
if (1 == sources.size()) {
|
||||
if (sources.get(0) instanceof NamedList) {
|
||||
// nested set of selector options
|
||||
NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
|
||||
|
||||
srcInclusions = parseSelectorParams(selectorConfig);
|
||||
|
||||
List<Object> excList = selectorConfig.getAll("exclude");
|
||||
|
||||
for (Object excObj : excList) {
|
||||
if (null == excObj) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
|
||||
"' child 'exclude' can not be null");
|
||||
}
|
||||
if (!(excObj instanceof NamedList)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
|
||||
"' child 'exclude' must be <lst/>");
|
||||
}
|
||||
NamedList exc = (NamedList) excObj;
|
||||
srcExclusions.add(parseSelectorParams(exc));
|
||||
if (0 < exc.size()) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
|
||||
"' has unexpected 'exclude' sub-param(s): '"
|
||||
+ selectorConfig.getName(0) + "'");
|
||||
}
|
||||
// call once per instance
|
||||
selectorConfig.remove("exclude");
|
||||
}
|
||||
|
||||
if (0 < selectorConfig.size()) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
|
||||
"' contains unexpected child param(s): '" +
|
||||
selectorConfig.getName(0) + "'");
|
||||
}
|
||||
// consume from the named list so it doesn't interfere with subsequent processing
|
||||
sources.remove(0);
|
||||
}
|
||||
}
|
||||
if (1 <= sources.size()) {
|
||||
// source better be one or more strings
|
||||
srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
|
||||
}
|
||||
if (srcInclusions == null) {
|
||||
throw new SolrException(SERVER_ERROR,
|
||||
"Init params do not specify any field from which to extract entities, please supply either "
|
||||
+ SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs" +
|
||||
"for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
|
||||
}
|
||||
|
||||
if (d instanceof NamedList) {
|
||||
NamedList destList = (NamedList) d;
|
||||
|
||||
Object patt = destList.remove(PATTERN_PARAM);
|
||||
Object replacement = destList.remove(REPLACEMENT_PARAM);
|
||||
|
||||
if (null == patt || null == replacement) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
|
||||
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
|
||||
"' are both mandatory and can not be null");
|
||||
}
|
||||
if (! (patt instanceof String && replacement instanceof String)) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
|
||||
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
|
||||
"' must both be strings (i.e. <str>)");
|
||||
}
|
||||
if (0 != destList.size()) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
|
||||
+ destList.getName(0) + "'");
|
||||
}
|
||||
|
||||
try {
|
||||
this.pattern = Pattern.compile(patt.toString());
|
||||
} catch (PatternSyntaxException pe) {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
|
||||
" is not a valid regex pattern: " + patt, pe);
|
||||
}
|
||||
dest = replacement.toString();
|
||||
|
||||
} else if (d instanceof String) {
|
||||
dest = d.toString();
|
||||
} else {
|
||||
throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
|
||||
"(i.e. <str>) or a list (i.e. <lst>) containing '" +
|
||||
PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(final SolrCore core) {
|
||||
|
||||
srcSelector =
|
||||
FieldMutatingUpdateProcessor.createFieldNameSelector
|
||||
(core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
|
||||
|
||||
for (SelectorParams exc : srcExclusions) {
|
||||
srcSelector = FieldMutatingUpdateProcessor.wrap
|
||||
(srcSelector,
|
||||
FieldMutatingUpdateProcessor.createFieldNameSelector
|
||||
(core.getResourceLoader(), core, exc, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS));
|
||||
}
|
||||
try {
|
||||
OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final UpdateRequestProcessor getInstance
|
||||
(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
|
||||
final FieldNameSelector srcSelector = getSourceSelector();
|
||||
return new UpdateRequestProcessor(next) {
|
||||
private final NLPNERTaggerOp nerTaggerOp;
|
||||
private Analyzer analyzer = null;
|
||||
{
|
||||
try {
|
||||
nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
|
||||
FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
|
||||
if (fieldType == null) {
|
||||
throw new SolrException
|
||||
(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
|
||||
}
|
||||
analyzer = fieldType.getIndexAnalyzer();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processAdd(AddUpdateCommand cmd) throws IOException {
|
||||
|
||||
final SolrInputDocument doc = cmd.getSolrInputDocument();
|
||||
|
||||
// Destination may be regex replace string, or "{EntityType}" replaced by
|
||||
// each entity's type, both of which can cause multiple output fields.
|
||||
Map<String,SolrInputField> destMap = new HashMap<>();
|
||||
|
||||
// preserve initial values
|
||||
for (final String fname : doc.getFieldNames()) {
|
||||
if ( ! srcSelector.shouldMutate(fname)) continue;
|
||||
|
||||
Collection<Object> srcFieldValues = doc.getFieldValues(fname);
|
||||
if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
|
||||
|
||||
String resolvedDest = dest;
|
||||
|
||||
if (pattern != null) {
|
||||
Matcher matcher = pattern.matcher(fname);
|
||||
if (matcher.find()) {
|
||||
resolvedDest = matcher.replaceAll(dest);
|
||||
} else {
|
||||
log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
|
||||
"but replacement pattern did not match, field skipped.", fname);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for (Object val : srcFieldValues) {
|
||||
for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
|
||||
SolrInputField destField = null;
|
||||
String entityName = entity.first();
|
||||
String entityType = entity.second();
|
||||
resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
|
||||
if (doc.containsKey(resolvedDest)) {
|
||||
destField = doc.getField(resolvedDest);
|
||||
} else {
|
||||
SolrInputField targetField = destMap.get(resolvedDest);
|
||||
if (targetField == null) {
|
||||
destField = new SolrInputField(resolvedDest);
|
||||
} else {
|
||||
destField = targetField;
|
||||
}
|
||||
}
|
||||
destField.addValue(entityName);
|
||||
|
||||
// put it in map to avoid concurrent modification...
|
||||
destMap.put(resolvedDest, destField);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
|
||||
doc.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
super.processAdd(cmd);
|
||||
}
|
||||
|
||||
/** Using configured NER model, extracts (name, type) pairs from the given source field value */
|
||||
private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
|
||||
List<Pair<String,String>> entitiesWithType = new ArrayList<>();
|
||||
List<String> terms = new ArrayList<>();
|
||||
List<Integer> startOffsets = new ArrayList<>();
|
||||
List<Integer> endOffsets = new ArrayList<>();
|
||||
String fullText = srcFieldValue.toString();
|
||||
TokenStream tokenStream = analyzer.tokenStream("", fullText);
|
||||
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
|
||||
FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
|
||||
tokenStream.reset();
|
||||
synchronized (nerTaggerOp) {
|
||||
while (tokenStream.incrementToken()) {
|
||||
terms.add(termAtt.toString());
|
||||
startOffsets.add(offsetAtt.startOffset());
|
||||
endOffsets.add(offsetAtt.endOffset());
|
||||
boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
if (endOfSentence) { // extract named entities one sentence at a time
|
||||
extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
|
||||
}
|
||||
}
|
||||
tokenStream.end();
|
||||
tokenStream.close();
|
||||
if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
|
||||
extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
|
||||
}
|
||||
nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
|
||||
}
|
||||
return entitiesWithType;
|
||||
}
|
||||
|
||||
private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
|
||||
List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
|
||||
for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
|
||||
String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
|
||||
entitiesWithType.add(new Pair<>(text, span.getType()));
|
||||
}
|
||||
terms.clear();
|
||||
startOffsets.clear();
|
||||
endOffsets.clear();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** macro */
|
||||
private static SelectorParams parseSelectorParams(NamedList args) {
|
||||
return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!-- not a package-info.java, because we already defined this package in core/ -->
|
||||
<html>
|
||||
<body>
|
||||
Update request processor invoking OpenNLP Named Entity Recognition over configured
|
||||
source field(s), populating configured target field(s) with the results.
|
||||
</body>
|
||||
</html>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,49 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<schema name="test-opennlp-extract" version="1.6">
|
||||
<fieldType name="opennlp-en-tokenization" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-test-sent.bin"
|
||||
tokenizerModel="en-test-tokenizer.bin"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
|
||||
|
||||
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="subject" type="text" indexed="true" stored="true"/>
|
||||
<field name="title" type="text" indexed="true" stored="true"/>
|
||||
<field name="subtitle" type="text" indexed="true" stored="true"/>
|
||||
<field name="descs" type="text" indexed="true" stored="true"/>
|
||||
<field name="descriptions" type="text" indexed="true" stored="true"/>
|
||||
|
||||
<dynamicField name="*_txt" type="text" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||
<dynamicField name="*_people" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||
</schema>
|
|
@ -0,0 +1,206 @@
|
|||
<?xml version="1.0" ?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<config>
|
||||
<luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
|
||||
<xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
|
||||
<requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
|
||||
<requestHandler name="/update" class="solr.UpdateRequestHandler" />
|
||||
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
|
||||
<schemaFactory class="ClassicIndexSchemaFactory"/>
|
||||
|
||||
<updateRequestProcessorChain name="extract-single">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">source1_s</str>
|
||||
<str name="dest">dest_s</str>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-single-regex">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">source1_s</str>
|
||||
<lst name="dest">
|
||||
<str name="pattern">source\d(_s)</str>
|
||||
<str name="replacement">dest$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-multi">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">source1_s</str>
|
||||
<str name="source">source2_s</str>
|
||||
<str name="dest">dest_s</str>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-multi-regex">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">source1_s</str>
|
||||
<str name="source">source2_s</str>
|
||||
<lst name="dest">
|
||||
<str name="pattern">source\d(_s)</str>
|
||||
<str name="replacement">dest$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-array">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<arr name="source">
|
||||
<str>source1_s</str>
|
||||
<str>source2_s</str>
|
||||
</arr>
|
||||
<str name="dest">dest_s</str>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-array-regex">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<arr name="source">
|
||||
<str>source1_s</str>
|
||||
<str>source2_s</str>
|
||||
</arr>
|
||||
<lst name="dest">
|
||||
<str name="pattern">source\d(_s)</str>
|
||||
<str name="replacement">dest$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-selector">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">source\d_.*</str>
|
||||
<lst name="exclude">
|
||||
<str name="fieldRegex">source0_.*</str>
|
||||
</lst>
|
||||
</lst>
|
||||
<str name="dest">dest_s</str>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-selector-regex">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">source\d_.*</str>
|
||||
<lst name="exclude">
|
||||
<str name="fieldRegex">source0_.*</str>
|
||||
</lst>
|
||||
</lst>
|
||||
<lst name="dest">
|
||||
<str name="pattern">source\d(_s)</str>
|
||||
<str name="replacement">dest$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-regex-replaceall">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">foo.*</str>
|
||||
</lst>
|
||||
<lst name="dest">
|
||||
<!-- unbounded pattern that can be replaced multiple times in field name -->
|
||||
<str name="pattern">x(\d)</str>
|
||||
<str name="replacement">y$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<updateRequestProcessorChain name="extract-regex-replaceall-with-entity-type">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">foo.*</str>
|
||||
</lst>
|
||||
<lst name="dest">
|
||||
<!-- unbounded pattern that can be replaced multiple times in field name -->
|
||||
<str name="pattern">x(\d)</str>
|
||||
<str name="replacement">{EntityType}_y$1</str>
|
||||
</lst>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
<!-- example used in OpenNLPExtractNamedEntitiesUpdateProcessorFactory javadocs -->
|
||||
<updateRequestProcessorChain name="multiple-extract">
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">text</str>
|
||||
<str name="dest">people_s</str>
|
||||
</processor>
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<arr name="source">
|
||||
<str>title</str>
|
||||
<str>subtitle</str>
|
||||
</arr>
|
||||
<str name="dest">titular_people</str>
|
||||
</processor>
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">.*_txt$</str>
|
||||
<lst name="exclude">
|
||||
<str name="fieldName">notes_txt</str>
|
||||
</lst>
|
||||
</lst>
|
||||
<str name="dest">people_s</str>
|
||||
</processor>
|
||||
<processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<lst name="source">
|
||||
<str name="fieldRegex">^desc(.*)s$</str>
|
||||
</lst>
|
||||
<lst name="dest">
|
||||
<str name="pattern">^desc(.*)s$</str>
|
||||
<str name="replacement">key_desc$1_people</str>
|
||||
</lst>
|
||||
</processor>
|
||||
<processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
|
||||
<str name="modelFile">en-test-ner-person.bin</str>
|
||||
<str name="analyzerFieldType">opennlp-en-tokenization</str>
|
||||
<str name="source">summary</str>
|
||||
<str name="dest">summary_{EntityType}_s</str>
|
||||
</processor>
|
||||
</updateRequestProcessorChain>
|
||||
</config>
|
|
@ -0,0 +1,48 @@
|
|||
<?xml version="1.0" ?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
A solrconfig.xml snippet containing indexConfig settings for randomized testing.
|
||||
-->
|
||||
<indexConfig>
|
||||
<!-- this sys property is not set by SolrTestCaseJ4 because we ideally want to use
|
||||
the RandomMergePolicy in all tests - but some tests expect very specific
|
||||
Merge behavior, so those tests can set it as needed.
|
||||
-->
|
||||
<mergePolicyFactory class="${solr.tests.mergePolicyFactory:org.apache.solr.util.RandomMergePolicyFactory}" />
|
||||
|
||||
<useCompoundFile>${useCompoundFile:false}</useCompoundFile>
|
||||
|
||||
<maxBufferedDocs>${solr.tests.maxBufferedDocs}</maxBufferedDocs>
|
||||
<ramBufferSizeMB>${solr.tests.ramBufferSizeMB}</ramBufferSizeMB>
|
||||
|
||||
<mergeScheduler class="${solr.tests.mergeScheduler}" />
|
||||
|
||||
<writeLockTimeout>1000</writeLockTimeout>
|
||||
<commitLockTimeout>10000</commitLockTimeout>
|
||||
|
||||
<!-- this sys property is not set by SolrTestCaseJ4 because almost all tests should
|
||||
use the single process lockType for speed - but tests that explicitly need
|
||||
to vary the lockType can set it as needed.
|
||||
-->
|
||||
<lockType>${solr.tests.lockType:single}</lockType>
|
||||
|
||||
<infoStream>${solr.tests.infostream:false}</infoStream>
|
||||
|
||||
</indexConfig>
|
|
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory extends UpdateProcessorTestBase {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
File testHome = createTempDir().toFile();
|
||||
FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
|
||||
initCore("solrconfig-opennlp-extract.xml", "schema-opennlp-extract.xml", testHome.getAbsolutePath());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleExtract() throws Exception {
|
||||
SolrInputDocument doc = processAdd("extract-single",
|
||||
doc(f("id", "1"),
|
||||
f("source1_s", "Take this to Mr. Flashman.")));
|
||||
assertEquals("dest_s should have stringValue", "Flashman", doc.getFieldValue("dest_s"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiExtract() throws Exception {
|
||||
SolrInputDocument doc = processAdd("extract-multi",
|
||||
doc(f("id", "1"),
|
||||
f("source1_s", "Hello Flashman."),
|
||||
f("source2_s", "Calling Flashman.")));
|
||||
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArrayExtract() throws Exception {
|
||||
SolrInputDocument doc = processAdd("extract-array",
|
||||
doc(f("id", "1"),
|
||||
f("source1_s", "Currently we have Flashman. Not much else."),
|
||||
f("source2_s", "Flashman. Is. Not. There.")));
|
||||
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectorExtract() throws Exception {
|
||||
SolrInputDocument doc = processAdd("extract-selector",
|
||||
doc(f("id", "1"),
|
||||
f("source0_s", "Flashman. Or not."),
|
||||
f("source1_s", "Serendipitously, he was. I mean, Flashman. And yet."),
|
||||
f("source2_s", "Correct, Flashman.")));
|
||||
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("dest_s"));
|
||||
}
|
||||
|
||||
public void testMultipleExtracts() throws Exception {
|
||||
// test example from the javadocs
|
||||
SolrInputDocument doc = processAdd("multiple-extract",
|
||||
doc(f("id", "1"),
|
||||
f("text", "From Flashman. To Panman."),
|
||||
f("title", "It's Captain Flashman.", "Privately, Flashman."),
|
||||
f("subtitle", "Ineluctably, Flashman."),
|
||||
f("corrolary_txt", "Forsooth thou bringeth Flashman."),
|
||||
f("notes_txt", "Yes Flashman."),
|
||||
f("summary", "Many aspire to be Flashman."),
|
||||
f("descs", "Courage, Flashman.", "Ain't he Flashman."),
|
||||
f("descriptions", "Flashman. Flashman. Flashman.")));
|
||||
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("people_s"));
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"), doc.getFieldValues("titular_people"));
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), doc.getFieldValues("key_desc_people"));
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"), doc.getFieldValues("key_description_people"));
|
||||
assertEquals("Flashman", doc.getFieldValue("summary_person_s")); // {EntityType} field name interpolation
|
||||
}
|
||||
|
||||
public void testEquivalentExtraction() throws Exception {
|
||||
SolrInputDocument d;
|
||||
|
||||
// regardless of chain, all of these checks should be equivalent
|
||||
for (String chain : Arrays.asList("extract-single", "extract-single-regex",
|
||||
"extract-multi", "extract-multi-regex",
|
||||
"extract-array", "extract-array-regex",
|
||||
"extract-selector", "extract-selector-regex")) {
|
||||
|
||||
// simple extract
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
f("source0_s", "Totally Flashman."), // not extracted
|
||||
f("source1_s", "One nation under Flashman.", "Good Flashman.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
|
||||
// append to existing values
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
field("dest_s", "orig1", "orig2"),
|
||||
f("source0_s", "Flashman. In totality."), // not extracted
|
||||
f("source1_s", "Two nations under Flashman.", "Meh Flashman.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
}
|
||||
|
||||
// should be equivalent for any chain matching source1_s and source2_s (but not source0_s)
|
||||
for (String chain : Arrays.asList("extract-multi", "extract-multi-regex",
|
||||
"extract-array", "extract-array-regex",
|
||||
"extract-selector", "extract-selector-regex")) {
|
||||
|
||||
// simple extract
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
f("source0_s", "Not Flashman."), // not extracted
|
||||
f("source1_s", "Could have had a Flashman.", "Bad Flashman."),
|
||||
f("source2_s", "Indubitably Flashman.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("Flashman", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
|
||||
// append to existing values
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
field("dest_s", "orig1", "orig2"),
|
||||
f("source0_s", "Never Flashman."), // not extracted
|
||||
f("source1_s", "Seeking Flashman.", "Evil incarnate Flashman."),
|
||||
f("source2_s", "Perfunctorily Flashman.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
}
|
||||
|
||||
// any chain that copies source1_s to dest_s should be equivalent for these assertions
|
||||
for (String chain : Arrays.asList("extract-single", "extract-single-regex",
|
||||
"extract-multi", "extract-multi-regex",
|
||||
"extract-array", "extract-array-regex",
|
||||
"extract-selector", "extract-selector-regex")) {
|
||||
|
||||
// simple extract
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
f("source1_s", "Always Flashman.", "Flashman. Noone else.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
|
||||
// append to existing values
|
||||
d = processAdd(chain,
|
||||
doc(f("id", "1111"),
|
||||
field("dest_s", "orig1", "orig2"),
|
||||
f("source1_s", "Flashman. And, scene.", "Contemporary Flashman. Yeesh.")));
|
||||
assertNotNull(chain, d);
|
||||
assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman", "Flashman"), d.getFieldValues("dest_s"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testExtractFieldRegexReplaceAll() throws Exception {
|
||||
SolrInputDocument d = processAdd("extract-regex-replaceall",
|
||||
doc(f("id", "1111"),
|
||||
f("foo_x2_s", "Infrequently Flashman.", "In the words of Flashman."),
|
||||
f("foo_x3_x7_s", "Flashman. Whoa.")));
|
||||
|
||||
assertNotNull(d);
|
||||
assertEquals(Arrays.asList("Flashman", "Flashman"), d.getFieldValues("foo_y2_s"));
|
||||
assertEquals("Flashman", d.getFieldValue("foo_y3_y7_s"));
|
||||
}
|
||||
|
||||
public void testExtractFieldRegexReplaceAllWithEntityType() throws Exception {
|
||||
SolrInputDocument d = processAdd("extract-regex-replaceall-with-entity-type",
|
||||
doc(f("id", "1111"),
|
||||
f("foo_x2_s", "Infrequently Flashman.", "In the words of Flashman."),
|
||||
f("foo_x3_x7_s", "Flashman. Whoa.")));
|
||||
|
||||
assertNotNull(d);
|
||||
assertEquals(d.getFieldNames().toString(), Arrays.asList("Flashman", "Flashman"), d.getFieldValues("foo_person_y2_s"));
|
||||
assertEquals(d.getFieldNames().toString(),"Flashman", d.getFieldValue("foo_person_y3_person_y7_s"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
55e39e6b46e71f35229cdd6950e72d8cce3b5fd4
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
Apache OpenNLP Maxent
|
||||
Copyright 2013 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
|
@ -0,0 +1 @@
|
|||
3ce7c9056048f55478d983248cf18c7e02b1d072
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
Apache OpenNLP Tools
|
||||
Copyright 2015 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
|
@ -1576,6 +1576,38 @@ This filter adds the token's type, as an encoded byte sequence, as its payload.
|
|||
|
||||
*Out:* "Pay"[<ALPHANUM>], "Bob's"[<APOSTROPHE>], "I.O.U."[<ACRONYM>]
|
||||
|
||||
== Type As Synonym Filter
|
||||
|
||||
This filter adds the token's type, as a token at the same position as the token, optionally with a configurable prefix prepended.
|
||||
|
||||
*Factory class:* `solr.TypeAsSynonymFilterFactory`
|
||||
|
||||
*Arguments:*
|
||||
|
||||
`prefix`:: (optional) The prefix to prepend to the token's type.
|
||||
|
||||
*Examples:*
|
||||
|
||||
With the example below, each token's type will be emitted verbatim at the same position:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.TypeAsSynonymFilterFactory"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
With the example below, for a token "example.com" with type `<URL>`, the token emitted at the same position will be "\_type_<URL>":
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
|
||||
<filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
== Type Token Filter
|
||||
|
||||
This filter blacklists or whitelists a specified list of token types, assuming the tokens have type metadata associated with them. For example, the <<tokenizers.adoc#uax29-url-email-tokenizer,UAX29 URL Email Tokenizer>> emits "<URL>" and "<EMAIL>" typed tokens, as well as other types. This filter would allow you to pull out only e-mail addresses from text as tokens, if you wish.
|
||||
|
|
|
@ -355,6 +355,214 @@ This can increase recall by causing more matches. On the other hand, it can redu
|
|||
</analyzer>
|
||||
----
|
||||
|
||||
== OpenNLP Integration
|
||||
|
||||
The `lucene/analysis/opennlp` module provides OpenNLP integration via several analysis components: a tokenizer, a part-of-speech tagging filter, a phrase chunking filter, and a lemmatization filter. In addition to these analysis components, Solr also provides an update request processor to extract named entities - see <<update-request-processors.adoc#update-processor-factories-that-can-be-loaded-as-plugins,Update Processor Factories That Can Be Loaded as Plugins>>.
|
||||
|
||||
NOTE: The <<OpenNLP Tokenizer>> must be used with all other OpenNLP analysis components, for two reasons: first, the OpenNLP Tokenizer detects and marks the sentence boundaries required by all the OpenNLP filters; and second, since the pre-trained OpenNLP models used by these filters were trained using the corresponding language-specific sentence-detection/tokenization models, the same tokenization, using the same models, must be used at runtime for optimal performance.
|
||||
|
||||
See `solr/contrib/analysis-extras/README.txt` for information on which jars you need to add to your `SOLR_HOME/lib`.
|
||||
|
||||
=== OpenNLP Tokenizer
|
||||
|
||||
The OpenNLP Tokenizer takes two language-specific binary model files as parameters: a sentence detector model and a tokenizer model. The last token in each sentence is flagged, so that following OpenNLP-based filters can use this information to apply operations to tokens one sentence at a time. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
|
||||
|
||||
*Factory class:* `solr.OpenNLPTokenizerFactory`
|
||||
|
||||
*Arguments:*
|
||||
|
||||
`sentenceModel`:: (required) The path of a language-specific OpenNLP sentence detection model file. This path may be an absolute path, or path relative to the Solr config directory.
|
||||
|
||||
`tokenizerModel`:: (required) The path of a language-specific OpenNLP tokenization model file. This path may be an absolute path, or path relative to the Solr config directory.
|
||||
|
||||
*Example:*
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
=== OpenNLP Part-Of-Speech Filter
|
||||
|
||||
This filter sets each token's type attribute to the part of speech (POS) assigned by the configured model. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
|
||||
|
||||
NOTE: Lucene currently does not index token types, so if you want to keep this information, you have to preserve it either in a payload or as a synonym; see the examples below.
|
||||
|
||||
*Factory class:* `solr.OpenNLPPOSFilterFactory`
|
||||
|
||||
*Arguments:*
|
||||
|
||||
`posTaggerModel`:: (required) The path of a language-specific OpenNLP POS tagger model file. This path may be an absolute path, or path relative to the Solr config directory.
|
||||
|
||||
*Examples:*
|
||||
|
||||
The OpenNLP tokenizer will tokenize punctuation, which is useful for following token filters, but ordinarily you don't want to include punctuation in your index, so the `TypeTokenFilter` (<<filter-descriptions.adoc#type-token-filter,described here>>) is included in the examples below, with `stop.pos.txt` containing the following:
|
||||
|
||||
.stop.pos.txt
|
||||
[source,text]
|
||||
----
|
||||
#
|
||||
$
|
||||
''
|
||||
``
|
||||
,
|
||||
-LRB-
|
||||
-RRB-
|
||||
:
|
||||
.
|
||||
----
|
||||
|
||||
Index the POS for each token as a payload:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.TypeAsPayloadFilterFactory"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
Index the POS for each token as a synonym, after prefixing the POS with "@" (see the <<filter-descriptions.adoc#type-as-synonym-filter,TypeAsSynonymFilter description>>):
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.TypeAsSynonymFilterFactory" prefix="@"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
Only index nouns - the `keep.pos.txt` file contains lines `NN`, `NNS`, `NNP` and `NNPS`:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="keep.pos.txt" useWhitelist="true"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
=== OpenNLP Phrase Chunking Filter
|
||||
|
||||
This filter sets each token's type attribute based on the output of an OpenNLP phrase chunking model. The chunk labels replace the POS tags that previously were in each token's type attribute. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
|
||||
|
||||
Prerequisite: the <<OpenNLP Tokenizer>> and the <<OpenNLP Part-Of-Speech Filter>> must precede this filter.
|
||||
|
||||
NOTE: Lucene currently does not index token types, so if you want to keep this information, you have to preserve it either in a payload or as a synonym; see the examples below.
|
||||
|
||||
*Factory class:* `solr.OpenNLPChunkerFilterFactory`
|
||||
|
||||
*Arguments:*
|
||||
|
||||
`chunkerModel`:: (required) The path of a language-specific OpenNLP phrase chunker model file. This path may be an absolute path, or path relative to the Solr config directory.
|
||||
|
||||
*Examples*:
|
||||
|
||||
Index the phrase chunk label for each token as a payload:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="en-chunker.bin"/>
|
||||
<filter class="solr.TypeAsPayloadFilterFactory"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
Index the phrase chunk label for each token as a synonym, after prefixing it with "#" (see the <<filter-descriptions.adoc#type-as-synonym-filter,TypeAsSynonymFilter description>>):
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="en-chunker.bin"/>
|
||||
<filter class="solr.TypeAsSynonymFilterFactory" prefix="#"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
=== OpenNLP Lemmatizer Filter
|
||||
|
||||
This filter replaces the text of each token with its lemma. Both a dictionary-based lemmatizer and a model-based lemmatizer are supported. If both are configured, the dictionary-based lemmatizer is tried first, and then the model-based lemmatizer is consulted for out-of-vocabulary tokens. See the http://opennlp.apache.org/models.html[OpenNLP website] for information on downloading pre-trained models.
|
||||
|
||||
*Factory class:* `solr.OpenNLPLemmatizerFilterFactory`
|
||||
|
||||
*Arguments:*
|
||||
|
||||
Either `dictionary` or `lemmatizerModel` must be provided, and both may be provided - see the examples below:
|
||||
|
||||
`dictionary`:: (optional) The path of a lemmatization dictionary file. This path may be an absolute path, or path relative to the Solr config directory. The dictionary file must be encoded as UTF-8, with one entry per line, in the form `word[tab]lemma[tab]part-of-speech`, e.g. `wrote[tab]write[tab]VBD`.
|
||||
|
||||
`lemmatizerModel`:: (optional) The path of a language-specific OpenNLP lemmatizer model file. This path may be an absolute path, or path relative to the Solr config directory.
|
||||
|
||||
*Examples:*
|
||||
|
||||
Perform dictionary-based lemmatization, and fall back to model-based lemmatization for out-of-vocabulary tokens (see the <<OpenNLP Part-Of-Speech Filter>> section above for information about using `TypeTokenFilter` to avoid indexing punctuation):
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.OpenNLPLemmatizerFilterFactory"
|
||||
dictionary="lemmas.txt"
|
||||
lemmatizerModel="en-lemmatizer.bin"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
Perform dictionary-based lemmatization only:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.OpenNLPLemmatizerFilterFactory" dictionary="lemmas.txt"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
Perform model-based lemmatization only, preserving the original token and emitting the lemma as a synonym (see the <<KeywordRepeatFilterFactory,KeywordRepeatFilterFactory description>>):
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
<analyzer>
|
||||
<tokenizer class="solr.OpenNLPTokenizerFactory"
|
||||
sentenceModel="en-sent.bin"
|
||||
tokenizerModel="en-tokenizer.bin"/>
|
||||
<filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="en-pos-maxent.bin"/>
|
||||
<filter class="solr.KeywordRepeatFilterFactory"/>
|
||||
<filter class="solr.OpenNLPLemmatizerFilterFactory" lemmatizerModel="en-lemmatizer.bin"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="stop.pos.txt"/>
|
||||
</analyzer>
|
||||
----
|
||||
|
||||
== Language-Specific Factories
|
||||
|
||||
These factories are each designed to work with specific languages. The languages covered here are:
|
||||
|
|
|
@ -502,3 +502,7 @@ Specifies how to define whitespace for the purpose of tokenization. Valid values
|
|||
*In:* "To be, or what?"
|
||||
|
||||
*Out:* "To", "be,", "or", "what?"
|
||||
|
||||
== OpenNLP Tokenizer and OpenNLP Filters
|
||||
|
||||
See <<language-analysis.adoc#opennlp-integration,OpenNLP Integration>> for information about using the OpenNLP Tokenizer, along with information about available OpenNLP token filters.
|
|
@ -275,6 +275,8 @@ What follows are brief descriptions of the currently available update request pr
|
|||
|
||||
{solr-javadocs}/solr-core/org/apache/solr/update/processor/IgnoreCommitOptimizeUpdateProcessorFactory.html[IgnoreCommitOptimizeUpdateProcessorFactory]:: Allows you to ignore commit and/or optimize requests from client applications when running in SolrCloud mode, for more information, see: Shards and Indexing Data in SolrCloud
|
||||
|
||||
{solr-javadocs}/solr-core/org/apache/solr/update/processor/CloneFieldUpdateProcessorFactory.html[CloneFieldUpdateProcessorFactory]:: Clones the values found in any matching _source_ field into the configured _dest_ field.
|
||||
|
||||
{solr-javadocs}/solr-core/org/apache/solr/update/processor/RegexpBoostProcessorFactory.html[RegexpBoostProcessorFactory]:: A processor which will match content of "inputField" against regular expressions found in "boostFilename", and if it matches will return the corresponding boost value from the file and output this to "boostField" as a double value.
|
||||
|
||||
{solr-javadocs}/solr-core/org/apache/solr/update/processor/SignatureUpdateProcessorFactory.html[SignatureUpdateProcessorFactory]:: Uses a defined set of fields to generate a hash "signature" for the document. Useful for only indexing one copy of "similar" documents.
|
||||
|
@ -351,6 +353,10 @@ The {solr-javadocs}/solr-uima/index.html[`uima`] contrib provides::
|
|||
|
||||
{solr-javadocs}/solr-uima/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.html[UIMAUpdateRequestProcessorFactory]::: Update document(s) to be indexed with UIMA extracted information.
|
||||
|
||||
The {solr-javadocs}/solr-analysis-extras/index.html[`analysis-extras`] contrib provides::
|
||||
|
||||
{solr-javadocs}/solr-analysis-extras/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesProcessorFactory.html[OpenNLPExtractNamedEntitiesProcessorFactory]::: Update document(s) to be indexed with named entities extracted using an OpenNLP NER model.
|
||||
|
||||
=== Update Processor Factories You Should _Not_ Modify or Remove
|
||||
|
||||
These are listed for completeness, but are part of the Solr infrastructure, particularly SolrCloud. Other than ensuring you do _not_ remove them when modifying the update request handlers (or any copies you make), you will rarely, if ever, need to change these.
|
||||
|
|
Loading…
Reference in New Issue