[SOLR-3013] - removing the ae package from Solr as it's now under analysis/uima module, adding the Solr factories for UIMA based tokenizers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1295330 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tommaso Teofili 2012-02-29 22:43:12 +00:00
parent 651a236696
commit 0c5c13e157
12 changed files with 1910 additions and 183 deletions

View File

@ -5,9 +5,12 @@ This file describes changes to the Solr UIMA (contrib/uima) module. See SOLR-212
Introduction
------------
This module is intended to be used while indexing documents.
Its purpose is to provide additional on the fly automatically generated fields to the Solr index.
This module is intended to be used both as an UpdateRequestProcessor while indexing documents and as a set of tokenizer/filters
to be configured inside the schema.xml for use during analysis phase.
The purpose of UIMAUpdateRequestProcessor is to provide additional fields, automatically generated on the fly, to the Solr index.
Such fields could be language, concepts, keywords, sentences, named entities, etc.
UIMA based tokenizers/filters can be used either inside plain Lucene or as index/query analyzers to be defined
inside the schema.xml of a Solr core to create/filter tokens using specific UIMA annotations.
UIMA Dependency
---------------

View File

@ -25,4 +25,18 @@
<import file="../contrib-build.xml"/>
<path id="classpath">
<pathelement path="${analyzers-uima.jar}"/>
<path refid="solr.base.classpath"/>
</path>
<target name="module-jars-to-solr" depends="jar-analyzers-uima">
<mkdir dir="${build.dir}/lucene-libs"/>
<copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
<fileset file="${analyzers-uima.jar}"/>
</copy>
</target>
<target name="compile-core" depends="jar-analyzers-uima, solr-contrib-build.compile-core"/>
<target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
</project>

View File

@ -1,6 +1,6 @@
package org.apache.solr.uima.processor.ae;
package org.apache.solr.uima.analysis;
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -17,16 +17,30 @@ package org.apache.solr.uima.processor.ae;
* limitations under the License.
*/
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;
import java.io.Reader;
import java.util.Map;
/**
* provide an Apache UIMA {@link AnalysisEngine}
*
*
* Solr {@link org.apache.solr.analysis.TokenizerFactory} for {@link UIMAAnnotationsTokenizer}
*/
public interface AEProvider {
public class UIMAAnnotationsTokenizerFactory extends BaseTokenizerFactory {
public AnalysisEngine getAE() throws ResourceInitializationException;
private String descriptorPath;
private String tokenType;
/**
 * Initializes the factory and stores the values found under the
 * "descriptorPath" and "tokenType" keys of the given arguments.
 *
 * @param args the factory arguments
 */
@Override
public void init(Map<String, String> args) {
  super.init(args);
  this.tokenType = args.get("tokenType");
  this.descriptorPath = args.get("descriptorPath");
}
/**
 * Creates a {@link UIMAAnnotationsTokenizer} from the stored descriptor path
 * and token type, reading from the given input.
 *
 * @param input the reader handed to the new tokenizer
 * @return the newly created tokenizer
 */
@Override
public Tokenizer create(Reader input) {
  final Tokenizer tokenizer = new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input);
  return tokenizer;
}
}

View File

@ -0,0 +1,48 @@
package org.apache.solr.uima.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;
import java.io.Reader;
import java.util.Map;
/**
 * Solr {@link org.apache.solr.analysis.TokenizerFactory} that builds
 * {@link UIMATypeAwareAnnotationsTokenizer} instances.
 * <p>
 * The factory reads three arguments: "descriptorPath", "tokenType" and
 * "featurePath". Their values are passed unchanged to every tokenizer it creates.
 */
public class UIMATypeAwareAnnotationsTokenizerFactory extends BaseTokenizerFactory {

  /** Value of the "descriptorPath" argument. */
  private String descriptorPath;

  /** Value of the "tokenType" argument. */
  private String tokenType;

  /** Value of the "featurePath" argument. */
  private String featurePath;

  /**
   * Initializes the factory and stores the three tokenizer arguments.
   *
   * @param args the factory arguments
   */
  @Override
  public void init(Map<String, String> args) {
    super.init(args);
    this.featurePath = args.get("featurePath");
    this.tokenType = args.get("tokenType");
    this.descriptorPath = args.get("descriptorPath");
  }

  /**
   * Creates a tokenizer from the stored arguments, reading from the given input.
   *
   * @param input the reader handed to the new tokenizer
   * @return the newly created {@link UIMATypeAwareAnnotationsTokenizer}
   */
  @Override
  public Tokenizer create(Reader input) {
    final Tokenizer tokenizer =
        new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input);
    return tokenizer;
  }
}

View File

@ -23,8 +23,8 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
import org.apache.solr.uima.processor.ae.AEProvider;
import org.apache.solr.uima.processor.ae.AEProviderFactory;
import org.apache.lucene.analysis.uima.ae.AEProvider;
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.uima.analysis_engine.AnalysisEngine;

View File

@ -1,53 +0,0 @@
package org.apache.solr.uima.processor.ae;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
/**
 * Singleton factory responsible for creating and caching {@link AEProvider}s.
 * <p>
 * The single instance is created eagerly during class initialization, so
 * {@link #getInstance()} always returns the same object even when first called
 * from several threads at once.
 */
public class AEProviderFactory {

  /** The only instance; created once when the class is initialized. */
  private static final AEProviderFactory INSTANCE = new AEProviderFactory();

  /** Providers created so far, keyed by core name followed by the AE descriptor path. */
  private final Map<String, AEProvider> providerCache = new HashMap<String, AEProvider>();

  private AEProviderFactory() {
    // Singleton
  }

  /**
   * Returns the shared factory.
   *
   * @return the singleton {@link AEProviderFactory}
   */
  public static AEProviderFactory getInstance() {
    return INSTANCE;
  }

  /**
   * Returns the provider cached for the given core and descriptor path, creating
   * an {@link OverridingParamsAEProvider} the first time that combination is requested.
   *
   * @param core name used as the first part of the cache key
   * @param aePath path of the AE descriptor, used as the second part of the cache key
   * @param runtimeParameters parameters handed to a newly created provider; not used
   *        when a provider is already cached for the key
   * @return the cached or newly created provider
   */
  public synchronized AEProvider getAEProvider(String core, String aePath,
          Map<String, Object> runtimeParameters) {
    String key = new StringBuilder(core).append(aePath).toString();
    AEProvider provider = providerCache.get(key);
    if (provider == null) {
      provider = new OverridingParamsAEProvider(aePath, runtimeParameters);
      providerCache.put(key, provider);
    }
    return provider;
  }
}

View File

@ -1,117 +0,0 @@
package org.apache.solr.uima.processor.ae;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.net.URL;
import java.util.Map;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.XMLInputSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link AEProvider} implementation that creates an Aggregate AE from the given path, also
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
* them as overriding parameters in the aggregate AE
*
*
*/
public class OverridingParamsAEProvider implements AEProvider {
private static Logger log = LoggerFactory.getLogger(OverridingParamsAEProvider.class);
private String aeFilePath;
private AnalysisEngine cachedAE;
private Map<String, Object> runtimeParameters;
public OverridingParamsAEProvider(String aeFilePath, Map<String, Object> runtimeParameters) {
this.aeFilePath = aeFilePath;
this.runtimeParameters = runtimeParameters;
}
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
try {
if (cachedAE == null) {
// get Resource Specifier from XML file
URL url = this.getClass().getResource(aeFilePath);
XMLInputSource in = new XMLInputSource(url);
// get AE description
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
.parseAnalysisEngineDescription(in);
/* iterate over each AE (to set runtime parameters) */
for (String attributeName : runtimeParameters.keySet()) {
Object val = getRuntimeValue(desc, attributeName);
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
attributeName, val);
if (log.isDebugEnabled())
log.debug(new StringBuilder("setting ").append(attributeName).append(" : ").append(
runtimeParameters.get(attributeName)).toString());
}
// create AE here
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
if (log.isDebugEnabled())
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
.append(" created from descriptor ").append(aeFilePath).toString());
} else {
cachedAE.reconfigure();
if (log.isDebugEnabled())
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
.append(" at path ").append(aeFilePath).append(" reconfigured ").toString());
}
} catch (Exception e) {
cachedAE = null;
throw new ResourceInitializationException(e);
}
return cachedAE;
}
/* create the value to inject in the runtime parameter depending on its declared type */
private Object getRuntimeValue(AnalysisEngineDescription desc, String attributeName)
throws ClassNotFoundException {
String type = desc.getAnalysisEngineMetaData().getConfigurationParameterDeclarations().
getConfigurationParameter(null, attributeName).getType();
// TODO : do it via reflection ? i.e. Class paramType = Class.forName(type)...
Object val = null;
Object runtimeValue = runtimeParameters.get(attributeName);
if (runtimeValue!=null) {
if ("String".equals(type)) {
val = String.valueOf(runtimeValue);
}
else if ("Integer".equals(type)) {
val = Integer.valueOf(runtimeValue.toString());
}
else if ("Boolean".equals(type)) {
val = Boolean.valueOf(runtimeValue.toString());
}
else if ("Float".equals(type)) {
val = Float.valueOf(runtimeValue.toString());
}
}
return val;
}
}

View File

@ -0,0 +1,25 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
vbg
vbz
vbd
vbn
vb
bez
cc
cd
at
.
:

View File

@ -0,0 +1,680 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version
2.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
applicable law or agreed to in writing, software distributed under
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and
limitations under the License.
-->
<!--
This is the Solr schema file. This file should be named "schema.xml"
and should be in the conf directory under the solr home (i.e.
./solr/conf/schema.xml by default) or located where the classloader
for the Solr webapp can find it. This example schema is the
recommended starting point for users. It should be kept correct and
concise, usable out-of-the-box. For more information on how to
customize this file, please see
http://wiki.apache.org/solr/SchemaXml PERFORMANCE NOTE: this schema
includes many optional features and should not be used for
benchmarking. To improve performance one could - set stored="false"
for all fields possible (esp large fields) when you only need to
search on the field but don't need to return the original value. -
set indexed="false" if you don't need to search on the field, but
only return the field as a result of searching on other indexed
fields. - remove all unneeded copyField statements - for best index
size and searching performance, set "indexed" to false for all general
text fields, use copyField to copy them to the catchall "text"
field, and use that for searching. - For maximum indexing
performance, use the StreamingUpdateSolrServer java client. -
Remember to run the JVM in server mode, and use a higher logging
level that avoids logging every request
-->
<schema name="sample" version="1.2">
<!--
attribute "name" is the name of this schema and is only used for
display purposes. Applications should change this to reflect the
nature of the search collection. version="1.2" is Solr's version
number for the schema syntax and semantics. It should not normally
be changed by applications. 1.0: multiValued attribute did not
exist, all fields are multiValued by nature 1.1: multiValued
attribute introduced, false by default 1.2: omitTermFreqAndPositions
attribute introduced, true by default except for text fields.
-->
<types>
<!--
field type definitions. The "name" attribute is just a label to be
used by field definitions. The "class" attribute and any other
attributes determine the real behavior of the fieldType. Class
names starting with "solr" refer to java classes in the
org.apache.solr.analysis package.
-->
<!--
The StrField type is not analyzed, but indexed/stored verbatim. -
StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values
which exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField"
sortMissingLast="true" omitNorms="true" />
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField"
sortMissingLast="true" omitNorms="true" />
<!--
Binary data type. The data should be sent/retrieved as Base64
encoded Strings
-->
<fieldtype name="binary" class="solr.BinaryField" />
<!--
The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as
strings. This includes
"string","boolean","sint","slong","sfloat","sdouble","pdate" - If
sortMissingLast="true", then a sort on this field will cause
documents without the field to come after documents with the
field, regardless of the requested sort order (asc or desc). - If
sortMissingFirst="true", then a sort on this field will cause
documents without the field to come before documents with the
field, regardless of the requested sort order. - If
sortMissingLast="false" and sortMissingFirst="false" (the
default), then default lucene sorting will be used which places
docs without the field first in an ascending sort and last in a
descending sort.
-->
<!--
Default numeric field types. For faster range queries, consider
the tint/tfloat/tlong/tdouble types.
-->
<fieldType name="int" class="solr.TrieIntField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="float" class="solr.TrieFloatField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="long" class="solr.TrieLongField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="double" class="solr.TrieDoubleField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<!--
Numeric field types that index each value at various levels of
precision to accelerate range queries when the number of values
between the range endpoints is large. See the javadoc for
NumericRangeQuery for internal implementation details. Smaller
precisionStep values (specified in bits) will lead to more tokens
indexed per value, slightly larger index size, and faster range
queries. A precisionStep of 0 disables indexing at different
precision levels.
-->
<fieldType name="tint" class="solr.TrieIntField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tfloat" class="solr.TrieFloatField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tlong" class="solr.TrieLongField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tdouble" class="solr.TrieDoubleField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<!--
The format for this date field is of the form
1995-12-31T23:59:59Z, and is a more restricted form of the
canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime The trailing "Z"
designates UTC time and is mandatory. Optional fractional seconds
are allowed: 1995-12-31T23:59:59.999Z All other components are
mandatory. Expressions can also be used to denote calculations
that should be performed relative to "NOW" to determine the value,
ie... NOW/HOUR ... Round to the start of the current hour NOW-1DAY
... Exactly 1 day prior to now NOW/DAY+6MONTHS+3DAYS ... 6 months
and 3 days in the future from the start of the current day Consult
the DateField javadocs for more information. Note: For faster
range queries, consider the tdate type
-->
<fieldType name="date" class="solr.TrieDateField"
omitNorms="true" precisionStep="0" positionIncrementGap="0" />
<!--
A Trie based date field for faster date range queries and date
faceting.
-->
<fieldType name="tdate" class="solr.TrieDateField"
omitNorms="true" precisionStep="6" positionIncrementGap="0" />
<!--
Note: These should only be used for compatibility with existing
indexes (created with older Solr versions) or if
"sortMissingFirst" or "sortMissingLast" functionality is needed.
Use Trie based fields instead. Plain numeric field types that
store and index the text value verbatim (and hence don't support
range queries, since the lexicographic ordering isn't equal to the
numeric ordering)
-->
<fieldType name="pint" class="solr.IntField" omitNorms="true" />
<fieldType name="plong" class="solr.LongField" omitNorms="true" />
<fieldType name="pfloat" class="solr.FloatField"
omitNorms="true" />
<fieldType name="pdouble" class="solr.DoubleField"
omitNorms="true" />
<fieldType name="pdate" class="solr.DateField"
sortMissingLast="true" omitNorms="true" />
<!--
Note: These should only be used for compatibility with existing
indexes (created with older Solr versions) or if
"sortMissingFirst" or "sortMissingLast" functionality is needed.
Use Trie based fields instead. Numeric field types that manipulate
the value into a string value that isn't human-readable in its
internal form, but with a lexicographic ordering the same as the
numeric ordering, so that range queries work correctly.
-->
<fieldType name="sint" class="solr.SortableIntField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="slong" class="solr.SortableLongField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="sfloat" class="solr.SortableFloatField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="sdouble" class="solr.SortableDoubleField"
sortMissingLast="true" omitNorms="true" />
<!--
The "RandomSortField" is not used to store or search any data. You
can declare fields of this type in your schema to generate
pseudo-random orderings of your docs for sorting purposes. The
ordering is generated based on the field name and the version of
the index. As long as the index version remains unchanged, and the
same field name is reused, the ordering of the docs will be
consistent. If you want different pseudo-random orderings of
documents, for the same version of the index, use a dynamicField
and change the name
-->
<fieldType name="random" class="solr.RandomSortField"
indexed="true" />
<!--
solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying. The optional
positionIncrementGap puts space between multiple fields of this
type on the same document, with the purpose of preventing false
phrase matching across fields. For more info on customizing your
analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!--
One can also specify an existing Analyzer class that has a default
constructor via the class attribute on the analyzer element
<fieldType name="text_greek" class="solr.TextField"> <analyzer
class="org.apache.lucene.analysis.el.GreekAnalyzer"/> </fieldType>
-->
<!--
A text field that only splits on whitespace for exact matching of
words
-->
<fieldType name="text_ws" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" />
</analyzer>
</fieldType>
<!--
A text field that uses WordDelimiterFilter to enable splitting and
matching of words on case-change, alpha numeric boundaries, and
non-alphanumeric chars, so that a query of "wifi" or "wi fi" could
match a document containing "Wi-Fi". Synonyms and stopwords are
customized by external files, and stemming is enabled.
-->
<fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory" />
<!--
in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true"
expand="false"/>
-->
<!--
Case insensitive stop word removal. add
enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
Less flexible matching, but fewer false matches. Probably not ideal
for product names, but may be good for SKUs. Can insert dashes in
the wrong place and still match.
-->
<fieldType name="textTight" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0" />
<filter class="solr.LowerCaseFilterFactory" />
<!--
this filter can remove any duplicate tokens that appear at the
same position - sometimes possible with WordDelimiterFilter in
conjunction with stemming.
-->
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldType>
<fieldType name="uima_sentences" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="org.apache.solr.uima.analysis.UIMAAnnotationsTokenizerFactory"
descriptorPath="/uima/AggregateSentenceAE.xml" tokenType="org.apache.uima.SentenceAnnotation"/>
</analyzer>
</fieldType>
<fieldType name="uima_nouns" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="org.apache.solr.uima.analysis.UIMATypeAwareAnnotationsTokenizerFactory"
descriptorPath="/uima/AggregateSentenceAE.xml" tokenType="org.apache.uima.TokenAnnotation"
featurePath="posTag"/>
<filter class="solr.TypeTokenFilterFactory" types="uima/stoptypes.txt" />
</analyzer>
</fieldType>
<!--
A general unstemmed text field - good if one does not know the
language of the field
-->
<fieldType name="textgen" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
A general unstemmed text field that indexes tokens normally and
also reversed (via ReversedWildcardFilterFactory), to enable more
efficient leading wildcard queries.
-->
<fieldType name="text_rev" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.ReversedWildcardFilterFactory"
withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
maxFractionAsterisk="0.33" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!-- charFilter + WhitespaceTokenizer -->
<!--
<fieldType name="textCharNorm" class="solr.TextField"
positionIncrementGap="100" > <analyzer> <charFilter
class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/> <tokenizer
class="solr.MockTokenizerFactory"/> </analyzer> </fieldType>
-->
<!--
This is an example of using the KeywordTokenizer along With
various TokenFilterFactories to produce a sortable field that does
not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField"
sortMissingLast="true" omitNorms="true">
<analyzer>
<!--
KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
<!--
The LowerCase TokenFilter does what you expect, which can be
useful when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!--
The PatternReplaceFilter gives you the flexibility to use Java
Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string, which
may include back references to portions of the original string
matched by the pattern. See the Java Regular Expression
documentation for more information on pattern and replacement
string syntax.
http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])"
replacement="" replace="all" />
</analyzer>
</fieldType>
<fieldtype name="phonetic" stored="false" indexed="true"
class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
</analyzer>
</fieldtype>
<fieldtype name="payloads" stored="false" indexed="true"
class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" />
<!--
The DelimitedPayloadTokenFilter can put payloads on tokens...
for example, a token of "foo|1.4" would be indexed as "foo"
with a payload of 1.4f Attributes of the
DelimitedPayloadTokenFilterFactory : "delimiter" - a one
character delimiter. Default is | (pipe) "encoder" - how to
encode the following value into a payload float ->
org.apache.lucene.analysis.payloads.FloatEncoder, integer ->
o.a.l.a.p.IntegerEncoder identity -> o.a.l.a.p.IdentityEncoder
Fully Qualified class name implementing PayloadEncoder,
Encoder must have a no arg constructor.
-->
<filter class="solr.DelimitedPayloadTokenFilterFactory"
encoder="float" />
</analyzer>
</fieldtype>
<!--
lowercases the entire field value, keeping it as a single token.
-->
<fieldType name="lowercase" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright.
-->
<fieldtype name="ignored" stored="false" indexed="false"
multiValued="true" class="solr.StrField" />
</types>
<fields>
<!--
Valid attributes for fields: name: mandatory - the name for the
field type: mandatory - the name of a previously defined type from
the <types> section indexed: true if this field should be indexed
(searchable or sortable) stored: true if this field should be
retrievable compressed: [false] if this field should be stored
using gzip compression (this will only apply if the field type is
compressible; among the standard field types, only TextField and
StrField are) multiValued: true if this field may contain multiple
values per document omitNorms: (expert) set to true to omit the
norms associated with this field (this disables length
normalization and index-time boosting for the field, and saves
some memory). Only full-text fields or fields that need an
index-time boost need norms. termVectors: [false] set to true to
store the term vector for a given field. When using MoreLikeThis,
fields used for similarity should be stored for best performance.
termPositions: Store position information with the term vector.
This will increase storage costs. termOffsets: Store offset
information with the term vector. This will increase storage
costs. default: a value that should be used if no value is
specified when adding a document.
-->
<field name="id" type="string" indexed="true" stored="true"
required="true" />
<field name="sku" type="textTight" indexed="true" stored="true"
omitNorms="true" />
<field name="name" type="textgen" indexed="true" stored="true" />
<field name="alphaNameSort" type="alphaOnlySort" indexed="true"
stored="false" />
<field name="manu" type="textgen" indexed="true" stored="true"
omitNorms="true" />
<field name="cat" type="text_ws" indexed="true" stored="true"
multiValued="true" omitNorms="true" />
<field name="features" type="text" indexed="true" stored="true"
multiValued="true" />
<field name="includes" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true" termOffsets="true" />
<field name="sentences" type="uima_sentences" indexed="true" stored="true" multiValued="true"
termVectors="true" termPositions="true" termOffsets="true" />
<field name="nouns" type="uima_nouns" indexed="true" stored="true" multiValued="true"
termVectors="true" termPositions="true" termOffsets="true" />
<field name="weight" type="float" indexed="true" stored="true" />
<field name="price" type="float" indexed="true" stored="true" />
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<!--
Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
Some fields are multiValued only because Tika currently may return
multiple values for them.
-->
<field name="title" type="text" indexed="true" stored="true"
multiValued="true" />
<field name="subject" type="text" indexed="true" stored="true" />
<field name="description" type="text" indexed="true" stored="true" />
<field name="comments" type="text" indexed="true" stored="true" />
<field name="author" type="textgen" indexed="true" stored="true" />
<field name="keywords" type="textgen" indexed="true" stored="true" />
<field name="category" type="textgen" indexed="true" stored="true" />
<field name="content_type" type="string" indexed="true"
stored="true" multiValued="true" />
<field name="last_modified" type="date" indexed="true" stored="true" />
<field name="links" type="string" indexed="true" stored="true"
multiValued="true" />
<!--
catchall field, containing all other searchable text fields
(implemented via copyField further on in this schema)
-->
<field name="text" type="text" indexed="true" stored="false"
multiValued="true" />
<!--
catchall text field that indexes tokens both normally and in
reverse for efficient leading wildcard queries.
-->
<field name="text_rev" type="text_rev" indexed="true" stored="false"
multiValued="true" />
<!--
non-tokenized version of manufacturer to make it easier to sort or
group results by manufacturer. copied from "manu" via copyField
-->
<field name="manu_exact" type="string" indexed="true" stored="false" />
<field name="payloads" type="payloads" indexed="true" stored="true" />
<!--
Uncommenting the following will create a "timestamp" field using a
default value of "NOW" to indicate when each document was indexed.
-->
<!--
<field name="timestamp" type="date" indexed="true" stored="true"
default="NOW" multiValued="false"/>
-->
<field name="language" type="string" indexed="true" stored="true" required="false"/>
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
<field name="sentiment" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="entity" type="text" indexed="true" stored="true" multiValued="true"/>
<!--
Dynamic field definitions. If a field name is not found,
dynamicFields will be used if the name matches any of the
patterns. RESTRICTION: the glob-like pattern in the name attribute
must have a "*" only at the start or the end. EXAMPLE: name="*_i"
will match any field ending in _i (like myid_i, z_i) Longer
patterns will be matched first. if equal size patterns both match,
the first appearing in the schema will be used. <dynamicField
name="*_i" type="int" indexed="true" stored="true"/> <dynamicField
name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true"
stored="true"/> <dynamicField name="*_t" type="text"
indexed="true" stored="true"/> <dynamicField name="*_b"
type="boolean" indexed="true" stored="true"/> <dynamicField
name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true"
stored="true"/> <dynamicField name="*_dt" type="date"
indexed="true" stored="true"/> <dynamicField name="*_ti"
type="tint" indexed="true" stored="true"/> <dynamicField
name="*_tl" type="tlong" indexed="true" stored="true"/>
<dynamicField name="*_tf" type="tfloat" indexed="true"
stored="true"/> <dynamicField name="*_td" type="tdouble"
indexed="true" stored="true"/> <dynamicField name="*_tdt"
type="tdate" indexed="true" stored="true"/> <dynamicField
name="*_pi" type="pint" indexed="true" stored="true"/>
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
<dynamicField name="attr_*" type="textgen" indexed="true"
stored="true" multiValued="true"/> <dynamicField name="random_*"
type="random" />
-->
<dynamicField name="*_sm" type="string" indexed="true" stored="true" multiValued="true"/>
<!--
uncomment the following to ignore any fields that don't already
match an existing field name or dynamic field, rather than
reporting them as an error. alternately, change the type="ignored"
to some other type e.g. "text" if you want unknown fields indexed
and/or stored by default
-->
<!--dynamicField name="*" type="ignored" multiValued="true" /-->
</fields>
<!--
Field to use to determine and enforce document uniqueness. Unless
this field is marked with required="false", it will be a required
field
-->
<uniqueKey>id</uniqueKey>
<!--
field for the QueryParser to use when an explicit fieldname is
absent
-->
<defaultSearchField>text</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR" />
<!--
copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field
differently, or to add multiple fields to the same field for
easier/faster searching.
-->
<copyField source="cat" dest="text" />
<copyField source="name" dest="text" />
<copyField source="manu" dest="text" />
<copyField source="features" dest="text" />
<copyField source="includes" dest="text" />
<copyField source="text" dest="nouns" />
<copyField source="text" dest="sentences" />
<copyField source="manu" dest="manu_exact" />
<!--copyField source="Titolo" dest="text"/-->
<!--
Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same destination
field is to use the dynamic field syntax. copyField also supports a
maxChars to copy setting.
-->
<!-- <copyField source="*_t" dest="text" maxChars="3000"/> -->
<!--
copy name to alphaNameSort, a field designed for sorting by name
-->
<!-- <copyField source="name" dest="alphaNameSort"/> -->
<!--
Similarity is the scoring routine for each document vs. a query. A
custom similarity may be specified here, but the default is fine for
most applications.
-->
<!--
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
-->
<!--
... OR ... Specify a SimilarityFactory class name implementation
allowing parameters to be used.
-->
<!--
<similarity class="com.example.solr.CustomSimilarityFactory"> <str
name="paramkey">param value</str> </similarity>
-->
</schema>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,49 @@
package org.apache.solr.uima.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Solr integration test for the field named {@code sentences}.
 * <p>
 * A test core is started from {@code uima/uima-tokenizers-solrconfig.xml} and
 * {@code uima/uima-tokenizers-schema.xml}. The tests check that the schema knows the
 * {@code sentences} field and that, after a document is added through its {@code text}
 * field, the {@code /terms} handler lists whole sentences as terms of {@code sentences}.
 * <p>
 * NOTE(review): presumably the schema copies {@code text} into {@code sentences} and analyzes
 * it with the UIMA annotations tokenizer; neither is visible from this class - confirm
 * against the schema file.
 */
public class UIMAAnnotationsTokenizerFactoryTest extends SolrTestCaseJ4 {

  /**
   * Starts the test core with the UIMA tokenizers solrconfig and schema.
   *
   * @throws Exception if the core cannot be initialized
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("uima/uima-tokenizers-solrconfig.xml", "uima/uima-tokenizers-schema.xml");
  }

  /**
   * Checks that the loaded schema resolves both a field and a field type for the name
   * {@code sentences}.
   */
  @Test
  public void testInitialization() throws Exception {
    assertNotNull(h.getCore().getSchema().getField("sentences"));
    assertNotNull(h.getCore().getSchema().getFieldType("sentences"));
  }

  /**
   * Adds one document whose {@code text} holds two sentences, commits, and expects each
   * sentence to show up as a term of {@code sentences} in the {@code /terms} response.
   */
  @Test
  public void testIndexAndQuery() throws Exception {
    assertU("<add><doc><field name=\"id\">123</field><field name=\"text\">One and 1 is two. Instead One or 1 is 0.</field></doc></add>");
    assertU(commit());
    SolrQueryRequest req = req("qt", "/terms", "terms.fl", "sentences");
    try {
      assertQ(req, "//lst[@name='sentences']/int[@name='One and 1 is two.']");
      // the second expected term keeps the blank that separates the two sentences in the input
      assertQ(req, "//lst[@name='sentences']/int[@name=' Instead One or 1 is 0.']");
    } finally {
      // release the request even when one of the assertions above fails
      req.close();
    }
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.solr.uima.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Solr integration test for the field named {@code nouns}.
 * <p>
 * A test core is started from {@code uima/uima-tokenizers-solrconfig.xml} and
 * {@code uima/uima-tokenizers-schema.xml}. The tests check that the schema knows the
 * {@code nouns} field and that, after a document is added through its {@code text} field,
 * the {@code /terms} handler lists the expected terms for {@code nouns}.
 * <p>
 * NOTE(review): presumably the schema copies {@code text} into {@code nouns} and analyzes it
 * with the UIMA type-aware annotations tokenizer; neither is visible from this class -
 * confirm against the schema file.
 */
public class UIMATypeAwareAnnotationsTokenizerFactoryTest extends SolrTestCaseJ4 {

  /**
   * Starts the test core with the UIMA tokenizers solrconfig and schema.
   *
   * @throws Exception if the core cannot be initialized
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("uima/uima-tokenizers-solrconfig.xml", "uima/uima-tokenizers-schema.xml");
  }

  /**
   * Checks that the loaded schema resolves both a field and a field type for the name
   * {@code nouns}.
   */
  @Test
  public void testInitialization() throws Exception {
    assertNotNull(h.getCore().getSchema().getField("nouns"));
    assertNotNull(h.getCore().getSchema().getFieldType("nouns"));
  }

  /**
   * Adds one document, commits, and inspects the terms of {@code nouns} returned by the
   * {@code /terms} handler: {@code beans} and {@code counter} must be listed.
   */
  @Test
  public void testIndexAndQuery() throws Exception {
    assertU("<add><doc><field name=\"id\">123</field><field name=\"text\">The counter counts the beans: 1 and 2 and three.</field></doc></add>");
    assertU(commit());
    SolrQueryRequest req = req("qt", "/terms", "terms.fl", "nouns");
    try {
      // terms that must be present
      assertQ(req, "//lst[@name='nouns']/int[@name='beans']");
      assertQ(req, "//lst[@name='nouns']/int[@name='counter']");
      // NOTE(review): the checks below look intended to assert that the remaining tokens are
      // absent. An XPath such as int[@name!='The'] selects every int whose name differs from
      // 'The', so it is presumably satisfied by any other term (e.g. 'beans') and does not
      // prove absence; not(//lst[@name='nouns']/int[@name='The']) would. Confirm against the
      // actual tagger output before tightening these expressions.
      assertQ(req, "//lst[@name='nouns']/int[@name!='The']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='counts']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='the']");
      assertQ(req, "//lst[@name='nouns']/int[@name!=':']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='1']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='and']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='2']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='three']");
      assertQ(req, "//lst[@name='nouns']/int[@name!='.']");
    } finally {
      // release the request even when one of the assertions above fails
      req.close();
    }
  }
}