mirror of https://github.com/apache/lucene.git
[SOLR-3013] - removing the ae package from Solr as it's now under analysis/uima module, adding the Solr factories for UIMA based tokenizers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1295330 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
651a236696
commit
0c5c13e157
|
@ -5,9 +5,12 @@ This file describes changes to the Solr UIMA (contrib/uima) module. See SOLR-212
|
|||
|
||||
Introduction
|
||||
------------
|
||||
This module is intended to be used while indexing documents.
|
||||
Its purpose is to provide additional on the fly automatically generated fields to the Solr index.
|
||||
This module is intended to be used both as an UpdateRequestProcessor while indexing documents and as a set of tokenizers/filters
|
||||
to be configured inside the schema.xml for use during analysis phase.
|
||||
The purpose of UIMAUpdateRequestProcessor is to provide additional, automatically generated fields to the Solr index on the fly.
|
||||
Such fields could be language, concepts, keywords, sentences, named entities, etc.
|
||||
UIMA based tokenizers/filters can be used either inside plain Lucene or as index/query analyzers to be defined
|
||||
inside the schema.xml of a Solr core to create/filter tokens using specific UIMA annotations.
|
||||
|
||||
UIMA Dependency
|
||||
---------------
|
||||
|
|
|
@ -25,4 +25,18 @@
|
|||
|
||||
<import file="../contrib-build.xml"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${analyzers-uima.jar}"/>
|
||||
<path refid="solr.base.classpath"/>
|
||||
</path>
|
||||
|
||||
<target name="module-jars-to-solr" depends="jar-analyzers-uima">
|
||||
<mkdir dir="${build.dir}/lucene-libs"/>
|
||||
<copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
|
||||
<fileset file="${analyzers-uima.jar}"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="compile-core" depends="jar-analyzers-uima, solr-contrib-build.compile-core"/>
|
||||
<target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
|
||||
</project>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
package org.apache.solr.uima.analysis;
|
||||
|
||||
/**
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
|
@ -17,16 +17,30 @@ package org.apache.solr.uima.processor.ae;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;
|
||||
import org.apache.solr.analysis.BaseTokenizerFactory;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* provide an Apache UIMA {@link AnalysisEngine}
|
||||
*
|
||||
*
|
||||
* Solr {@link org.apache.solr.analysis.TokenizerFactory} for {@link UIMAAnnotationsTokenizer}
|
||||
*/
|
||||
public interface AEProvider {
|
||||
public class UIMAAnnotationsTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
public AnalysisEngine getAE() throws ResourceInitializationException;
|
||||
private String descriptorPath;
|
||||
private String tokenType;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
tokenType = args.get("tokenType");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package org.apache.solr.uima.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
|
||||
import org.apache.solr.analysis.BaseTokenizerFactory;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Solr {@link org.apache.solr.analysis.TokenizerFactory} for {@link UIMATypeAwareAnnotationsTokenizer}
|
||||
*/
|
||||
public class UIMATypeAwareAnnotationsTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
private String descriptorPath;
|
||||
private String tokenType;
|
||||
private String featurePath;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
tokenType = args.get("tokenType");
|
||||
featurePath = args.get("featurePath");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input);
|
||||
}
|
||||
}
|
|
@ -23,8 +23,8 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
|
||||
import org.apache.solr.uima.processor.ae.AEProvider;
|
||||
import org.apache.solr.uima.processor.ae.AEProviderFactory;
|
||||
import org.apache.lucene.analysis.uima.ae.AEProvider;
|
||||
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
|
|
|
@ -1,53 +0,0 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Singleton factory class responsible of {@link AEProvider}s' creation
|
||||
*
|
||||
*
|
||||
*/
|
||||
public class AEProviderFactory {
|
||||
|
||||
private static AEProviderFactory instance;
|
||||
|
||||
private Map<String, AEProvider> providerCache = new HashMap<String, AEProvider>();
|
||||
|
||||
private AEProviderFactory() {
|
||||
// Singleton
|
||||
}
|
||||
|
||||
public static AEProviderFactory getInstance() {
|
||||
if (instance == null) {
|
||||
instance = new AEProviderFactory();
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
public synchronized AEProvider getAEProvider(String core, String aePath,
|
||||
Map<String, Object> runtimeParameters) {
|
||||
String key = new StringBuilder(core).append(aePath).toString();
|
||||
if (providerCache.get(key) == null) {
|
||||
providerCache.put(key, new OverridingParamsAEProvider(aePath, runtimeParameters));
|
||||
}
|
||||
return providerCache.get(key);
|
||||
}
|
||||
}
|
|
@ -1,117 +0,0 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.uima.UIMAFramework;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.util.XMLInputSource;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* {@link AEProvider} implementation that creates an Aggregate AE from the given path, also
|
||||
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
|
||||
* them as overriding parameters in the aggregate AE
|
||||
*
|
||||
*
|
||||
*/
|
||||
public class OverridingParamsAEProvider implements AEProvider {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(OverridingParamsAEProvider.class);
|
||||
|
||||
private String aeFilePath;
|
||||
|
||||
private AnalysisEngine cachedAE;
|
||||
|
||||
private Map<String, Object> runtimeParameters;
|
||||
|
||||
public OverridingParamsAEProvider(String aeFilePath, Map<String, Object> runtimeParameters) {
|
||||
this.aeFilePath = aeFilePath;
|
||||
this.runtimeParameters = runtimeParameters;
|
||||
}
|
||||
|
||||
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
|
||||
try {
|
||||
if (cachedAE == null) {
|
||||
// get Resource Specifier from XML file
|
||||
URL url = this.getClass().getResource(aeFilePath);
|
||||
XMLInputSource in = new XMLInputSource(url);
|
||||
|
||||
// get AE description
|
||||
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
|
||||
.parseAnalysisEngineDescription(in);
|
||||
|
||||
/* iterate over each AE (to set runtime parameters) */
|
||||
for (String attributeName : runtimeParameters.keySet()) {
|
||||
Object val = getRuntimeValue(desc, attributeName);
|
||||
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
|
||||
attributeName, val);
|
||||
if (log.isDebugEnabled())
|
||||
log.debug(new StringBuilder("setting ").append(attributeName).append(" : ").append(
|
||||
runtimeParameters.get(attributeName)).toString());
|
||||
}
|
||||
// create AE here
|
||||
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
|
||||
if (log.isDebugEnabled())
|
||||
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
|
||||
.append(" created from descriptor ").append(aeFilePath).toString());
|
||||
} else {
|
||||
cachedAE.reconfigure();
|
||||
if (log.isDebugEnabled())
|
||||
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
|
||||
.append(" at path ").append(aeFilePath).append(" reconfigured ").toString());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
cachedAE = null;
|
||||
throw new ResourceInitializationException(e);
|
||||
}
|
||||
return cachedAE;
|
||||
}
|
||||
|
||||
/* create the value to inject in the runtime parameter depending on its declared type */
|
||||
private Object getRuntimeValue(AnalysisEngineDescription desc, String attributeName)
|
||||
throws ClassNotFoundException {
|
||||
String type = desc.getAnalysisEngineMetaData().getConfigurationParameterDeclarations().
|
||||
getConfigurationParameter(null, attributeName).getType();
|
||||
// TODO : do it via reflection ? i.e. Class paramType = Class.forName(type)...
|
||||
Object val = null;
|
||||
Object runtimeValue = runtimeParameters.get(attributeName);
|
||||
if (runtimeValue!=null) {
|
||||
if ("String".equals(type)) {
|
||||
val = String.valueOf(runtimeValue);
|
||||
}
|
||||
else if ("Integer".equals(type)) {
|
||||
val = Integer.valueOf(runtimeValue.toString());
|
||||
}
|
||||
else if ("Boolean".equals(type)) {
|
||||
val = Boolean.valueOf(runtimeValue.toString());
|
||||
}
|
||||
else if ("Float".equals(type)) {
|
||||
val = Float.valueOf(runtimeValue.toString());
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
vbg
|
||||
vbz
|
||||
vbd
|
||||
vbn
|
||||
vb
|
||||
bez
|
||||
cc
|
||||
cd
|
||||
at
|
||||
.
|
||||
:
|
|
@ -0,0 +1,680 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version
|
||||
2.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
|
||||
applicable law or agreed to in writing, software distributed under
|
||||
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
This is the Solr schema file. This file should be named "schema.xml"
|
||||
and should be in the conf directory under the solr home (i.e.
|
||||
./solr/conf/schema.xml by default) or located where the classloader
|
||||
for the Solr webapp can find it. This example schema is the
|
||||
recommended starting point for users. It should be kept correct and
|
||||
concise, usable out-of-the-box. For more information, on how to
|
||||
customize this file, please see
|
||||
http://wiki.apache.org/solr/SchemaXml PERFORMANCE NOTE: this schema
|
||||
includes many optional features and should not be used for
|
||||
benchmarking. To improve performance one could - set stored="false"
|
||||
for all fields possible (esp large fields) when you only need to
|
||||
search on the field but don't need to return the original value. -
|
||||
set indexed="false" if you don't need to search on the field, but
|
||||
only return the field as a result of searching on other indexed
|
||||
fields. - remove all unneeded copyField statements - for best index
|
||||
size and searching performance, set "index" to false for all general
|
||||
text fields, use copyField to copy them to the catchall "text"
|
||||
field, and use that for searching. - For maximum indexing
|
||||
performance, use the StreamingUpdateSolrServer java client. -
|
||||
Remember to run the JVM in server mode, and use a higher logging
|
||||
level that avoids logging every request
|
||||
-->
|
||||
|
||||
<schema name="sample" version="1.2">
|
||||
<!--
|
||||
attribute "name" is the name of this schema and is only used for
|
||||
display purposes. Applications should change this to reflect the
|
||||
nature of the search collection. version="1.2" is Solr's version
|
||||
number for the schema syntax and semantics. It should not normally
|
||||
be changed by applications. 1.0: multiValued attribute did not
|
||||
exist, all fields are multiValued by nature 1.1: multiValued
|
||||
attribute introduced, false by default 1.2: omitTermFreqAndPositions
|
||||
attribute introduced, true by default except for text fields.
|
||||
-->
|
||||
|
||||
<types>
|
||||
<!--
|
||||
field type definitions. The "name" attribute is just a label to be
|
||||
used by field definitions. The "class" attribute and any other
|
||||
attributes determine the real behavior of the fieldType. Class
|
||||
names starting with "solr" refer to java classes in the
|
||||
org.apache.solr.analysis package.
|
||||
-->
|
||||
|
||||
<!--
|
||||
The StrField type is not analyzed, but indexed/stored verbatim. -
|
||||
StrField and TextField support an optional compressThreshold which
|
||||
limits compression (if enabled in the derived fields) to values
|
||||
which exceed a certain size (in characters).
|
||||
-->
|
||||
<fieldType name="string" class="solr.StrField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
<!-- boolean type: "true" or "false" -->
|
||||
<fieldType name="boolean" class="solr.BoolField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<!--
|
||||
Binary data type. The data should be sent/retrieved in as Base64
|
||||
encoded Strings
|
||||
-->
|
||||
<fieldtype name="binary" class="solr.BinaryField" />
|
||||
|
||||
<!--
|
||||
The optional sortMissingLast and sortMissingFirst attributes are
|
||||
currently supported on types that are sorted internally as
|
||||
strings. This includes
|
||||
"string","boolean","sint","slong","sfloat","sdouble","pdate" - If
|
||||
sortMissingLast="true", then a sort on this field will cause
|
||||
documents without the field to come after documents with the
|
||||
field, regardless of the requested sort order (asc or desc). - If
|
||||
sortMissingFirst="true", then a sort on this field will cause
|
||||
documents without the field to come before documents with the
|
||||
field, regardless of the requested sort order. - If
|
||||
sortMissingLast="false" and sortMissingFirst="false" (the
|
||||
default), then default lucene sorting will be used which places
|
||||
docs without the field first in an ascending sort and last in a
|
||||
descending sort.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Default numeric field types. For faster range queries, consider
|
||||
the tint/tfloat/tlong/tdouble types.
|
||||
-->
|
||||
<fieldType name="int" class="solr.TrieIntField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="float" class="solr.TrieFloatField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="long" class="solr.TrieLongField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="double" class="solr.TrieDoubleField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
Numeric field types that index each value at various levels of
|
||||
precision to accelerate range queries when the number of values
|
||||
between the range endpoints is large. See the javadoc for
|
||||
NumericRangeQuery for internal implementation details. Smaller
|
||||
precisionStep values (specified in bits) will lead to more tokens
|
||||
indexed per value, slightly larger index size, and faster range
|
||||
queries. A precisionStep of 0 disables indexing at different
|
||||
precision levels.
|
||||
-->
|
||||
<fieldType name="tint" class="solr.TrieIntField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tfloat" class="solr.TrieFloatField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tlong" class="solr.TrieLongField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tdouble" class="solr.TrieDoubleField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
The format for this date field is of the form
|
||||
1995-12-31T23:59:59Z, and is a more restricted form of the
|
||||
canonical representation of dateTime
|
||||
http://www.w3.org/TR/xmlschema-2/#dateTime The trailing "Z"
|
||||
designates UTC time and is mandatory. Optional fractional seconds
|
||||
are allowed: 1995-12-31T23:59:59.999Z All other components are
|
||||
mandatory. Expressions can also be used to denote calculations
|
||||
that should be performed relative to "NOW" to determine the value,
|
||||
ie... NOW/HOUR ... Round to the start of the current hour NOW-1DAY
|
||||
... Exactly 1 day prior to now NOW/DAY+6MONTHS+3DAYS ... 6 months
|
||||
and 3 days in the future from the start of the current day Consult
|
||||
the DateField javadocs for more information. Note: For faster
|
||||
range queries, consider the tdate type
|
||||
-->
|
||||
<fieldType name="date" class="solr.TrieDateField"
|
||||
omitNorms="true" precisionStep="0" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
A Trie based date field for faster date range queries and date
|
||||
faceting.
|
||||
-->
|
||||
<fieldType name="tdate" class="solr.TrieDateField"
|
||||
omitNorms="true" precisionStep="6" positionIncrementGap="0" />
|
||||
|
||||
|
||||
<!--
|
||||
Note: These should only be used for compatibility with existing
|
||||
indexes (created with older Solr versions) or if
|
||||
"sortMissingFirst" or "sortMissingLast" functionality is needed.
|
||||
Use Trie based fields instead. Plain numeric field types that
|
||||
store and index the text value verbatim (and hence don't support
|
||||
range queries, since the lexicographic ordering isn't equal to the
|
||||
numeric ordering)
|
||||
-->
|
||||
<fieldType name="pint" class="solr.IntField" omitNorms="true" />
|
||||
<fieldType name="plong" class="solr.LongField" omitNorms="true" />
|
||||
<fieldType name="pfloat" class="solr.FloatField"
|
||||
omitNorms="true" />
|
||||
<fieldType name="pdouble" class="solr.DoubleField"
|
||||
omitNorms="true" />
|
||||
<fieldType name="pdate" class="solr.DateField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
|
||||
<!--
|
||||
Note: These should only be used for compatibility with existing
|
||||
indexes (created with older Solr versions) or if
|
||||
"sortMissingFirst" or "sortMissingLast" functionality is needed.
|
||||
Use Trie based fields instead. Numeric field types that manipulate
|
||||
the value into a string value that isn't human-readable in its
|
||||
internal form, but with a lexicographic ordering the same as the
|
||||
numeric ordering, so that range queries work correctly.
|
||||
-->
|
||||
<fieldType name="sint" class="solr.SortableIntField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="slong" class="solr.SortableLongField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="sfloat" class="solr.SortableFloatField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="sdouble" class="solr.SortableDoubleField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
|
||||
<!--
|
||||
The "RandomSortField" is not used to store or search any data. You
|
||||
can declare fields of this type in your schema to generate
|
||||
pseudo-random orderings of your docs for sorting purposes. The
|
||||
ordering is generated based on the field name and the version of
|
||||
the index. As long as the index version remains unchanged, and the
|
||||
same field name is reused, the ordering of the docs will be
|
||||
consistent. If you want different pseudo-random orderings of
|
||||
documents, for the same version of the index, use a dynamicField
|
||||
and change the name
|
||||
-->
|
||||
<fieldType name="random" class="solr.RandomSortField"
|
||||
indexed="true" />
|
||||
|
||||
<!--
|
||||
solr.TextField allows the specification of custom text analyzers
|
||||
specified as a tokenizer and a list of token filters. Different
|
||||
analyzers may be specified for indexing and querying. The optional
|
||||
positionIncrementGap puts space between multiple fields of this
|
||||
type on the same document, with the purpose of preventing false
|
||||
phrase matching across fields. For more info on customizing your
|
||||
analyzer chain, please see
|
||||
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
|
||||
-->
|
||||
|
||||
<!--
|
||||
One can also specify an existing Analyzer class that has a default
|
||||
constructor via the class attribute on the analyzer element
|
||||
<fieldType name="text_greek" class="solr.TextField"> <analyzer
|
||||
class="org.apache.lucene.analysis.el.GreekAnalyzer"/> </fieldType>
|
||||
-->
|
||||
|
||||
<!--
|
||||
A text field that only splits on whitespace for exact matching of
|
||||
words
|
||||
-->
|
||||
<fieldType name="text_ws" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
A text field that uses WordDelimiterFilter to enable splitting and
|
||||
matching of words on case-change, alpha numeric boundaries, and
|
||||
non-alphanumeric chars, so that a query of "wifi" or "wi fi" could
|
||||
match a document containing "Wi-Fi". Synonyms and stopwords are
|
||||
customized by external files, and stemming is enabled.
|
||||
-->
|
||||
<fieldType name="text" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<!--
|
||||
in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymFilterFactory"
|
||||
synonyms="index_synonyms.txt" ignoreCase="true"
|
||||
expand="false"/>
|
||||
-->
|
||||
<!--
|
||||
Case insensitive stop word removal. add
|
||||
enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
Less flexible matching, but fewer false matches. Probably not ideal
|
||||
for product names, but may be good for SKUs. Can insert dashes in
|
||||
the wrong place and still match.
|
||||
-->
|
||||
<fieldType name="textTight" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="0" generateNumberParts="0" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
<!--
|
||||
this filter can remove any duplicate tokens that appear at the
|
||||
same position - sometimes possible with WordDelimiterFilter in
|
||||
conjunction with stemming.
|
||||
-->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="uima_sentences" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="org.apache.solr.uima.analysis.UIMAAnnotationsTokenizerFactory"
|
||||
descriptorPath="/uima/AggregateSentenceAE.xml" tokenType="org.apache.uima.SentenceAnnotation"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="uima_nouns" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="org.apache.solr.uima.analysis.UIMATypeAwareAnnotationsTokenizerFactory"
|
||||
descriptorPath="/uima/AggregateSentenceAE.xml" tokenType="org.apache.uima.TokenAnnotation"
|
||||
featurePath="posTag"/>
|
||||
<filter class="solr.TypeTokenFilterFactory" types="uima/stoptypes.txt" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
A general unstemmed text field - good if one does not know the
|
||||
language of the field
|
||||
-->
|
||||
<fieldType name="textgen" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
A general unstemmed text field that indexes tokens normally and
|
||||
also reversed (via ReversedWildcardFilterFactory), to enable more
|
||||
efficient leading wildcard queries.
|
||||
-->
|
||||
<fieldType name="text_rev" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<filter class="solr.ReversedWildcardFilterFactory"
|
||||
withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
|
||||
maxFractionAsterisk="0.33" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField"
|
||||
positionIncrementGap="100" > <analyzer> <charFilter
|
||||
class="solr.MappingCharFilterFactory"
|
||||
mapping="mapping-ISOLatin1Accent.txt"/> <tokenizer
|
||||
class="solr.MockTokenizerFactory"/> </analyzer> </fieldType>
|
||||
-->
|
||||
|
||||
<!--
|
||||
This is an example of using the KeywordTokenizer along with
|
||||
various TokenFilterFactories to produce a sortable field that does
|
||||
not include some properties of the source text
|
||||
-->
|
||||
<fieldType name="alphaOnlySort" class="solr.TextField"
|
||||
sortMissingLast="true" omitNorms="true">
|
||||
<analyzer>
|
||||
<!--
|
||||
KeywordTokenizer does no actual tokenizing, so the entire
|
||||
input string is preserved as a single token
|
||||
-->
|
||||
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
|
||||
<!--
|
||||
The LowerCase TokenFilter does what you expect, which can be
|
||||
useful when you want your sorting to be case insensitive
|
||||
-->
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<!-- The TrimFilter removes any leading or trailing whitespace -->
|
||||
<filter class="solr.TrimFilterFactory" />
|
||||
<!--
|
||||
The PatternReplaceFilter gives you the flexibility to use Java
|
||||
regular expressions to replace any sequence of characters
|
||||
matching a pattern with an arbitrary replacement string, which
|
||||
may include back references to portions of the original string
|
||||
matched by the pattern. See the Java Regular Expression
|
||||
documentation for more information on pattern and replacement
|
||||
string syntax.
|
||||
|
||||
http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html
|
||||
-->
|
||||
<filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])"
|
||||
replacement="" replace="all" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldtype name="phonetic" stored="false" indexed="true"
|
||||
class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory" />
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="payloads" stored="false" indexed="true"
|
||||
class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory" />
|
||||
<!--
|
||||
The DelimitedPayloadTokenFilter can put payloads on tokens...
|
||||
for example, a token of "foo|1.4" would be indexed as "foo"
|
||||
with a payload of 1.4f Attributes of the
|
||||
DelimitedPayloadTokenFilterFactory : "delimiter" - a one
|
||||
character delimiter. Default is | (pipe) "encoder" - how to
|
||||
encode the following value into a payload float ->
|
||||
org.apache.lucene.analysis.payloads.FloatEncoder, integer ->
|
||||
o.a.l.a.p.IntegerEncoder identity -> o.a.l.a.p.IdentityEncoder
|
||||
Fully Qualified class name implementing PayloadEncoder,
|
||||
Encoder must have a no arg constructor.
|
||||
-->
|
||||
<filter class="solr.DelimitedPayloadTokenFilterFactory"
|
||||
encoder="float" />
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!--
|
||||
lowercases the entire field value, keeping it as a single token.
|
||||
-->
|
||||
<fieldType name="lowercase" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright.
|
||||
-->
|
||||
<fieldtype name="ignored" stored="false" indexed="false"
|
||||
multiValued="true" class="solr.StrField" />
|
||||
|
||||
</types>
|
||||
|
||||
|
||||
<fields>
|
||||
<!--
|
||||
Valid attributes for fields: name: mandatory - the name for the
|
||||
field type: mandatory - the name of a previously defined type from
|
||||
the <types> section indexed: true if this field should be indexed
|
||||
(searchable or sortable) stored: true if this field should be
|
||||
retrievable compressed: [false] if this field should be stored
|
||||
using gzip compression (this will only apply if the field type is
|
||||
compressible; among the standard field types, only TextField and
|
||||
StrField are) multiValued: true if this field may contain multiple
|
||||
values per document omitNorms: (expert) set to true to omit the
|
||||
norms associated with this field (this disables length
|
||||
normalization and index-time boosting for the field, and saves
|
||||
some memory). Only full-text fields or fields that need an
|
||||
index-time boost need norms. termVectors: [false] set to true to
|
||||
store the term vector for a given field. When using MoreLikeThis,
|
||||
fields used for similarity should be stored for best performance.
|
||||
termPositions: Store position information with the term vector.
|
||||
This will increase storage costs. termOffsets: Store offset
|
||||
information with the term vector. This will increase storage
|
||||
costs. default: a value that should be used if no value is
|
||||
specified when adding a document.
|
||||
-->
|
||||
<field name="id" type="string" indexed="true" stored="true"
|
||||
required="true" />
|
||||
<field name="sku" type="textTight" indexed="true" stored="true"
|
||||
omitNorms="true" />
|
||||
<field name="name" type="textgen" indexed="true" stored="true" />
|
||||
<field name="alphaNameSort" type="alphaOnlySort" indexed="true"
|
||||
stored="false" />
|
||||
<field name="manu" type="textgen" indexed="true" stored="true"
|
||||
omitNorms="true" />
|
||||
<field name="cat" type="text_ws" indexed="true" stored="true"
|
||||
multiValued="true" omitNorms="true" />
|
||||
<field name="features" type="text" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
<field name="includes" type="text" indexed="true" stored="true"
|
||||
termVectors="true" termPositions="true" termOffsets="true" />
|
||||
|
||||
<field name="sentences" type="uima_sentences" indexed="true" stored="true" multiValued="true"
|
||||
termVectors="true" termPositions="true" termOffsets="true" />
|
||||
<field name="nouns" type="uima_nouns" indexed="true" stored="true" multiValued="true"
|
||||
termVectors="true" termPositions="true" termOffsets="true" />
|
||||
|
||||
<field name="weight" type="float" indexed="true" stored="true" />
|
||||
<field name="price" type="float" indexed="true" stored="true" />
|
||||
<field name="popularity" type="int" indexed="true" stored="true" />
|
||||
<field name="inStock" type="boolean" indexed="true" stored="true" />
|
||||
|
||||
|
||||
<!--
|
||||
Common metadata fields, named specifically to match up with
|
||||
SolrCell metadata when parsing rich documents such as Word, PDF.
|
||||
Some fields are multiValued only because Tika currently may return
|
||||
multiple values for them.
|
||||
-->
|
||||
<field name="title" type="text" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
<field name="subject" type="text" indexed="true" stored="true" />
|
||||
<field name="description" type="text" indexed="true" stored="true" />
|
||||
<field name="comments" type="text" indexed="true" stored="true" />
|
||||
<field name="author" type="textgen" indexed="true" stored="true" />
|
||||
<field name="keywords" type="textgen" indexed="true" stored="true" />
|
||||
<field name="category" type="textgen" indexed="true" stored="true" />
|
||||
<field name="content_type" type="string" indexed="true"
|
||||
stored="true" multiValued="true" />
|
||||
<field name="last_modified" type="date" indexed="true" stored="true" />
|
||||
<field name="links" type="string" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
|
||||
|
||||
<!--
|
||||
catchall field, containing all other searchable text fields
|
||||
(implemented via copyField further on in this schema)
|
||||
-->
|
||||
<field name="text" type="text" indexed="true" stored="false"
|
||||
multiValued="true" />
|
||||
|
||||
<!--
|
||||
catchall text field that indexes tokens both normally and in
|
||||
reverse for efficient leading wildcard queries.
|
||||
-->
|
||||
<field name="text_rev" type="text_rev" indexed="true" stored="false"
|
||||
multiValued="true" />
|
||||
|
||||
<!--
|
||||
non-tokenized version of manufacturer to make it easier to sort or
|
||||
group results by manufacturer. copied from "manu" via copyField
|
||||
-->
|
||||
<field name="manu_exact" type="string" indexed="true" stored="false" />
|
||||
|
||||
<field name="payloads" type="payloads" indexed="true" stored="true" />
|
||||
|
||||
<!--
|
||||
Uncommenting the following will create a "timestamp" field using a
|
||||
default value of "NOW" to indicate when each document was indexed.
|
||||
-->
|
||||
<!--
|
||||
<field name="timestamp" type="date" indexed="true" stored="true"
|
||||
default="NOW" multiValued="false"/>
|
||||
-->
|
||||
|
||||
<field name="language" type="string" indexed="true" stored="true" required="false"/>
|
||||
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
|
||||
<field name="sentiment" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||
<field name="entity" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Dynamic field definitions. If a field name is not found,
|
||||
dynamicFields will be used if the name matches any of the
|
||||
patterns. RESTRICTION: the glob-like pattern in the name attribute
|
||||
must have a "*" only at the start or the end. EXAMPLE: name="*_i"
|
||||
will match any field ending in _i (like myid_i, z_i) Longer
|
||||
patterns will be matched first. if equal size patterns both match,
|
||||
the first appearing in the schema will be used. <dynamicField
|
||||
name="*_i" type="int" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_s" type="string" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_l" type="long" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_t" type="text"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_b"
|
||||
type="boolean" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_f" type="float" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_d" type="double" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_dt" type="date"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_ti"
|
||||
type="tint" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_tl" type="tlong" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_tf" type="tfloat" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_td" type="tdouble"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_tdt"
|
||||
type="tdate" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_pi" type="pint" indexed="true" stored="true"/>
|
||||
|
||||
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
|
||||
<dynamicField name="attr_*" type="textgen" indexed="true"
|
||||
stored="true" multiValued="true"/> <dynamicField name="random_*"
|
||||
type="random" />
|
||||
-->
|
||||
<dynamicField name="*_sm" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||
<!--
|
||||
uncomment the following to ignore any fields that don't already
|
||||
match an existing field name or dynamic field, rather than
|
||||
reporting them as an error. alternately, change the type="ignored"
|
||||
to some other type e.g. "text" if you want unknown fields indexed
|
||||
and/or stored by default
|
||||
-->
|
||||
<!--dynamicField name="*" type="ignored" multiValued="true" /-->
|
||||
|
||||
</fields>
|
||||
|
||||
<!--
|
||||
Field to use to determine and enforce document uniqueness. Unless
|
||||
this field is marked with required="false", it will be a required
|
||||
field
|
||||
-->
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!--
|
||||
field for the QueryParser to use when an explicit fieldname is
|
||||
absent
|
||||
-->
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
|
||||
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
|
||||
<solrQueryParser defaultOperator="OR" />
|
||||
|
||||
<!--
|
||||
copyField commands copy one field to another at the time a document
|
||||
is added to the index. It's used either to index the same field
|
||||
differently, or to add multiple fields to the same field for
|
||||
easier/faster searching.
|
||||
-->
|
||||
|
||||
<copyField source="cat" dest="text" />
|
||||
<copyField source="name" dest="text" />
|
||||
<copyField source="manu" dest="text" />
|
||||
<copyField source="features" dest="text" />
|
||||
<copyField source="includes" dest="text" />
|
||||
<copyField source="text" dest="nouns" />
|
||||
<copyField source="text" dest="sentences" />
|
||||
<copyField source="manu" dest="manu_exact" />
|
||||
|
||||
|
||||
<!--copyField source="Titolo" dest="text"/-->
|
||||
|
||||
<!--
|
||||
Above, multiple source fields are copied to the [text] field.
|
||||
Another way to map multiple source fields to the same destination
|
||||
field is to use the dynamic field syntax. copyField also supports a
|
||||
maxChars to copy setting.
|
||||
-->
|
||||
|
||||
<!-- <copyField source="*_t" dest="text" maxChars="3000"/> -->
|
||||
|
||||
<!--
|
||||
copy name to alphaNameSort, a field designed for sorting by name
|
||||
-->
|
||||
<!-- <copyField source="name" dest="alphaNameSort"/> -->
|
||||
|
||||
|
||||
<!--
|
||||
Similarity is the scoring routine for each document vs. a query. A
|
||||
custom similarity may be specified here, but the default is fine for
|
||||
most applications.
|
||||
-->
|
||||
<!--
|
||||
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
|
||||
-->
|
||||
<!--
|
||||
... OR ... Specify a SimilarityFactory class name implementation
|
||||
allowing parameters to be used.
|
||||
-->
|
||||
<!--
|
||||
<similarity class="com.example.solr.CustomSimilarityFactory"> <str
|
||||
name="paramkey">param value</str> </similarity>
|
||||
-->
|
||||
|
||||
|
||||
</schema>
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,49 @@
|
|||
package org.apache.solr.uima.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
*/
|
||||
public class UIMAAnnotationsTokenizerFactoryTest extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("uima/uima-tokenizers-solrconfig.xml", "uima/uima-tokenizers-schema.xml");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInitialization() throws Exception {
|
||||
assertNotNull(h.getCore().getSchema().getField("sentences"));
|
||||
assertNotNull(h.getCore().getSchema().getFieldType("sentences"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndexAndQuery() throws Exception {
|
||||
assertU("<add><doc><field name=\"id\">123</field><field name=\"text\">One and 1 is two. Instead One or 1 is 0.</field></doc></add>");
|
||||
assertU(commit());
|
||||
SolrQueryRequest req = req("qt", "/terms", "terms.fl", "sentences");
|
||||
assertQ(req, "//lst[@name='sentences']/int[@name='One and 1 is two.']");
|
||||
assertQ(req, "//lst[@name='sentences']/int[@name=' Instead One or 1 is 0.']");
|
||||
req.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.solr.uima.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
*/
|
||||
public class UIMATypeAwareAnnotationsTokenizerFactoryTest extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("uima/uima-tokenizers-solrconfig.xml", "uima/uima-tokenizers-schema.xml");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInitialization() throws Exception {
|
||||
assertNotNull(h.getCore().getSchema().getField("nouns"));
|
||||
assertNotNull(h.getCore().getSchema().getFieldType("nouns"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndexAndQuery() throws Exception {
|
||||
assertU("<add><doc><field name=\"id\">123</field><field name=\"text\">The counter counts the beans: 1 and 2 and three.</field></doc></add>");
|
||||
assertU(commit());
|
||||
SolrQueryRequest req = req("qt", "/terms", "terms.fl", "nouns");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name='beans']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name='counter']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='The']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='counts']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='the']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!=':']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='1']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='and']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='2']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='three']");
|
||||
assertQ(req, "//lst[@name='nouns']/int[@name!='.']");
|
||||
req.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue