2011-10-05 16:21:59 -04:00
|
|
|
<?xml version="1.0"?>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<project name="solr-langid" default="default">
|
|
|
|
|
|
|
|
<description>
|
|
|
|
Language Identifier contrib for extracting language from a document being indexed
|
|
|
|
</description>
|
|
|
|
|
2015-08-11 19:00:39 -04:00
|
|
|
<import file="../contrib-build.xml"/>
|
2011-10-05 16:21:59 -04:00
|
|
|
|
2018-01-17 11:29:17 -05:00
|
|
|
<!-- Settings shared by the train-test-models target and the download-leipzig macro. -->

<!-- Directory that receives the trained OpenNLP test model. -->
<property name="test.model.dir"
          location="${tests.userdir}/langid/solr/collection1/conf"/>

<!-- Base URL the Leipzig corpus tarballs are fetched from. -->
<!-- NOTE(review): this URL was flagged as possibly broken; confirm it still resolves. -->
<property name="test.leipzig.folder.link"
          value="http://pcai056.informatik.uni-leipzig.de/downloads/corpora"/>

<!-- Scratch area under the build directory used while training test models. -->
<property name="test.build.models.dir"
          location="${build.dir}/build-test-models"/>

<!-- Downloaded corpus tarballs are stored here. -->
<property name="test.build.models.data.dir"
          location="${test.build.models.dir}/data"/>

<!-- Sentence files extracted from the tarballs are stored here. -->
<property name="test.build.models.sentences.dir"
          location="${test.build.models.dir}/train"/>

<!-- File name of the trained language-detection model. -->
<property name="test.opennlp.model"
          value="opennlp-langdetect.eng-swe-spa-rus-deu.bin"/>
|
|
|
|
|
|
|
|
<!-- OpenNLP jars from this contrib's lib directory; used as the classpath of the model-training java task. -->
<path id="opennlp.jars">
  <fileset dir="lib">
    <include name="opennlp*.jar"/>
  </fileset>
</path>
|
|
|
|
|
2015-08-11 19:00:39 -04:00
|
|
|
<!--
  Classpath for this contrib, in order: the extraction contrib's libs,
  this contrib's own libs, then the shared Solr base classpath.
  Both lib directories honour the common classpath exclusion patterns.
-->
<path id="classpath">
  <fileset excludes="${common.classpath.excludes}"
           dir="../extraction/lib"/>
  <fileset excludes="${common.classpath.excludes}"
           dir="lib"/>
  <path refid="solr.base.classpath"/>
</path>
|
|
|
|
|
2014-04-25 04:55:05 -04:00
|
|
|
<!-- The extraction contrib itself does not need to be compiled here; only its resolved libs are needed. -->
|
2012-03-30 14:04:43 -04:00
|
|
|
<target name="resolve-extraction-libs">
  <!--
    Invoke the "resolve" target of the extraction contrib's build in its own
    directory. Properties of this project are not inherited; only the
    referenced property set is passed along.
  -->
  <ant target="resolve"
       dir="${common-solr.dir}/contrib/extraction"
       inheritAll="false">
    <propertyset refid="uptodate.and.compiled.properties"/>
  </ant>
</target>
|
|
|
|
|
|
|
|
<!-- Resolve the extraction contrib's libs first, then run the shared contrib compile-core target. -->
<target name="compile-core"
        depends="resolve-extraction-libs,solr-contrib-build.compile-core"/>
|
2018-01-17 11:29:17 -05:00
|
|
|
|
|
|
|
<!--
|
|
|
|
Create test models using data for five languages from the Leipzig corpora.
|
2019-05-21 11:06:00 -04:00
|
|
|
See https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.langdetect.training.leipzig
|
2018-01-17 11:29:17 -05:00
|
|
|
-->
|
|
|
|
<target name="train-test-models"
        depends="resolve"
        description="Train small test models for unit tests">
  <!-- Download and unpack the sentence files for each of the five training languages. -->
  <download-leipzig language.code="eng"/>
  <download-leipzig language.code="swe"/>
  <download-leipzig language.code="spa"/>
  <download-leipzig language.code="rus"/>
  <download-leipzig language.code="deu"/>

  <echo message="Train OpenNLP test model over data from the Leipzig corpora"/>

  <!-- Run the OpenNLP command-line tool in a forked JVM; a failing exit status fails the build. -->
  <java classname="opennlp.tools.cmdline.CLI"
        classpathref="opennlp.jars"
        failonerror="true"
        fork="true">
    <!-- Tool to run. -->
    <arg value="LanguageDetectorTrainer.leipzig"/>

    <!-- Model file path. -->
    <arg value="-model"/>
    <arg value="${test.model.dir}/${test.opennlp.model}"/>

    <!-- Trainer parameters file. -->
    <arg value="-params"/>
    <arg value="${tests.userdir}/opennlp.langdetect.trainer.params.txt"/>

    <!-- Directory holding the extracted sentence files. -->
    <arg value="-sentencesDir"/>
    <arg value="${test.build.models.sentences.dir}"/>

    <arg value="-sentencesPerSample"/>
    <arg value="3"/>

    <arg value="-samplesPerLanguage"/>
    <arg value="10000"/>
  </java>
</target>
|
|
|
|
|
|
|
|
<!--
  download-leipzig: fetch one Leipzig corpus tarball into the data directory
  and unpack its sentence file(s) into the sentences directory.

  Attributes:
    language.code    language code used to build the default tarball name (required)
    leipzig.tarball  tarball file name; defaults to @{language.code}_news_2007_30K.tar.gz
-->
<macrodef name="download-leipzig">
  <attribute name="language.code"/>
  <attribute name="leipzig.tarball" default="@{language.code}_news_2007_30K.tar.gz"/>
  <sequential>
    <mkdir dir="${test.build.models.data.dir}"/>
    <!--
      skipexisting: a tarball already present in the data directory is reused
      instead of being downloaded again on every run. The data directory lives
      under the build directory, so removing it forces a fresh download.
    -->
    <get src="${test.leipzig.folder.link}/@{leipzig.tarball}"
         dest="${test.build.models.data.dir}"
         skipexisting="true"/>
    <!-- Only archive entries matching *-sentences.txt are extracted. -->
    <untar compression="gzip"
           src="${test.build.models.data.dir}/@{leipzig.tarball}"
           dest="${test.build.models.sentences.dir}">
      <patternset>
        <include name="*-sentences.txt"/>
      </patternset>
    </untar>
  </sequential>
</macrodef>
|
|
|
|
|
|
|
|
<!-- For this contrib, regenerating means retraining the test models. -->
<target name="regenerate"
        depends="train-test-models"/>
|
2011-10-05 16:21:59 -04:00
|
|
|
</project>
|