mirror of https://github.com/apache/lucene.git

SOLR-9221: Remove Solr contribs: map-reduce, morphlines-core and morphlines-cell

parent 843fabb6e1
commit b1a574df4e
@@ -45,7 +45,6 @@ parent.iml
/solr/example/example-DIH/solr/mail/lib/*.jar

solr/contrib/dataimporthandler/test-lib/
solr/contrib/morphlines-core/test-lib/

solr/core/test-lib/

@@ -46,9 +46,6 @@
<buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/extraction/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/langid/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/map-reduce/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/uima/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/velocity/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/solrj/build.xml" />

@@ -1,10 +0,0 @@
<component name="libraryTable">
  <library name="Solr morphlines cell library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" recursive="false" />
  </library>
</component>

@@ -1,10 +0,0 @@
<component name="libraryTable">
  <library name="Solr morphlines core library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" recursive="false" />
  </library>
</component>

@@ -1,10 +0,0 @@
<component name="libraryTable">
  <library name="Solr morphlines core test library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" recursive="false" />
  </library>
</component>

@@ -56,9 +56,6 @@
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/extraction/extraction.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/langid/langid.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/ltr/ltr.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/map-reduce/map-reduce.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/morphlines-cell/morphlines-cell.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/morphlines-core/morphlines-core.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/uima/uima.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/velocity/velocity.iml" />
</modules>

@@ -316,30 +316,6 @@
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr map-reduce contrib" type="JUnit" factoryName="JUnit">
  <module name="map-reduce" />
  <option name="TEST_OBJECT" value="pattern" />
  <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/map-reduce" />
  <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
  <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
  <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr morphlines-cell contrib" type="JUnit" factoryName="JUnit">
  <module name="morphlines-cell" />
  <option name="TEST_OBJECT" value="pattern" />
  <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-cell" />
  <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
  <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
  <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr morphlines-core contrib" type="JUnit" factoryName="JUnit">
  <module name="morphlines-core" />
  <option name="TEST_OBJECT" value="pattern" />
  <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-core" />
  <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
  <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
  <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr uima contrib" type="JUnit" factoryName="JUnit">
  <module name="uima" />
  <option name="TEST_OBJECT" value="pattern" />

@@ -357,7 +333,7 @@
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>

<list size="44">
<list size="41">
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />

@@ -395,13 +371,10 @@
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr map-reduce contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr morphlines-cell contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr morphlines-core contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="41" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="42" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="43" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
</list>
</component>
</project>

@@ -1,43 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" name="Solr morphlines cell library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="misc" />
    <orderEntry type="module" module-name="extraction" />
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="morphlines-core" />
    <orderEntry type="module" module-name="analysis-common" />
    <orderEntry type="module-library">
      <library>
        <CLASSES>
          <root url="file://$MODULE_DIR$/lib" />
        </CLASSES>
        <JAVADOC />
        <SOURCES />
        <jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
      </library>
    </orderEntry>
  </component>
</module>

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" name="Solr morphlines cell library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="lucene-core" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="extraction" />
    <orderEntry type="module" module-name="morphlines-core" />
  </component>
</module>

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr example library" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="analysis-common" />
  </component>
</module>

@@ -1,90 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements. See the NOTICE file
 distributed with this work for additional information
 regarding copyright ownership. The ASF licenses this file
 to you under the Apache License, Version 2.0 (the
 "License"); you may not use this file except in compliance
 with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing,
 software distributed under the License is distributed on an
 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 KIND, either express or implied. See the License for the
 specific language governing permissions and limitations
 under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-map-reduce</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr map-reduce index construction</name>
  <description>Apache Solr - map-reduce index construction</description>
  <properties>
    <module-directory>solr/contrib/map-reduce</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:git:${vc-anonymous-base-url}</connection>
    <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
    <url>${vc-browse-base-url};f=${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-morphlines-core</artifactId>
      <version>${project.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    @solr-map-reduce.internal.dependencies@
    @solr-map-reduce.external.dependencies@
    @solr-map-reduce.internal.test.dependencies@
    @solr-map-reduce.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <!-- TODO: This is a hack, because the shared test-files folder seems not to be
             included by the dependency, maybe because the dependency test-jar is not unpacked? -->
        <directory>${module-path}/../morphlines-core/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
  </build>
</project>

@@ -1,90 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements. See the NOTICE file
 distributed with this work for additional information
 regarding copyright ownership. The ASF licenses this file
 to you under the Apache License, Version 2.0 (the
 "License"); you may not use this file except in compliance
 with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing,
 software distributed under the License is distributed on an
 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 KIND, either express or implied. See the License for the
 specific language governing permissions and limitations
 under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-morphlines-cell</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr Cell Morphlines</name>
  <description>Apache Solr - Cell Morphlines</description>
  <properties>
    <module-directory>solr/contrib/morphlines-cell</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:git:${vc-anonymous-base-url}</connection>
    <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
    <url>${vc-browse-base-url};f=${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-morphlines-core</artifactId>
      <version>${project.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    @solr-morphlines-cell.internal.dependencies@
    @solr-morphlines-cell.external.dependencies@
    @solr-morphlines-cell.internal.test.dependencies@
    @solr-morphlines-cell.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <!-- TODO: This is a hack, because the shared test-files folder seems not to be
             included by the dependency, maybe because the dependency test-jar is not unpacked? -->
        <directory>${module-path}/../morphlines-core/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
  </build>
</project>

@@ -1,91 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements. See the NOTICE file
 distributed with this work for additional information
 regarding copyright ownership. The ASF licenses this file
 to you under the Apache License, Version 2.0 (the
 "License"); you may not use this file except in compliance
 with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing,
 software distributed under the License is distributed on an
 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 KIND, either express or implied. See the License for the
 specific language governing permissions and limitations
 under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-morphlines-core</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr Morphlines Core</name>
  <description>Apache Solr - Morphlines Core</description>
  <properties>
    <module-directory>solr/contrib/morphlines-core</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:git:${vc-anonymous-base-url}</connection>
    <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
    <url>${vc-browse-base-url};f=${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    @solr-morphlines-core.internal.dependencies@
    @solr-morphlines-core.external.dependencies@
    @solr-morphlines-core.internal.test.dependencies@
    @solr-morphlines-core.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

@@ -39,9 +39,6 @@
<module>extraction</module>
<module>langid</module>
<module>ltr</module>
<module>morphlines-cell</module>
<module>morphlines-core</module>
<module>map-reduce</module>
<module>uima</module>
<module>velocity</module>
</modules>

@@ -10,6 +10,5 @@
# trigger a conflict) when the ant check-lib-versions target is run.

/com.google.guava/guava = 16.0.1
/com.google.inject/guice=4.0-beta5
/javax.servlet/servlet-api = 2.5, 3.0-alpha-1
/org.ow2.asm/asm = 5.0_BETA

@@ -3,7 +3,6 @@
# when the lexical sort check is performed by the ant check-lib-versions target.

/antlr/antlr = 2.7.7
/aopalliance/aopalliance = 1.0
/com.adobe.xmp/xmpcore = 5.1.2

com.carrotsearch.randomizedtesting.version = 2.5.0

@@ -26,10 +25,6 @@ com.fasterxml.jackson.core.version = 2.5.4

/com.google.guava/guava = 14.0.1

com.google.inject.guice.version = 3.0
/com.google.inject.extensions/guice-servlet = ${com.google.inject.guice.version}
/com.google.inject/guice = ${com.google.inject.guice.version}

/com.google.protobuf/protobuf-java = 3.1.0
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.1.18

@@ -37,24 +32,14 @@ com.google.inject.guice.version = 3.0
/com.ibm.icu/icu4j = 56.1
/com.pff/java-libpst = 0.8.1

com.rometools.version = 1.6.1
/com.rometools/rome = ${com.rometools.version}

com.sun.jersey.version = 1.9
/com.sun.jersey.contribs/jersey-guice = ${com.sun.jersey.version}
/com.sun.jersey/jersey-bundle = ${com.sun.jersey.version}
/com.sun.jersey/jersey-core = ${com.sun.jersey.version}
/com.sun.jersey/jersey-json = ${com.sun.jersey.version}
/com.sun.jersey/jersey-server = ${com.sun.jersey.version}

/com.sun.mail/gimap = 1.5.1
/com.sun.mail/javax.mail = 1.5.1

/com.sun.xml.bind/jaxb-impl = 2.2.3-1

/com.tdunning/t-digest = 3.1
/com.thoughtworks.paranamer/paranamer = 2.3
/com.typesafe/config = 1.0.2
/commons-beanutils/commons-beanutils = 1.8.3
/commons-cli/commons-cli = 1.2
/commons-codec/commons-codec = 1.10

@@ -74,7 +59,6 @@ io.dropwizard.metrics.version = 3.1.2
/io.dropwizard.metrics/metrics-core = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-ganglia = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-graphite = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-healthchecks = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-jetty9 = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-jvm = ${io.dropwizard.metrics.version}

@@ -82,7 +66,6 @@ io.netty.netty-all.version = 4.0.36.Final
/io.netty/netty-all = ${io.netty.netty-all.version}

/javax.activation/activation = 1.1.1
/javax.inject/javax.inject= 1
/javax.servlet/javax.servlet-api = 3.1.0
/javax.servlet/servlet-api = 2.4
/jdom/jdom = 1.0

@@ -95,14 +78,11 @@ io.netty.netty-all.version = 4.0.36.Final
/net.bytebuddy/byte-buddy = 1.6.2
/net.hydromatic/eigenbase-properties = 1.1.5
/net.sf.ehcache/ehcache-core = 2.4.4
/net.sf.saxon/Saxon-HE = 9.6.0-2
/net.sourceforge.argparse4j/argparse4j = 0.4.3
/net.sourceforge.jmatio/jmatio = 1.0
/net.sourceforge.nekohtml/nekohtml = 1.9.17
/org.antlr/antlr4-runtime = 4.5.1-1

/org.apache.ant/ant = 1.8.2
/org.apache.avro/avro = 1.7.5

org.apache.calcite.avatica.version = 1.9.0
/org.apache.calcite.avatica/avatica-core = ${org.apache.calcite.avatica.version}

@@ -160,23 +140,7 @@ org.apache.hadoop.version = 2.7.2
/org.apache.hadoop/hadoop-auth = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-hdfs = ${org.apache.hadoop.version}

/org.apache.hadoop/hadoop-mapreduce-client-app = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-core = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-hs = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-jobclient = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-shuffle = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-minikdc = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-api = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-client = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-applicationhistoryservice = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-nodemanager = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-resourcemanager = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-tests = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-web-proxy = ${org.apache.hadoop.version}

/org.apache.htrace/htrace-core = 3.2.0-incubating

@@ -193,7 +157,6 @@ org.apache.james.apache.mime4j.version = 0.7.2
/org.apache.james/apache-mime4j-dom = ${org.apache.james.apache.mime4j.version}

/org.apache.mina/mina-core = 2.0.0-M5
/org.apache.mrunit/mrunit = 1.0.0

org.apache.pdfbox.version = 2.0.1
/org.apache.pdfbox/fontbox = ${org.apache.pdfbox.version}

@@ -228,7 +191,6 @@ org.apache.uima.version = 2.3.1

org.bouncycastle.version = 1.45
/org.bouncycastle/bcmail-jdk15 = ${org.bouncycastle.version}
/org.bouncycastle/bcpkix-jdk15on = 1.47
/org.bouncycastle/bcprov-jdk15 = ${org.bouncycastle.version}

/org.carrot2.attributes/attributes-binder = 1.3.1

@@ -245,7 +207,6 @@ org.carrot2.morfologik.version = 2.1.1

org.codehaus.jackson.version = 1.9.13
/org.codehaus.jackson/jackson-core-asl = ${org.codehaus.jackson.version}
/org.codehaus.jackson/jackson-jaxrs = ${org.codehaus.jackson.version}
/org.codehaus.jackson/jackson-mapper-asl = ${org.codehaus.jackson.version}

org.codehaus.janino.version = 2.7.6

@@ -271,29 +232,10 @@ org.eclipse.jetty.version = 9.3.14.v20161028
/org.eclipse.jetty/jetty-webapp = ${org.eclipse.jetty.version}
/org.eclipse.jetty/jetty-xml = ${org.eclipse.jetty.version}

/org.fusesource.leveldbjni/leveldbjni = 1.8

org.gagravarr.vorbis.java.version = 0.8
/org.gagravarr/vorbis-java-core = ${org.gagravarr.vorbis.java.version}
/org.gagravarr/vorbis-java-tika = ${org.gagravarr.vorbis.java.version}

org.iq80.leveldb.version = 0.7
/org.iq80.leveldb/leveldb = ${org.iq80.leveldb.version}
/org.iq80.leveldb/leveldb-api = ${org.iq80.leveldb.version}

org.jboss.netty.netty.version = 3.2.4.Final
/org.jboss.netty/netty = ${org.jboss.netty.netty.version}

org.kitesdk.kite-morphlines.version = 1.1.0
/org.kitesdk/kite-morphlines-avro = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-core = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-hadoop-sequencefile = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-json = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-saxon = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-tika-core = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-tika-decompress = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-twitter = ${org.kitesdk.kite-morphlines.version}

/org.locationtech.spatial4j/spatial4j = 0.6

/org.mockito/mockito-core = 2.6.2

@@ -322,7 +264,6 @@ org.slf4j.version = 1.7.7
/org.slf4j/slf4j-log4j12 = ${org.slf4j.version}

/org.tukaani/xz = 1.5
/org.xerial.snappy/snappy-java = 1.0.5
/rome/rome = 1.0
/xerces/xercesImpl = 2.9.1

@@ -110,6 +110,11 @@ Apache UIMA 2.3.1
Apache ZooKeeper 3.4.6
Jetty 9.3.14.v20161028

Upgrade Notes
----------------------

* Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed.

Detailed Change List
----------------------

@@ -134,6 +139,11 @@ Bug Fixes
* SOLR-10281: ADMIN_PATHS is duplicated in two places and inconsistent. This can cause automatic
  retries to /admin/metrics handler by the CloudSolrClient. (shalin)

Other Changes
----------------------

* SOLR-9221: Remove Solr contribs: map-reduce, morphlines-core and morphlines-cell. (Steve Rowe)

==================  6.5.0 ==================

Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

@@ -1,20 +0,0 @@
Apache Solr MapReduce

*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.

The Solr MapReduce contrib provides a MapReduce job that allows you to build
Solr indexes and optionally merge them into a live Solr cluster.

Example:

# Build an index with map-reduce and deploy it to SolrCloud

source $solr_distrib/example/scripts/map-reduce/set-map-reduce-classpath.sh

$hadoop_distrib/bin/hadoop --config $hadoop_conf_dir jar \
$solr_distrib/dist/solr-map-reduce-*.jar -D 'mapred.child.java.opts=-Xmx500m' \
-libjars "$HADOOP_LIBJAR" --morphline-file readAvroContainer.conf \
--zk-host 127.0.0.1:9983 --output-dir hdfs://127.0.0.1:8020/outdir \
--collection $collection --log4j log4j.properties --go-live \
--verbose "hdfs://127.0.0.1:8020/indir"

@@ -1,157 +0,0 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project name="solr-map-reduce" default="default">

  <description>
    Solr map-reduce index construction.
  </description>

  <!-- <property name="name" value="MapReduceIndexerTool" /> -->

  <import file="../contrib-build.xml"/>

  <solr-contrib-uptodate name="extraction"
                         property="solr-extraction.uptodate"
                         classpath.property="solr-cell.jar"/>

  <target name="compile-solr-extraction" unless="solr-extraction.uptodate">
    <ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <solr-contrib-uptodate name="morphlines-core"
                         property="solr-morphlines-core.uptodate"/>

  <target name="compile-morphlines-core" unless="solr-morphlines-core.uptodate">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <solr-contrib-uptodate name="morphlines-cell"
                         property="solr-morphlines-cell.uptodate"/>

  <target name="compile-morphlines-cell" unless="solr-morphlines-cell.uptodate">
    <ant dir="${common-solr.dir}/contrib/morphlines-cell" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-extraction-libs">
    <ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-morphlines-core-libs">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-morphlines-cell-libs">
    <ant dir="${common-solr.dir}/contrib/morphlines-cell" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <path id="classpath.additions">
    <pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/morphlines-core/lib" excludes="${common.classpath.excludes}"/>
    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-cell/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/morphlines-cell/lib" excludes="${common.classpath.excludes}"/>
  </path>

  <path id="classpath">
    <path refid="solr.base.classpath"/>
    <path refid="classpath.additions"/>
  </path>

  <path id="test.classpath">
    <path refid="solr.test.base.classpath"/>
    <path refid="classpath.additions"/>
    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
    <pathelement location="${common-solr.dir}/contrib/morphlines-core/src/test-files"/>
    <fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
  </path>

  <path id="javadoc.classpath">
    <path refid="junit-path"/>
    <path refid="classpath"/>
    <pathelement location="${ant.home}/lib/ant.jar"/>
    <fileset dir=".">
      <exclude name="build/**/*.jar"/>
      <include name="**/lib/*.jar"/>
    </fileset>
  </path>

  <!-- TODO: make this nicer like lucene? -->
  <target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,javadocs-morphlines-core,javadocs-morphlines-cell,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
    <sequential>
      <mkdir dir="${javadoc.dir}/${name}"/>
      <solr-invoke-javadoc>
        <solrsources>
          <packageset dir="${src.dir}"/>
        </solrsources>
        <links>
          <link href="../solr-solrj"/>
          <link href="../solr-morphlines-core"/>
          <link href="../solr-cell"/>
        </links>
      </solr-invoke-javadoc>
      <solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
    </sequential>
  </target>

  <target name="javadocs-extraction">
    <ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="javadocs-morphlines-core">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="javadocs-morphlines-cell">
    <ant dir="${common-solr.dir}/contrib/morphlines-cell" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="compile-core" depends="resolve-extraction-libs, resolve-morphlines-core-libs, resolve-morphlines-cell-libs, compile-solr-extraction, compile-morphlines-core, compile-morphlines-cell, solr-contrib-build.compile-core"/>

  <property name="main.class" value="org.apache.solr.hadoop.MapReduceIndexerTool" />
  <target name="jar-core" depends="compile-core">
    <solr-jarify>
      <solr-jarify-additional-manifest-attributes>
        <attribute name="Main-Class" value="${main.class}"/>
      </solr-jarify-additional-manifest-attributes>
    </solr-jarify>
  </target>

  <target name="dist" depends="common-solr.dist"/>

</project>

@@ -1,37 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements. See the NOTICE file
 distributed with this work for additional information
 regarding copyright ownership. The ASF licenses this file
 to you under the Apache License, Version 2.0 (the
 "License"); you may not use this file except in compliance
 with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing,
 software distributed under the License is distributed on an
 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 KIND, either express or implied. See the License for the
 specific language governing permissions and limitations
 under the License.
-->
<ivy-module version="2.0">
  <info organisation="org.apache.solr" module="map-reduce" />
  <configurations defaultconfmapping="compile->master;test->master">
    <conf name="compile" transitive="false" />
    <conf name="test" transitive="false" />
  </configurations>

  <dependencies>
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="${/org.apache.hadoop/hadoop-mapreduce-client-core}" conf="compile" />
    <dependency org="net.sourceforge.argparse4j" name="argparse4j" rev="${/net.sourceforge.argparse4j/argparse4j}" conf="compile" />
    <dependency org="org.kitesdk" name="kite-morphlines-saxon" rev="${/org.kitesdk/kite-morphlines-saxon}" conf="compile" />
    <dependency org="net.sf.saxon" name="Saxon-HE" rev="${/net.sf.saxon/Saxon-HE}" conf="compile" />
    <dependency org="org.kitesdk" name="kite-morphlines-hadoop-sequencefile" rev="${/org.kitesdk/kite-morphlines-hadoop-sequencefile}" conf="compile" />
    <dependency org="org.jboss.netty" name="netty" rev="${/org.jboss.netty/netty}" conf="test" />
    <dependency org="org.bouncycastle" name="bcpkix-jdk15on" rev="${/org.bouncycastle/bcpkix-jdk15on}" conf="test"/>
    <dependency org="com.rometools" name="rome" rev="${/com.rometools/rome}" conf="test"/>
    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
  </dependencies>
</ivy-module>

@@ -1,39 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
  license agreements. See the NOTICE file distributed with this work for additional
  information regarding copyright ownership. The ASF licenses this file to
  You under the Apache License, Version 2.0 (the "License"); you may not use
  this file except in compliance with the License. You may obtain a copy of
  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
  by applicable law or agreed to in writing, software distributed under the
  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
  OF ANY KIND, either express or implied. See the License for the specific
  language governing permissions and limitations under the License. -->

<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
                              http://maven.apache.org/xsd/assembly-1.1.0.xsd">

  <id>job</id>
  <formats>
    <format>jar</format>
  </formats>
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>
    <dependencySet>
      <unpack>false</unpack>
      <scope>runtime</scope>
      <outputDirectory>lib</outputDirectory>
      <excludes>
        <exclude>${groupId}:${artifactId}</exclude>
      </excludes>
    </dependencySet>
    <dependencySet>
      <unpack>true</unpack>
      <includes>
        <include>${groupId}:${artifactId}</include>
      </includes>
    </dependencySet>
  </dependencySets>
</assembly>

@@ -1,75 +0,0 @@
//The MIT License
//
// Copyright (c) 2003 Ron Alford, Mike Grove, Bijan Parsia, Evren Sirin
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package org.apache.solr.hadoop;

import java.util.Comparator;

/**
 * This is a comparator to perform a mix of alphabetical+numeric comparison. For
 * example, if there is a list {"test10", "test2", "test150", "test25", "test1"}
 * then what we generally expect from the ordering is the result {"test1",
 * "test2", "test10", "test25", "test150"}. However, standard lexicographic
 * ordering does not do that and "test10" comes before "test2". This class is
 * provided to overcome that problem. This functionality is useful to sort the
 * benchmark files (like the ones in DL-benchmark-suite) from smallest to the
 * largest. Comparisons are done on the String values returned by toString() so
 * care should be taken when this comparator is used to sort arbitrary Java
 * objects.
 */
final class AlphaNumericComparator implements Comparator {

  public AlphaNumericComparator() {
  }

  public int compare(Object o1, Object o2) {
    String s1 = o1.toString();
    String s2 = o2.toString();
    int n1 = s1.length(), n2 = s2.length();
    int i1 = 0, i2 = 0;
    while (i1 < n1 && i2 < n2) {
      int p1 = i1;
      int p2 = i2;
      char c1 = s1.charAt(i1++);
      char c2 = s2.charAt(i2++);
      if (c1 != c2) {
        if (Character.isDigit(c1) && Character.isDigit(c2)) {
          int value1 = 0, value2 = 0;
          // Consume the full digit run on each side and compare numerically.
          while (i1 < n1 && Character.isDigit(c1 = s1.charAt(i1))) {
            i1++;
          }
          value1 = Integer.parseInt(s1.substring(p1, i1));
          while (i2 < n2 && Character.isDigit(c2 = s2.charAt(i2))) {
            i2++;
          }
          value2 = Integer.parseInt(s2.substring(p2, i2));
          if (value1 != value2) {
            return value1 - value2;
          }
        }
        return c1 - c2;
      }
    }

    return n1 - n2;
  }
}

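A minimal usage sketch for the comparator above (not part of the original patch; the demo class name is hypothetical, and it sits in org.apache.solr.hadoop because AlphaNumericComparator is package-private):

package org.apache.solr.hadoop;

import java.util.Arrays;
import java.util.List;

public class AlphaNumericComparatorDemo {
  public static void main(String[] args) {
    // The list from the javadoc above; the comparator compares toString() values.
    List<String> names = Arrays.asList("test10", "test2", "test150", "test25", "test1");
    names.sort(new AlphaNumericComparator()); // raw Comparator, so this is an unchecked call
    System.out.println(names); // prints [test1, test2, test10, test25, test150]
  }
}
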
@ -1,243 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.hadoop;
|
||||
|
||||
import org.apache.hadoop.mapreduce.TaskAttemptContext;
|
||||
import org.apache.hadoop.mapreduce.TaskID;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.util.ExecutorUtil;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* Enables adding batches of documents to an EmbeddedSolrServer.
|
||||
*/
|
||||
class BatchWriter {
|
||||
|
||||
private final EmbeddedSolrServer solr;
|
||||
private volatile Exception batchWriteException = null;
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
public Exception getBatchWriteException() {
|
||||
return batchWriteException;
|
||||
}
|
||||
|
||||
public void setBatchWriteException(Exception batchWriteException) {
|
||||
this.batchWriteException = batchWriteException;
|
||||
}
|
||||
|
||||
/** The number of writing threads. */
|
||||
final int writerThreads;
|
||||
|
||||
/** Queue Size */
|
||||
final int queueSize;
|
||||
|
||||
private final ThreadPoolExecutor batchPool;
|
||||
|
||||
  private TaskID taskId = null;

  /**
   * The number of in-progress batches; must be zero before the close can
   * actually start closing.
   */
  AtomicInteger executingBatches = new AtomicInteger(0);

  /**
   * A single batch of documents, run either on the writer thread pool or
   * inline on the caller thread in the single-threaded case.
   */
  final class Batch implements Runnable {

    private List<SolrInputDocument> documents;
    private UpdateResponse result;

    public Batch(Collection<SolrInputDocument> batch) {
      documents = new ArrayList<>(batch);
    }

    public void run() {
      try {
        executingBatches.getAndIncrement();
        result = runUpdate(documents);
      } finally {
        executingBatches.getAndDecrement();
      }
    }

    protected List<SolrInputDocument> getDocuments() {
      return documents;
    }

    protected void setDocuments(List<SolrInputDocument> documents) {
      this.documents = documents;
    }

    protected UpdateResponse getResult() {
      return result;
    }

    protected void setResult(UpdateResponse result) {
      this.result = result;
    }

    protected void reset(List<SolrInputDocument> documents) {
      if (this.documents == null) {
        this.documents = new ArrayList<>(documents);
      } else {
        this.documents.clear();
        this.documents.addAll(documents);
      }
      result = null;
    }

    protected void reset(SolrInputDocument document) {
      if (this.documents == null) {
        this.documents = new ArrayList<>();
      } else {
        this.documents.clear();
      }
      this.documents.add(document);
      result = null;
    }
  }

  protected UpdateResponse runUpdate(List<SolrInputDocument> batchToWrite) {
    try {
      UpdateResponse result = solr.add(batchToWrite);
      SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.BATCHES_WRITTEN.toString(), 1);
      SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString(), batchToWrite.size());
      if (LOG.isDebugEnabled()) {
        SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.BATCH_WRITE_TIME.toString(), result.getElapsedTime());
      }
      return result;
    } catch (Throwable e) {
      if (e instanceof Exception) {
        setBatchWriteException((Exception) e);
      } else {
        setBatchWriteException(new Exception(e));
      }
      SolrRecordWriter.incrementCounter(taskId, getClass().getName() + ".errors", e.getClass().getName(), 1);
      LOG.error("Unable to process batch", e);
      return null;
    }
  }

  public BatchWriter(EmbeddedSolrServer solr, int batchSize, TaskID tid,
      int writerThreads, int queueSize) {
    this.solr = solr;
    this.writerThreads = writerThreads;
    this.queueSize = queueSize;
    taskId = tid;

    // we need to obtain the settings before the constructor
    if (writerThreads != 0) {
      batchPool = new ExecutorUtil.MDCAwareThreadPoolExecutor(writerThreads, writerThreads, 5,
          TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(queueSize),
          new ThreadPoolExecutor.CallerRunsPolicy());
    } else { // single threaded case
      batchPool = null;
    }
  }

  public void queueBatch(Collection<SolrInputDocument> batch)
      throws IOException, SolrServerException {

    throwIf();
    Batch b = new Batch(batch);
    if (batchPool != null) {
      batchPool.execute(b);
    } else { // single threaded case
      b.run();
      throwIf();
    }
  }

  public synchronized void close(TaskAttemptContext context)
      throws InterruptedException, SolrServerException, IOException {

    if (batchPool != null) {
      context.setStatus("Waiting for batches to complete");
      batchPool.shutdown();

      while (!batchPool.isTerminated()) {
        LOG.info(String.format(Locale.ENGLISH,
            "Waiting for %d items and %d threads to finish executing", batchPool
                .getQueue().size(), batchPool.getActiveCount()));
        batchPool.awaitTermination(5, TimeUnit.SECONDS);
      }
    }
    context.setStatus("Committing Solr Phase 1");
    solr.commit(true, false);
    context.setStatus("Optimizing Solr");
    int maxSegments = context.getConfiguration().getInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, 1);
    LOG.info("Optimizing Solr: forcing merge down to {} segments", maxSegments);
    long start = System.nanoTime();
    solr.optimize(true, false, maxSegments);
    context.getCounter(SolrCounters.class.getName(), SolrCounters.PHYSICAL_REDUCER_MERGE_TIME.toString()).increment(System.nanoTime() - start);
    float secs = (System.nanoTime() - start) / 1.0e9f; // nanoseconds to seconds; note '^' is XOR in Java, so (10^9) would be 3
    LOG.info("Optimizing Solr: done forcing merge down to {} segments in {} secs", maxSegments, secs);
    context.setStatus("Committing Solr Phase 2");
    solr.commit(true, false);
    context.setStatus("Shutting down Solr");
    solr.close();
  }

  /**
   * Throw a legal exception if a previous batch write had an exception. The
   * previous state is cleared. Uses {@link #batchWriteException} for the state
   * from the last exception.
   *
   * This will lose individual exceptions if the exceptions happen rapidly.
   *
   * @throws IOException On low level IO error
   * @throws SolrServerException On Solr Exception
   */
  private void throwIf() throws IOException, SolrServerException {

    final Exception last = batchWriteException;
    batchWriteException = null;

    if (last == null) {
      return;
    }
    if (last instanceof SolrServerException) {
      throw (SolrServerException) last;
    }
    if (last instanceof IOException) {
      throw (IOException) last;
    }
    throw new IOException("Batch Write Failure", last);
  }
}
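For orientation, a minimal sketch of how a record writer might drive this class. This is hypothetical wiring, not part of the removed code; the EmbeddedSolrServer `server`, the TaskAttemptContext `attemptContext`, and the thread/queue sizes are assumptions.

// Hypothetical driver for BatchWriter (all names are placeholders):
BatchWriter writer = new BatchWriter(server, 20,
    attemptContext.getTaskAttemptID().getTaskID(),
    2 /* writer threads */, 100 /* queue size */);
SolrInputDocument doc = new SolrInputDocument();
doc.addField("id", "doc-1");
writer.queueBatch(Collections.singletonList(doc)); // runs inline or on the pool
writer.close(attemptContext); // drains batches, commits, force-merges, commits again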
@@ -1,58 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.classification.InterfaceAudience;

/**
 * An InputStream that wraps a DataInput.
 * @see DataOutputOutputStream
 */
@InterfaceAudience.Private
public class DataInputInputStream extends InputStream {

  private DataInput in;

  /**
   * Construct an InputStream from the given DataInput. If 'in'
   * is already an InputStream, simply returns it. Otherwise, wraps
   * it in an InputStream.
   * @param in the DataInput to wrap
   * @return an InputStream instance that reads from 'in'
   */
  public static InputStream constructInputStream(DataInput in) {
    if (in instanceof InputStream) {
      return (InputStream)in;
    } else {
      return new DataInputInputStream(in);
    }
  }

  public DataInputInputStream(DataInput in) {
    this.in = in;
  }

  @Override
  public int read() throws IOException {
    return in.readUnsignedByte();
  }
}
@@ -1,66 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.classification.InterfaceAudience;

/**
 * OutputStream implementation that wraps a DataOutput.
 */
@InterfaceAudience.Private
public class DataOutputOutputStream extends OutputStream {

  private final DataOutput out;

  /**
   * Construct an OutputStream from the given DataOutput. If 'out'
   * is already an OutputStream, simply returns it. Otherwise, wraps
   * it in an OutputStream.
   * @param out the DataOutput to wrap
   * @return an OutputStream instance that outputs to 'out'
   */
  public static OutputStream constructOutputStream(DataOutput out) {
    if (out instanceof OutputStream) {
      return (OutputStream)out;
    } else {
      return new DataOutputOutputStream(out);
    }
  }

  private DataOutputOutputStream(DataOutput out) {
    this.out = out;
  }

  @Override
  public void write(int b) throws IOException {
    out.writeByte(b);
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    out.write(b, off, len);
  }

  @Override
  public void write(byte[] b) throws IOException {
    out.write(b);
  }
}
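A minimal round-trip sketch through the two wrappers, assuming it runs inside a method declared `throws IOException` with the usual `java.io` imports. Note that DataOutputStream and DataInputStream already are streams, so the factories return them unwrapped; a DataInput/DataOutput that is not itself a stream (e.g., a RandomAccessFile) would take the wrapping branch.

// Hypothetical round trip (not part of the removed code):
ByteArrayOutputStream buf = new ByteArrayOutputStream();
OutputStream os = DataOutputOutputStream.constructOutputStream(new DataOutputStream(buf));
os.write("hello".getBytes(StandardCharsets.UTF_8));

InputStream is = DataInputInputStream.constructInputStream(
    new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
int first = is.read(); // 'h', read back one unsigned byte at a time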
@@ -1,57 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.morphlines.solr.DocumentLoader;

/**
 * Prints documents to stdout instead of loading them into Solr for quicker turnaround during early
 * trial & debug sessions.
 */
final class DryRunDocumentLoader implements DocumentLoader {

  @Override
  public void beginTransaction() {
  }

  @Override
  public void load(SolrInputDocument doc) {
    System.out.println("dryrun: " + doc);
  }

  @Override
  public void commitTransaction() {
  }

  @Override
  public UpdateResponse rollbackTransaction() {
    return new UpdateResponse();
  }

  @Override
  public void shutdown() {
  }

  @Override
  public SolrPingResponse ping() {
    return new SolrPingResponse();
  }

}
@@ -1,182 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.fs.FileStatus;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.hadoop.MapReduceIndexerTool.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The optional (parallel) GoLive phase merges the output shards of the previous
 * phase into a set of live, customer-facing Solr servers, typically a SolrCloud.
 */
class GoLive {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  // TODO: handle clusters with replicas
  public boolean goLive(Options options, FileStatus[] outDirs) {
    LOG.info("Live merging of output shards into Solr cluster...");
    boolean success = false;
    long start = System.nanoTime();
    int concurrentMerges = options.goLiveThreads;
    ThreadPoolExecutor executor = new ExecutorUtil.MDCAwareThreadPoolExecutor(concurrentMerges,
        concurrentMerges, 1, TimeUnit.SECONDS,
        new LinkedBlockingQueue<Runnable>());

    try {
      CompletionService<Request> completionService = new ExecutorCompletionService<>(executor);
      Set<Future<Request>> pending = new HashSet<>();
      int cnt = -1;
      for (final FileStatus dir : outDirs) {

        LOG.debug("processing: " + dir.getPath());

        cnt++;
        List<String> urls = options.shardUrls.get(cnt);

        for (String url : urls) {

          String baseUrl = url;
          if (baseUrl.endsWith("/")) {
            baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
          }

          int lastPathIndex = baseUrl.lastIndexOf("/");
          if (lastPathIndex == -1) {
            LOG.error("Found unexpected shardurl, live merge failed: " + baseUrl);
            return false;
          }

          final String name = baseUrl.substring(lastPathIndex + 1);
          baseUrl = baseUrl.substring(0, lastPathIndex);
          final String mergeUrl = baseUrl;

          Callable<Request> task = () -> {
            Request req = new Request();
            LOG.info("Live merge " + dir.getPath() + " into " + mergeUrl);
            try (final HttpSolrClient client = new HttpSolrClient.Builder(mergeUrl).build()) {
              CoreAdminRequest.MergeIndexes mergeRequest = new CoreAdminRequest.MergeIndexes();
              mergeRequest.setCoreName(name);
              mergeRequest.setIndexDirs(Arrays.asList(dir.getPath().toString() + "/data/index"));
              mergeRequest.process(client);
              req.success = true;
            } catch (SolrServerException | IOException e) {
              req.e = e;
            }
            return req;
          };
          pending.add(completionService.submit(task));
        }
      }

      while (pending != null && pending.size() > 0) {
        try {
          Future<Request> future = completionService.take();
          if (future == null) break;
          pending.remove(future);

          try {
            Request req = future.get();

            if (!req.success) {
              // failed
              LOG.error("A live merge command failed", req.e);
              return false;
            }

          } catch (ExecutionException e) {
            LOG.error("Error sending live merge command", e);
            return false;
          }

        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          LOG.error("Live merge process interrupted", e);
          return false;
        }
      }

      cnt = -1;

      try {
        LOG.info("Committing live merge...");
        if (options.zkHost != null) {
          try (CloudSolrClient server = new CloudSolrClient.Builder().withZkHost(options.zkHost).build()) {
            server.setDefaultCollection(options.collection);
            server.commit();
          }
        } else {
          for (List<String> urls : options.shardUrls) {
            for (String url : urls) {
              // TODO: we should do these concurrently
              try (HttpSolrClient server = new HttpSolrClient.Builder(url).build()) {
                server.commit();
              }
            }
          }
        }
        LOG.info("Done committing live merge");
      } catch (Exception e) {
        LOG.error("Error sending commits to live Solr cluster", e);
        return false;
      }

      success = true;
      return true;
    } finally {
      ExecutorUtil.shutdownAndAwaitTermination(executor);
      float secs = (System.nanoTime() - start) / 1.0e9f; // nanoseconds to seconds; '^' is XOR in Java, so (10^9) would be 3
      LOG.info("Live merging of index shards into Solr cluster took " + secs + " secs");
      if (success) {
        LOG.info("Live merging completed successfully");
      } else {
        LOG.info("Live merging failed");
      }
    }

    // if an output dir does not exist, we should fail and do no merge?
  }

  private static final class Request {
    Exception e;
    boolean success = false;
  }

}
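For context, this is the core operation each GoLive task performs, shown once in isolation. A hedged sketch: the base URL, core name, and HDFS index directory below are placeholders, not values from the removed code.

// Hypothetical single merge request mirroring one GoLive task:
try (HttpSolrClient client = new HttpSolrClient.Builder("http://host:8983/solr").build()) {
  CoreAdminRequest.MergeIndexes merge = new CoreAdminRequest.MergeIndexes();
  merge.setCoreName("collection1_shard1_replica1");
  merge.setIndexDirs(Arrays.asList("hdfs://nn:8020/outdir/part-00000/data/index"));
  merge.process(client); // asks the live core to merge the offline index into itself
}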
@@ -1,41 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;


/**
 * Solr field names for metadata of an HDFS file.
 */
public interface HdfsFileFieldNames {

  public static final String FILE_UPLOAD_URL = "file_upload_url";
  public static final String FILE_DOWNLOAD_URL = "file_download_url";
  public static final String FILE_SCHEME = "file_scheme";
  public static final String FILE_HOST = "file_host";
  public static final String FILE_PORT = "file_port";
  public static final String FILE_PATH = "file_path";
  public static final String FILE_NAME = "file_name";
  public static final String FILE_LENGTH = "file_length";
  public static final String FILE_LAST_MODIFIED = "file_last_modified";
  public static final String FILE_OWNER = "file_owner";
  public static final String FILE_GROUP = "file_group";
  public static final String FILE_PERMISSIONS_USER = "file_permissions_user";
  public static final String FILE_PERMISSIONS_GROUP = "file_permissions_group";
  public static final String FILE_PERMISSIONS_OTHER = "file_permissions_other";
  public static final String FILE_PERMISSIONS_STICKYBIT = "file_permissions_stickybit";

}
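A sketch of how these constants might be used when indexing HDFS file metadata. The FileStatus variable `stat` is an assumption for illustration; the removed code populated these fields elsewhere.

// Hypothetical use of the field-name constants:
SolrInputDocument doc = new SolrInputDocument();
doc.addField(HdfsFileFieldNames.FILE_PATH, stat.getPath().toUri().getPath());
doc.addField(HdfsFileFieldNames.FILE_LENGTH, stat.getLen());
doc.addField(HdfsFileFieldNames.FILE_LAST_MODIFIED, stat.getModificationTime());
doc.addField(HdfsFileFieldNames.FILE_OWNER, stat.getOwner());
doc.addField(HdfsFileFieldNames.FILE_GROUP, stat.getGroup());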
@@ -1,159 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.lang.invoke.MethodHandles;
import java.util.Locale;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class runs a background thread that once every 60 seconds checks to see if
 * a progress report is needed. If a report is needed it is issued.
 *
 * A simple counter {@link #threadsNeedingHeartBeat} tracks the number of
 * threads requesting a heart beat.
 *
 * The expected usage pattern is
 *
 * <pre>
 *  try {
 *       heartBeater.needHeartBeat();
 *       do something that may take a while
 *    } finally {
 *       heartBeater.cancelHeartBeat();
 *    }
 * </pre>
 */
public class HeartBeater extends Thread {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Count of threads asking for a heart beat; at 0 no heart beat is done. This
   * could be an atomic long, but then mismatched need/cancel calls could
   * result in negative counts.
   */
  private volatile int threadsNeedingHeartBeat = 0;

  private Progressable progress;

  /**
   * The amount of time to wait between checks for the need to issue a heart
   * beat. In milliseconds.
   */
  private final long waitTimeMs = TimeUnit.MILLISECONDS.convert(60, TimeUnit.SECONDS);

  private final CountDownLatch isClosing = new CountDownLatch(1);

  /**
   * Create the heart beat thread, set it to daemon priority, and start it.
   * When the count in {@link #threadsNeedingHeartBeat} is positive, the
   * heart beat will be issued on the progress object every 60 seconds.
   */
  public HeartBeater(Progressable progress) {
    setDaemon(true);
    this.progress = progress;
    LOG.info("Heart beat reporting class is " + progress.getClass().getName());
    start();
  }

  public Progressable getProgress() {
    return progress;
  }

  public void setProgress(Progressable progress) {
    this.progress = progress;
  }

  @Override
  public void run() {
    LOG.info("HeartBeat thread running");
    while (true) {
      try {
        synchronized (this) {
          if (threadsNeedingHeartBeat > 0) {
            progress.progress();
            if (LOG.isInfoEnabled()) {
              LOG.info(String.format(Locale.ENGLISH, "Issuing heart beat for %d threads",
                  threadsNeedingHeartBeat));
            }
          } else {
            if (LOG.isInfoEnabled()) {
              LOG.info(String.format(Locale.ENGLISH, "heartbeat skipped count %d",
                  threadsNeedingHeartBeat));
            }
          }
        }
        if (isClosing.await(waitTimeMs, TimeUnit.MILLISECONDS)) {
          return;
        }
      } catch (Throwable e) {
        LOG.error("HeartBeat throwable", e);
      }
    }
  }

  /**
   * Inform the background thread that heart beats are to be issued, and issue
   * one immediately as well.
   */
  public synchronized void needHeartBeat() {
    threadsNeedingHeartBeat++;
    // Issue a progress report right away,
    // just in case the cancel comes before the background thread issues a
    // report.
    // If enough cases like this happen the 600 second timeout can occur
    progress.progress();
    if (threadsNeedingHeartBeat == 1) {
      // this.notify(); // wake up the heartbeater
    }
  }

  /**
   * Inform the background thread that this heart beat request is no longer
   * needed. This must be called at some point after each
   * {@link #needHeartBeat()} call.
   */
  public synchronized void cancelHeartBeat() {
    if (threadsNeedingHeartBeat > 0) {
      threadsNeedingHeartBeat--;
    } else {
      Exception e = new Exception("Dummy");
      e.fillInStackTrace();
      LOG.warn("extra call to cancelHeartBeat", e);
    }
  }

  public void setStatus(String status) {
    if (progress instanceof TaskInputOutputContext) {
      ((TaskInputOutputContext<?,?,?,?>) progress).setStatus(status);
    }
  }

  /** Releases any resources */
  public void close() {
    isClosing.countDown();
  }
}
@@ -1,67 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Random;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MR Mapper that randomizes a list of URLs.
 *
 * Mapper input is (offset, URL) pairs. Each such pair indicates a file to
 * index.
 *
 * Mapper output is (randomPosition, URL) pairs. The reducer receives these
 * pairs sorted by randomPosition.
 */
public class LineRandomizerMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

  private Random random;

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    random = createRandom(context);
  }

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    LOGGER.debug("map key: {}, value: {}", key, value);
    context.write(new LongWritable(random.nextLong()), value);
  }

  private Random createRandom(Context context) {
    long taskId = 0;
    if (context.getTaskAttemptID() != null) { // MRUnit returns null
      LOGGER.debug("context.getTaskAttemptID().getId(): {}", context.getTaskAttemptID().getId());
      LOGGER.debug("context.getTaskAttemptID().getTaskID().getId(): {}", context.getTaskAttemptID().getTaskID().getId());
      taskId = context.getTaskAttemptID().getTaskID().getId(); // taskId = 0, 1, ..., N
    }
    // create a good random seed, yet ensure a deterministic PRNG sequence for easy reproducibility
    return new Random(421439783L * (taskId + 1));
  }

}
@@ -1,48 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MR Reducer that randomizes a list of URLs.
 *
 * Reducer input is (randomPosition, URL) pairs. Each such pair indicates a file
 * to index.
 *
 * Reducer output is a list of URLs, each URL in a random position.
 */
public class LineRandomizerReducer extends Reducer<LongWritable, Text, Text, NullWritable> {

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    for (Text value : values) {
      LOGGER.debug("reduce key: {}, value: {}", key, value);
      context.write(value, NullWritable.get());
    }
  }
}
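How the pair fits together: the MapReduce framework sorts map output by key, so emitting each input line under a random LongWritable key and writing the values back out in the reducer yields a shuffled copy of the input list. A minimal job-wiring sketch under stated assumptions: the paths, job name, and single-reducer choice are placeholders, not from the removed code.

// Hypothetical job wiring for the randomizer phase:
Job job = Job.getInstance(new Configuration(), "randomize-input-list");
job.setMapperClass(LineRandomizerMapper.class);
job.setReducerClass(LineRandomizerReducer.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(1); // a single reducer yields one fully shuffled file list
FileInputFormat.addInputPath(job, new Path("hdfs://nn:8020/in/file-list.txt"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://nn:8020/tmp/randomized"));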
File diff suppressed because it is too large

@@ -1,233 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;

import net.sourceforge.argparse4j.inf.Argument;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.ArgumentType;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;

/**
 * ArgumentType subclass for the HDFS Path type, using a fluent-style API.
 */
public class PathArgumentType implements ArgumentType<Path> {

  private final Configuration conf;
  private FileSystem fs;
  private boolean acceptSystemIn = false;
  private boolean verifyExists = false;
  private boolean verifyNotExists = false;
  private boolean verifyIsFile = false;
  private boolean verifyIsDirectory = false;
  private boolean verifyCanRead = false;
  private boolean verifyCanWrite = false;
  private boolean verifyCanWriteParent = false;
  private boolean verifyCanExecute = false;
  private boolean verifyIsAbsolute = false;
  private boolean verifyHasScheme = false;
  private String verifyScheme = null;

  public PathArgumentType(Configuration conf) {
    this.conf = conf;
  }

  public PathArgumentType acceptSystemIn() {
    acceptSystemIn = true;
    return this;
  }

  public PathArgumentType verifyExists() {
    verifyExists = true;
    return this;
  }

  public PathArgumentType verifyNotExists() {
    verifyNotExists = true;
    return this;
  }

  public PathArgumentType verifyIsFile() {
    verifyIsFile = true;
    return this;
  }

  public PathArgumentType verifyIsDirectory() {
    verifyIsDirectory = true;
    return this;
  }

  public PathArgumentType verifyCanRead() {
    verifyCanRead = true;
    return this;
  }

  public PathArgumentType verifyCanWrite() {
    verifyCanWrite = true;
    return this;
  }

  public PathArgumentType verifyCanWriteParent() {
    verifyCanWriteParent = true;
    return this;
  }

  public PathArgumentType verifyCanExecute() {
    verifyCanExecute = true;
    return this;
  }

  public PathArgumentType verifyIsAbsolute() {
    verifyIsAbsolute = true;
    return this;
  }

  public PathArgumentType verifyHasScheme() {
    verifyHasScheme = true;
    return this;
  }

  public PathArgumentType verifyScheme(String scheme) {
    verifyScheme = scheme;
    return this;
  }

  @Override
  public Path convert(ArgumentParser parser, Argument arg, String value) throws ArgumentParserException {
    Path file = new Path(value);
    try {
      fs = file.getFileSystem(conf);
      if (verifyHasScheme && !isSystemIn(file)) {
        verifyHasScheme(parser, file);
      }
      if (verifyScheme != null && !isSystemIn(file)) {
        verifyScheme(parser, file);
      }
      if (verifyIsAbsolute && !isSystemIn(file)) {
        verifyIsAbsolute(parser, file);
      }
      if (verifyExists && !isSystemIn(file)) {
        verifyExists(parser, file);
      }
      if (verifyNotExists && !isSystemIn(file)) {
        verifyNotExists(parser, file);
      }
      if (verifyIsFile && !isSystemIn(file)) {
        verifyIsFile(parser, file);
      }
      if (verifyIsDirectory && !isSystemIn(file)) {
        verifyIsDirectory(parser, file);
      }
      if (verifyCanRead && !isSystemIn(file)) {
        verifyCanRead(parser, file);
      }
      if (verifyCanWrite && !isSystemIn(file)) {
        verifyCanWrite(parser, file);
      }
      if (verifyCanWriteParent && !isSystemIn(file)) {
        verifyCanWriteParent(parser, file);
      }
      if (verifyCanExecute && !isSystemIn(file)) {
        verifyCanExecute(parser, file);
      }
    } catch (IOException e) {
      throw new ArgumentParserException(e, parser);
    }
    return file;
  }

  private void verifyExists(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    if (!fs.exists(file)) {
      throw new ArgumentParserException("File not found: " + file, parser);
    }
  }

  private void verifyNotExists(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    if (fs.exists(file)) {
      throw new ArgumentParserException("File found: " + file, parser);
    }
  }

  private void verifyIsFile(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    if (!fs.isFile(file)) {
      throw new ArgumentParserException("Not a file: " + file, parser);
    }
  }

  private void verifyIsDirectory(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    if (!fs.isDirectory(file)) {
      throw new ArgumentParserException("Not a directory: " + file, parser);
    }
  }

  private void verifyCanRead(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    verifyExists(parser, file);
    if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.READ)) {
      throw new ArgumentParserException("Insufficient permissions to read file: " + file, parser);
    }
  }

  private void verifyCanWrite(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    verifyExists(parser, file);
    if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.WRITE)) {
      throw new ArgumentParserException("Insufficient permissions to write file: " + file, parser);
    }
  }

  private void verifyCanWriteParent(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    Path parent = file.getParent();
    if (parent == null || !fs.exists(parent) || !fs.getFileStatus(parent).getPermission().getUserAction().implies(FsAction.WRITE)) {
      throw new ArgumentParserException("Cannot write parent of file: " + file, parser);
    }
  }

  private void verifyCanExecute(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
    verifyExists(parser, file);
    if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.EXECUTE)) {
      throw new ArgumentParserException("Insufficient permissions to execute file: " + file, parser);
    }
  }

  private void verifyIsAbsolute(ArgumentParser parser, Path file) throws ArgumentParserException {
    if (!file.isAbsolute()) {
      throw new ArgumentParserException("Not an absolute file: " + file, parser);
    }
  }

  private void verifyHasScheme(ArgumentParser parser, Path file) throws ArgumentParserException {
    if (file.toUri().getScheme() == null) {
      throw new ArgumentParserException("URI scheme is missing in path: " + file, parser);
    }
  }

  private void verifyScheme(ArgumentParser parser, Path file) throws ArgumentParserException {
    if (!verifyScheme.equals(file.toUri().getScheme())) {
      throw new ArgumentParserException("Scheme of path: " + file + " must be: " + verifyScheme, parser);
    }
  }

  private boolean isSystemIn(Path file) {
    return acceptSystemIn && file.toString().equals("-");
  }

}
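A sketch of how the fluent API might be plugged into an argparse4j parser. This is hypothetical usage, not taken from the removed tool; the parser name and argument name are assumptions, and it presumes the pre-0.5 argparse4j entry point `ArgumentParsers.newArgumentParser`.

// Hypothetical argparse4j wiring:
ArgumentParser parser = ArgumentParsers.newArgumentParser("tool");
parser.addArgument("--output-dir")
    .type(new PathArgumentType(new Configuration())
        .verifyHasScheme()
        .verifyIsAbsolute()
        .verifyCanWriteParent())
    .required(true);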
@@ -1,130 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.namenode.NameNode;

/**
 * Extracts various components of an HDFS Path
 */
public final class PathParts {

  private final String uploadURL;
  private final Configuration conf;
  private final FileSystem fs;
  private final Path normalizedPath;
  private FileStatus stats;

  public PathParts(String uploadURL, Configuration conf) throws IOException {
    if (uploadURL == null) {
      throw new IllegalArgumentException("Path must not be null: " + uploadURL);
    }
    this.uploadURL = uploadURL;
    if (conf == null) {
      throw new IllegalArgumentException("Configuration must not be null: " + uploadURL);
    }
    this.conf = conf;
    URI uri = stringToUri(uploadURL);
    this.fs = FileSystem.get(uri, conf);
    if (fs == null) {
      throw new IllegalArgumentException("File system must not be null: " + uploadURL);
    }
    this.normalizedPath = fs.makeQualified(new Path(uri));
    if (!normalizedPath.isAbsolute()) {
      throw new IllegalArgumentException("Path must be absolute: " + uploadURL);
    }
    if (getScheme() == null) {
      throw new IllegalArgumentException("Scheme must not be null: " + uploadURL);
    }
    if (getHost() == null) {
      throw new IllegalArgumentException("Host must not be null: " + uploadURL);
    }
    if (getPort() < 0) {
      throw new IllegalArgumentException("Port must not be negative: " + uploadURL);
    }
  }

  public String getUploadURL() {
    return uploadURL;
  }

  public Path getUploadPath() {
    return new Path(getUploadURL());
  }

  public String getURIPath() {
    return normalizedPath.toUri().getPath();
  }

  public String getName() {
    return normalizedPath.getName();
  }

  public String getScheme() {
    return normalizedPath.toUri().getScheme();
  }

  public String getHost() {
    return normalizedPath.toUri().getHost();
  }

  public int getPort() {
    int port = normalizedPath.toUri().getPort();
    if (port == -1) {
      port = fs.getWorkingDirectory().toUri().getPort();
      if (port == -1) {
        port = NameNode.DEFAULT_PORT;
      }
    }
    return port;
  }

  public String getId() {
    return getScheme() + "://" + getHost() + ":" + getPort() + getURIPath();
  }

  public String getDownloadURL() {
    return getId();
  }

  public Configuration getConfiguration() {
    return conf;
  }

  public FileSystem getFileSystem() {
    return fs;
  }

  public FileStatus getFileStatus() throws IOException {
    if (stats == null) {
      stats = getFileSystem().getFileStatus(getUploadPath());
    }
    return stats;
  }

  private URI stringToUri(String pathString) {
    //return new Path(pathString).toUri().normalize();
    return URI.create(pathString).normalize();
  }
}
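A sketch of pulling the components out of an upload URL, assuming it runs inside a method declared `throws IOException` and that the namenode URL shown is a placeholder. The comments reflect what the accessors above would return for this input.

// Hypothetical PathParts usage:
PathParts parts = new PathParts("hdfs://namenode:8020/user/foo/docs/report.pdf",
    new Configuration());
String scheme = parts.getScheme(); // "hdfs"
String host = parts.getHost();     // "namenode"
int port = parts.getPort();        // 8020
String name = parts.getName();     // "report.pdf"
String id = parts.getId();         // "hdfs://namenode:8020/user/foo/docs/report.pdf"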
@@ -1,143 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.Hash;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MapReduce partitioner that partitions the Mapper output such that each
 * SolrInputDocument gets sent to the SolrCloud shard that it would have been
 * sent to if the document were ingested via the standard SolrCloud Near Real
 * Time (NRT) API.
 *
 * In other words, this class implements the same partitioning semantics as the
 * standard SolrCloud NRT API. This makes it possible to mix batch updates from
 * MapReduce ingestion with updates from standard NRT ingestion on the same
 * SolrCloud cluster, using identical unique document keys.
 */
public class SolrCloudPartitioner extends Partitioner<Text, SolrInputDocumentWritable> implements Configurable {

  private Configuration conf;
  private DocCollection docCollection;
  private Map<String, Integer> shardNumbers;
  private int shards = 0;
  private final SolrParams emptySolrParams = new MapSolrParams(Collections.emptyMap());

  public static final String SHARDS = SolrCloudPartitioner.class.getName() + ".shards";
  public static final String ZKHOST = SolrCloudPartitioner.class.getName() + ".zkHost";
  public static final String COLLECTION = SolrCloudPartitioner.class.getName() + ".collection";

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public SolrCloudPartitioner() {}

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.shards = conf.getInt(SHARDS, -1);
    if (shards <= 0) {
      throw new IllegalArgumentException("Illegal shards: " + shards);
    }
    String zkHost = conf.get(ZKHOST);
    if (zkHost == null) {
      throw new IllegalArgumentException("zkHost must not be null");
    }
    String collection = conf.get(COLLECTION);
    if (collection == null) {
      throw new IllegalArgumentException("collection must not be null");
    }
    LOG.info("Using SolrCloud zkHost: {}, collection: {}", zkHost, collection);
    docCollection = new ZooKeeperInspector().extractDocCollection(zkHost, collection);
    if (docCollection == null) {
      throw new IllegalArgumentException("docCollection must not be null");
    }
    if (docCollection.getSlicesMap().size() != shards) {
      throw new IllegalArgumentException("Incompatible shards: " + shards + " for docCollection: " + docCollection);
    }
    List<Slice> slices = new ZooKeeperInspector().getSortedSlices(docCollection.getSlices());
    if (slices.size() != shards) {
      throw new IllegalStateException("Incompatible sorted shards: " + shards + " for docCollection: " + docCollection);
    }
    shardNumbers = new HashMap<>(10 * slices.size()); // sparse for performance
    for (int i = 0; i < slices.size(); i++) {
      shardNumbers.put(slices.get(i).getName(), i);
    }
    LOG.debug("Using SolrCloud docCollection: {}", docCollection);
    DocRouter docRouter = docCollection.getRouter();
    if (docRouter == null) {
      throw new IllegalArgumentException("docRouter must not be null");
    }
    LOG.info("Using SolrCloud docRouterClass: {}", docRouter.getClass());
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public int getPartition(Text key, SolrInputDocumentWritable value, int numPartitions) {
    DocRouter docRouter = docCollection.getRouter();
    SolrInputDocument doc = value.getSolrInputDocument();
    String keyStr = key.toString();

    // TODO: scalability: replace linear search in HashBasedRouter.hashToSlice() with binary search on sorted hash ranges
    Slice slice = docRouter.getTargetSlice(keyStr, doc, null, emptySolrParams, docCollection);

    // LOG.info("slice: {}", slice);
    if (slice == null) {
      throw new IllegalStateException("No matching slice found! The slice seems unavailable. docRouterClass: "
          + docRouter.getClass().getName());
    }
    int rootShard = shardNumbers.get(slice.getName());
    if (rootShard < 0 || rootShard >= shards) {
      throw new IllegalStateException("Illegal shard number " + rootShard + " for slice: " + slice + ", docCollection: "
          + docCollection);
    }

    // map doc to micro shard aka leaf shard, akin to HashBasedRouter.sliceHash()
    // taking into account mtree merge algorithm
    assert numPartitions % shards == 0; // Also note that numPartitions is equal to the number of reducers
    int hashCode = Hash.murmurhash3_x86_32(keyStr, 0, keyStr.length(), 0);
    int offset = (hashCode & Integer.MAX_VALUE) % (numPartitions / shards);
    int microShard = (rootShard * (numPartitions / shards)) + offset;
    // LOG.info("Subpartitions rootShard: {}, offset: {}", rootShard, offset);
    // LOG.info("Partitioned to p: {} for numPartitions: {}, shards: {}, key: {}, value: {}", microShard, numPartitions, shards, key, value);

    assert microShard >= 0 && microShard < numPartitions;
    return microShard;
  }

}
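A worked example of the micro-shard arithmetic in getPartition(); the numbers are illustrative, not from the removed code.

// Suppose shards = 2 and numPartitions = 8, i.e. 8 reducers and 4 micro
// shards per root shard. For a key whose router lookup yields rootShard = 1:
//   offset     = (hashCode & Integer.MAX_VALUE) % (8 / 2)  // a value in [0, 3]
//   microShard = 1 * (8 / 2) + offset                      // a value in [4, 7]
// so every document routed to root shard 1 lands on one of reducers 4..7,
// and the reducer-to-root-shard mapping stays contiguous for the tree merge.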
@@ -1,53 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

public enum SolrCounters {

  DOCUMENTS_WRITTEN (getClassName(SolrReducer.class)
      + ": Number of documents processed"),

  BATCHES_WRITTEN (getClassName(SolrReducer.class)
      + ": Number of document batches processed"),

  BATCH_WRITE_TIME (getClassName(SolrReducer.class)
      + ": Time spent by reducers writing batches [ms]"),

  PHYSICAL_REDUCER_MERGE_TIME (getClassName(SolrReducer.class)
      + ": Time spent by reducers on physical merges [ms]"),

  LOGICAL_TREE_MERGE_TIME (getClassName(TreeMergeMapper.class)
      + ": Time spent on logical tree merges [ms]"),

  PHYSICAL_TREE_MERGE_TIME (getClassName(TreeMergeMapper.class)
      + ": Time spent on physical tree merges [ms]");

  private final String label;

  private SolrCounters(String label) {
    this.label = label;
  }

  public String toString() {
    return label;
  }

  private static String getClassName(Class<?> clazz) {
    return Utils.getShortClassName(clazz);
  }

}
@@ -1,66 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.FastOutputStream;
import org.apache.solr.common.util.JavaBinCodec;

public class SolrInputDocumentWritable implements Writable {
  private SolrInputDocument sid;

  public SolrInputDocumentWritable() {
  }

  public SolrInputDocumentWritable(SolrInputDocument sid) {
    this.sid = sid;
  }

  public SolrInputDocument getSolrInputDocument() {
    return sid;
  }

  @Override
  public String toString() {
    return sid.toString();
  }

  @Override
  public void write(DataOutput out) throws IOException {
    JavaBinCodec codec = new JavaBinCodec();
    FastOutputStream daos = FastOutputStream.wrap(DataOutputOutputStream.constructOutputStream(out));
    codec.init(daos);
    try {
      codec.writeVal(sid);
    } finally {
      daos.flushBuffer();
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    JavaBinCodec codec = new JavaBinCodec();
    UnbufferedDataInputInputStream dis = new UnbufferedDataInputInputStream(in);
    sid = (SolrInputDocument)codec.readVal(dis);
  }

}
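A round-trip sketch through Hadoop's Writable contract, assuming it runs inside a method declared `throws IOException` with the usual `java.io` imports; the field values are placeholders.

// Hypothetical serialize/deserialize round trip:
SolrInputDocument doc = new SolrInputDocument();
doc.addField("id", "doc-1");

ByteArrayOutputStream bytes = new ByteArrayOutputStream();
new SolrInputDocumentWritable(doc).write(new DataOutputStream(bytes));

SolrInputDocumentWritable copy = new SolrInputDocumentWritable();
copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
// copy.getSolrInputDocument() now holds an equivalent document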
@@ -1,39 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SolrMapper<KEYIN, VALUEIN> extends Mapper<KEYIN, VALUEIN, Text, SolrInputDocumentWritable> {

  private Path solrHomeDir;

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    Utils.getLogConfigFile(context.getConfiguration());
    super.setup(context);
    solrHomeDir = SolrRecordWriter.findSolrConfig(context.getConfiguration());
  }

  protected Path getSolrHomeDir() {
    return solrHomeDir;
  }
}
@@ -1,280 +0,0 @@
/* Apache License, Version 2.0 header, identical to the one above. */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SolrOutputFormat<K, V> extends FileOutputFormat<K, V> {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * The parameter used to pass the solr config zip file information. This will
   * be the hdfs path to the configuration zip file
   */
  public static final String SETUP_OK = "solr.output.format.setup";

  /** The key used to pass the zip file name through the configuration. */
  public static final String ZIP_NAME = "solr.zip.name";

  /**
   * The base name of the zip file containing the configuration information.
   * This file is passed via the distributed cache using a unique name, obtained
   * via {@link #getZipName(Configuration jobConf)}.
   */
  public static final String ZIP_FILE_BASE_NAME = "solr.zip";

  /**
   * The key used to pass the boolean configuration parameter that selects
   * regular or zip file output.
   */
  public static final String OUTPUT_ZIP_FILE = "solr.output.zip.format";

  static int defaultSolrWriterThreadCount = 0;

  public static final String SOLR_WRITER_THREAD_COUNT = "solr.record.writer.num.threads";

  static int defaultSolrWriterQueueSize = 1;

  public static final String SOLR_WRITER_QUEUE_SIZE = "solr.record.writer.max.queues.size";

  static int defaultSolrBatchSize = 20;

  public static final String SOLR_RECORD_WRITER_BATCH_SIZE = "solr.record.writer.batch.size";

  public static final String SOLR_RECORD_WRITER_MAX_SEGMENTS = "solr.record.writer.maxSegments";

  public static String getSetupOk() {
    return SETUP_OK;
  }

  /** Set the number of threads used for index writing */
  public static void setSolrWriterThreadCount(int count, Configuration conf) {
    conf.setInt(SOLR_WRITER_THREAD_COUNT, count);
  }

  /** Get the number of threads used for index writing */
  public static int getSolrWriterThreadCount(Configuration conf) {
    return conf.getInt(SOLR_WRITER_THREAD_COUNT, defaultSolrWriterThreadCount);
  }

  /**
   * Set the maximum size of the queue for documents to be written to the
   * index.
   */
  public static void setSolrWriterQueueSize(int count, Configuration conf) {
    conf.setInt(SOLR_WRITER_QUEUE_SIZE, count);
  }

  /** Return the maximum size for the number of documents pending index writing. */
  public static int getSolrWriterQueueSize(Configuration conf) {
    return conf.getInt(SOLR_WRITER_QUEUE_SIZE, defaultSolrWriterQueueSize);
  }

  /**
   * Return the file name portion of the configuration zip file, from the
   * configuration.
   */
  public static String getZipName(Configuration conf) {
    return conf.get(ZIP_NAME, ZIP_FILE_BASE_NAME);
  }

  /**
   * Configure the job to output zip files of the output index, or full
   * directory trees. Zip files are about 1/5th the size of the raw index, and
   * much faster to write, but take more CPU to create.
   *
   * @param output true if should output zip files
   * @param conf to use
   */
  public static void setOutputZipFormat(boolean output, Configuration conf) {
public static void setOutputZipFormat(boolean output, Configuration conf) {
|
||||
conf.setBoolean(OUTPUT_ZIP_FILE, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* return true if the output should be a zip file of the index, rather than
|
||||
* the raw index
|
||||
*
|
||||
* @param conf to use
|
||||
* @return true if output zip files is on
|
||||
*/
|
||||
public static boolean isOutputZipFormat(Configuration conf) {
|
||||
return conf.getBoolean(OUTPUT_ZIP_FILE, false);
|
||||
}
|
||||
|
||||
public static String getOutputName(JobContext job) {
|
||||
return FileOutputFormat.getOutputName(job);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkOutputSpecs(JobContext job) throws IOException {
|
||||
super.checkOutputSpecs(job);
|
||||
if (job.getConfiguration().get(SETUP_OK) == null) {
|
||||
throw new IOException("Solr home cache not set up!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
|
||||
Utils.getLogConfigFile(context.getConfiguration());
|
||||
Path workDir = getDefaultWorkFile(context, "");
|
||||
int batchSize = getBatchSize(context.getConfiguration());
|
||||
return new SolrRecordWriter<>(context, workDir, batchSize);
|
||||
}
|
||||
|
||||
public static void setupSolrHomeCache(File solrHomeDir, Job job) throws IOException{
|
||||
File solrHomeZip = createSolrHomeZip(solrHomeDir);
|
||||
addSolrConfToDistributedCache(job, solrHomeZip);
|
||||
}
|
||||
|
||||
public static File createSolrHomeZip(File solrHomeDir) throws IOException {
|
||||
return createSolrHomeZip(solrHomeDir, false);
|
||||
}
|
||||
|
||||
private static File createSolrHomeZip(File solrHomeDir, boolean safeToModify) throws IOException {
|
||||
if (solrHomeDir == null || !(solrHomeDir.exists() && solrHomeDir.isDirectory())) {
|
||||
throw new IOException("Invalid solr home: " + solrHomeDir);
|
||||
}
|
||||
File solrHomeZip = File.createTempFile("solr", ".zip");
|
||||
createZip(solrHomeDir, solrHomeZip);
|
||||
return solrHomeZip;
|
||||
}
|
||||
|
||||
public static void addSolrConfToDistributedCache(Job job, File solrHomeZip)
|
||||
throws IOException {
|
||||
// Make a reasonably unique name for the zip file in the distributed cache
|
||||
// to avoid collisions if multiple jobs are running.
|
||||
String hdfsZipName = UUID.randomUUID().toString() + '.'
|
||||
+ ZIP_FILE_BASE_NAME;
|
||||
Configuration jobConf = job.getConfiguration();
|
||||
jobConf.set(ZIP_NAME, hdfsZipName);
|
||||
|
||||
Path zipPath = new Path("/tmp", getZipName(jobConf));
|
||||
FileSystem fs = FileSystem.get(jobConf);
|
||||
fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
|
||||
final URI baseZipUrl = fs.getUri().resolve(
|
||||
zipPath.toString() + '#' + getZipName(jobConf));
|
||||
|
||||
DistributedCache.addCacheArchive(baseZipUrl, jobConf);
|
||||
LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
|
||||
LOG.debug("Set zipPath: {}", zipPath);
|
||||
// Actually send the path for the configuration zip file
|
||||
jobConf.set(SETUP_OK, zipPath.toString());
|
||||
}
|
||||
|
||||
private static void createZip(File dir, File out) throws IOException {
|
||||
HashSet<File> files = new HashSet<>();
|
||||
// take only conf/ and lib/
|
||||
for (String allowedDirectory : SolrRecordWriter
|
||||
.getAllowedConfigDirectories()) {
|
||||
File configDir = new File(dir, allowedDirectory);
|
||||
boolean configDirExists;
|
||||
/** If the directory does not exist, and is required, bail out */
|
||||
if (!(configDirExists = configDir.exists())
|
||||
&& SolrRecordWriter.isRequiredConfigDirectory(allowedDirectory)) {
|
||||
throw new IOException(String.format(Locale.ENGLISH,
|
||||
"required configuration directory %s is not present in %s",
|
||||
allowedDirectory, dir));
|
||||
}
|
||||
if (!configDirExists) {
|
||||
continue;
|
||||
}
|
||||
listFiles(configDir, files); // Store the files in the existing, allowed
|
||||
// directory configDir, in the list of files
|
||||
// to store in the zip file
|
||||
}
|
||||
|
||||
Files.deleteIfExists(out.toPath());
|
||||
int subst = dir.toString().length();
|
||||
ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out));
|
||||
byte[] buf = new byte[1024];
|
||||
for (File f : files) {
|
||||
ZipEntry ze = new ZipEntry(f.toString().substring(subst));
|
||||
zos.putNextEntry(ze);
|
||||
InputStream is = new FileInputStream(f);
|
||||
int cnt;
|
||||
while ((cnt = is.read(buf)) >= 0) {
|
||||
zos.write(buf, 0, cnt);
|
||||
}
|
||||
is.close();
|
||||
zos.flush();
|
||||
zos.closeEntry();
|
||||
}
|
||||
|
||||
ZipEntry ze = new ZipEntry("solr.xml");
|
||||
zos.putNextEntry(ze);
|
||||
zos.write("<solr></solr>".getBytes("UTF-8"));
|
||||
zos.flush();
|
||||
zos.closeEntry();
|
||||
zos.close();
|
||||
}
|
||||
|
||||
private static void listFiles(File dir, Set<File> files) throws IOException {
|
||||
File[] list = dir.listFiles();
|
||||
|
||||
if (list == null && dir.isFile()) {
|
||||
files.add(dir);
|
||||
return;
|
||||
}
|
||||
|
||||
for (File f : list) {
|
||||
if (f.isFile()) {
|
||||
files.add(f);
|
||||
} else {
|
||||
listFiles(f, files);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static int getBatchSize(Configuration jobConf) {
|
||||
// TODO Auto-generated method stub
|
||||
return jobConf.getInt(SolrOutputFormat.SOLR_RECORD_WRITER_BATCH_SIZE,
|
||||
defaultSolrBatchSize);
|
||||
}
|
||||
|
||||
public static void setBatchSize(int count, Configuration jobConf) {
|
||||
jobConf.setInt(SOLR_RECORD_WRITER_BATCH_SIZE, count);
|
||||
}
|
||||
|
||||
}
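Taken together, these static helpers are what a driver calls before submitting the job. A rough sketch of that wiring, under the assumption that a local solr home with conf/ (and optionally lib/) exists; the job name and path below are invented placeholders:

package org.apache.solr.hadoop;

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SolrIndexJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "solr-index-build"); // illustrative job name
    job.setOutputFormatClass(SolrOutputFormat.class);

    // Ship conf/ and lib/ from the local solr home to the tasks via the
    // distributed cache; this also sets SETUP_OK, which checkOutputSpecs() verifies.
    SolrOutputFormat.setupSolrHomeCache(new File("/path/to/solrhome"), job); // hypothetical path

    // Optional tuning of the record writer.
    SolrOutputFormat.setBatchSize(100, job.getConfiguration());
    SolrOutputFormat.setSolrWriterThreadCount(2, job.getConfiguration());
  }
}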
@@ -1,479 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.HdfsDirectoryFactory;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class SolrRecordWriter<K, V> extends RecordWriter<K, V> {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public final static List<String> allowedConfigDirectories = new ArrayList<>(
      Arrays.asList(new String[] { "conf", "lib", "solr.xml", "core1" }));

  public final static Set<String> requiredConfigDirectories = new HashSet<>();

  static {
    requiredConfigDirectories.add("conf");
  }

  /**
   * Return the list of directory names that may be included in the
   * configuration data passed to the tasks.
   *
   * @return an UnmodifiableList of directory names
   */
  public static List<String> getAllowedConfigDirectories() {
    return Collections.unmodifiableList(allowedConfigDirectories);
  }

  /**
   * Check whether the passed-in directory is required to be present in the
   * configuration data set.
   *
   * @param directory The directory to check
   * @return true if the directory is required.
   */
  public static boolean isRequiredConfigDirectory(final String directory) {
    return requiredConfigDirectories.contains(directory);
  }

  /** The path that the final index will be written to */

  /** The location in a local temporary directory that the index is built in. */

  // /**
  //  * If true, create a zip file of the completed index in the final storage
  //  * location A .zip will be appended to the final output name if it is not
  //  * already present.
  //  */
  // private boolean outputZipFile = false;

  private final HeartBeater heartBeater;
  private final BatchWriter batchWriter;
  private final List<SolrInputDocument> batch;
  private final int batchSize;
  private long numDocsWritten = 0;
  private long nextLogTime = System.nanoTime();

  private static HashMap<TaskID, Reducer<?,?,?,?>.Context> contextMap = new HashMap<>();

  public SolrRecordWriter(TaskAttemptContext context, Path outputShardDir, int batchSize) {
    this.batchSize = batchSize;
    this.batch = new ArrayList<>(batchSize);
    Configuration conf = context.getConfiguration();

    // setLogLevel("org.apache.solr.core", "WARN");
    // setLogLevel("org.apache.solr.update", "WARN");

    heartBeater = new HeartBeater(context);
    try {
      heartBeater.needHeartBeat();

      Path solrHomeDir = SolrRecordWriter.findSolrConfig(conf);
      FileSystem fs = outputShardDir.getFileSystem(conf);
      EmbeddedSolrServer solr = createEmbeddedSolrServer(solrHomeDir, fs, outputShardDir);
      batchWriter = new BatchWriter(solr, batchSize,
          context.getTaskAttemptID().getTaskID(),
          SolrOutputFormat.getSolrWriterThreadCount(conf),
          SolrOutputFormat.getSolrWriterQueueSize(conf));

    } catch (Exception e) {
      throw new IllegalStateException(String.format(Locale.ENGLISH,
          "Failed to initialize record writer for %s, %s", context.getJobName(), conf
              .get("mapred.task.id")), e);
    } finally {
      heartBeater.cancelHeartBeat();
    }
  }

  public static EmbeddedSolrServer createEmbeddedSolrServer(Path solrHomeDir, FileSystem fs, Path outputShardDir)
      throws IOException {

    LOG.info("Creating embedded Solr server with solrHomeDir: " + solrHomeDir + ", fs: " + fs + ", outputShardDir: " + outputShardDir);

    Path solrDataDir = new Path(outputShardDir, "data");

    String dataDirStr = solrDataDir.toUri().toString();

    SolrResourceLoader loader = new SolrResourceLoader(Paths.get(solrHomeDir.toString()), null, null);

    LOG.info(String
        .format(Locale.ENGLISH,
            "Constructed instance information solr.home %s (%s), instance dir %s, conf dir %s, writing index to solr.data.dir %s, with permdir %s",
            solrHomeDir, solrHomeDir.toUri(), loader.getInstancePath(),
            loader.getConfigDir(), dataDirStr, outputShardDir));

    // TODO: This is fragile and should be well documented
    System.setProperty("solr.directoryFactory", HdfsDirectoryFactory.class.getName());
    System.setProperty("solr.lock.type", DirectoryFactory.LOCK_TYPE_HDFS);
    System.setProperty("solr.hdfs.nrtcachingdirectory", "false");
    System.setProperty("solr.hdfs.blockcache.enabled", "false");
    System.setProperty("solr.autoCommit.maxTime", "600000");
    System.setProperty("solr.autoSoftCommit.maxTime", "-1");

    CoreContainer container = new CoreContainer(loader);
    container.load();
    SolrCore core = container.create("", ImmutableMap.of(CoreDescriptor.CORE_DATADIR, dataDirStr));

    if (!(core.getDirectoryFactory() instanceof HdfsDirectoryFactory)) {
      throw new UnsupportedOperationException(
          "Invalid configuration. Currently, the only DirectoryFactory supported is "
              + HdfsDirectoryFactory.class.getSimpleName());
    }

    EmbeddedSolrServer solr = new EmbeddedSolrServer(container, "");
    return solr;
  }

  public static void incrementCounter(TaskID taskId, String groupName, String counterName, long incr) {
    Reducer<?,?,?,?>.Context context = contextMap.get(taskId);
    if (context != null) {
      context.getCounter(groupName, counterName).increment(incr);
    }
  }

  public static void incrementCounter(TaskID taskId, Enum<?> counterName, long incr) {
    Reducer<?,?,?,?>.Context context = contextMap.get(taskId);
    if (context != null) {
      context.getCounter(counterName).increment(incr);
    }
  }

  public static void addReducerContext(Reducer<?,?,?,?>.Context context) {
    TaskID taskID = context.getTaskAttemptID().getTaskID();
    contextMap.put(taskID, context);
  }

  public static Path findSolrConfig(Configuration conf) throws IOException {
    // FIXME when mrunit supports the new cache apis
    //URI[] localArchives = context.getCacheArchives();
    Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
    for (Path unpackedDir : localArchives) {
      if (unpackedDir.getName().equals(SolrOutputFormat.getZipName(conf))) {
        LOG.info("Using this unpacked directory as solr home: {}", unpackedDir);
        return unpackedDir;
      }
    }
    throw new IOException(String.format(Locale.ENGLISH,
        "No local cache archives, where is %s:%s", SolrOutputFormat
            .getSetupOk(), SolrOutputFormat.getZipName(conf)));
  }

  /**
   * Write a record. This method accumulates records into a batch, and when
   * {@link #batchSize} items are present flushes it to the indexer. The writes
   * can take a substantial amount of time, depending on {@link #batchSize}. If
   * there is heavy disk contention the writes may take more than the 600 second
   * default timeout.
   */
  @Override
  public void write(K key, V value) throws IOException {
    heartBeater.needHeartBeat();
    try {
      try {
        SolrInputDocumentWritable sidw = (SolrInputDocumentWritable) value;
        batch.add(sidw.getSolrInputDocument());
        if (batch.size() >= batchSize) {
          batchWriter.queueBatch(batch);
          numDocsWritten += batch.size();
          if (System.nanoTime() >= nextLogTime) {
            LOG.info("docsWritten: {}", numDocsWritten);
            nextLogTime += TimeUnit.NANOSECONDS.convert(10, TimeUnit.SECONDS);
          }
          batch.clear();
        }
      } catch (SolrServerException e) {
        throw new IOException(e);
      }
    } finally {
      heartBeater.cancelHeartBeat();
    }

  }

  @Override
  public void close(TaskAttemptContext context) throws IOException, InterruptedException {
    if (context != null) {
      heartBeater.setProgress(context);
    }
    try {
      heartBeater.needHeartBeat();
      if (batch.size() > 0) {
        batchWriter.queueBatch(batch);
        numDocsWritten += batch.size();
        batch.clear();
      }
      LOG.info("docsWritten: {}", numDocsWritten);
      batchWriter.close(context);
      // if (outputZipFile) {
      //   context.setStatus("Writing Zip");
      //   packZipFile(); // Written to the perm location
      // } else {
      //   context.setStatus("Copying Index");
      //   fs.completeLocalOutput(perm, temp); // copy to dfs
      // }
    } catch (Exception e) {
      if (e instanceof IOException) {
        throw (IOException) e;
      }
      throw new IOException(e);
    } finally {
      heartBeater.cancelHeartBeat();
      heartBeater.close();
      // File tempFile = new File(temp.toString());
      // if (tempFile.exists()) {
      //   FileUtils.forceDelete(new File(temp.toString()));
      // }
    }

    context.setStatus("Done");
  }

  // private void packZipFile() throws IOException {
  //   FSDataOutputStream out = null;
  //   ZipOutputStream zos = null;
  //   int zipCount = 0;
  //   LOG.info("Packing zip file for " + perm);
  //   try {
  //     out = fs.create(perm, false);
  //     zos = new ZipOutputStream(out);
  //
  //     String name = perm.getName().replaceAll(".zip$", "");
  //     LOG.info("adding index directory" + temp);
  //     zipCount = zipDirectory(conf, zos, name, temp.toString(), temp);
  //     /**
  //     for (String configDir : allowedConfigDirectories) {
  //       if (!isRequiredConfigDirectory(configDir)) {
  //         continue;
  //       }
  //       final Path confPath = new Path(solrHome, configDir);
  //       LOG.info("adding configdirectory" + confPath);
  //
  //       zipCount += zipDirectory(conf, zos, name, solrHome.toString(), confPath);
  //     }
  //     **/
  //   } catch (Throwable ohFoo) {
  //     LOG.error("packZipFile exception", ohFoo);
  //     if (ohFoo instanceof RuntimeException) {
  //       throw (RuntimeException) ohFoo;
  //     }
  //     if (ohFoo instanceof IOException) {
  //       throw (IOException) ohFoo;
  //     }
  //     throw new IOException(ohFoo);
  //
  //   } finally {
  //     if (zos != null) {
  //       if (zipCount == 0) { // If no entries were written, only close out, as
  //                            // the zip will throw an error
  //         LOG.error("No entries written to zip file " + perm);
  //         fs.delete(perm, false);
  //         // out.close();
  //       } else {
  //         LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm,
  //             temp));
  //         zos.close();
  //       }
  //     }
  //   }
  // }
  //
  // /**
  //  * Write a file to a zip output stream, removing leading path name components
  //  * from the actual file name when creating the zip file entry.
  //  *
  //  * The entry placed in the zip file is <code>baseName</code>/
  //  * <code>relativePath</code>, where <code>relativePath</code> is constructed
  //  * by removing a leading <code>root</code> from the path for
  //  * <code>itemToZip</code>.
  //  *
  //  * If <code>itemToZip</code> is an empty directory, it is ignored. If
  //  * <code>itemToZip</code> is a directory, the contents of the directory are
  //  * added recursively.
  //  *
  //  * @param zos The zip output stream
  //  * @param baseName The base name to use for the file name entry in the zip
  //  *        file
  //  * @param root The path to remove from <code>itemToZip</code> to make a
  //  *        relative path name
  //  * @param itemToZip The path to the file to be added to the zip file
  //  * @return the number of entries added
  //  * @throws IOException
  //  */
  // static public int zipDirectory(final Configuration conf,
  //     final ZipOutputStream zos, final String baseName, final String root,
  //     final Path itemToZip) throws IOException {
  //   LOG
  //       .info(String
  //           .format("zipDirectory: %s %s %s", baseName, root, itemToZip));
  //   LocalFileSystem localFs = FileSystem.getLocal(conf);
  //   int count = 0;
  //
  //   final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
  //   if (itemStatus.isDirectory()) {
  //     final FileStatus[] statai = localFs.listStatus(itemToZip);
  //
  //     // Add a directory entry to the zip file
  //     final String zipDirName = relativePathForZipEntry(itemToZip.toUri()
  //         .getPath(), baseName, root);
  //     final ZipEntry dirZipEntry = new ZipEntry(zipDirName
  //         + Path.SEPARATOR_CHAR);
  //     LOG.info(String.format("Adding directory %s to zip", zipDirName));
  //     zos.putNextEntry(dirZipEntry);
  //     zos.closeEntry();
  //     count++;
  //
  //     if (statai == null || statai.length == 0) {
  //       LOG.info(String.format("Skipping empty directory %s", itemToZip));
  //       return count;
  //     }
  //     for (FileStatus status : statai) {
  //       count += zipDirectory(conf, zos, baseName, root, status.getPath());
  //     }
  //     LOG.info(String.format("Wrote %d entries for directory %s", count,
  //         itemToZip));
  //     return count;
  //   }
  //
  //   final String inZipPath = relativePathForZipEntry(itemToZip.toUri()
  //       .getPath(), baseName, root);
  //
  //   if (inZipPath.length() == 0) {
  //     LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)",
  //         itemToZip, root, baseName));
  //     return 0;
  //   }
  //
  //   // Take empty files in case the place holder is needed
  //   FSDataInputStream in = null;
  //   try {
  //     in = localFs.open(itemToZip);
  //     final ZipEntry ze = new ZipEntry(inZipPath);
  //     ze.setTime(itemStatus.getModificationTime());
  //     // Comments confuse looking at the zip file
  //     // ze.setComment(itemToZip.toString());
  //     zos.putNextEntry(ze);
  //
  //     IOUtils.copyBytes(in, zos, conf, false);
  //     zos.closeEntry();
  //     LOG.info(String.format("Wrote %d entries for file %s", count, itemToZip));
  //     return 1;
  //   } finally {
  //     in.close();
  //   }
  //
  // }
  //
  // static String relativePathForZipEntry(final String rawPath,
  //     final String baseName, final String root) {
  //   String relativePath = rawPath.replaceFirst(Pattern.quote(root.toString()),
  //       "");
  //   LOG.info(String.format("RawPath %s, baseName %s, root %s, first %s",
  //       rawPath, baseName, root, relativePath));
  //
  //   if (relativePath.startsWith(Path.SEPARATOR)) {
  //     relativePath = relativePath.substring(1);
  //   }
  //   LOG.info(String.format(
  //       "RawPath %s, baseName %s, root %s, post leading slash %s", rawPath,
  //       baseName, root, relativePath));
  //   if (relativePath.isEmpty()) {
  //     LOG.warn(String.format(
  //         "No data after root (%s) removal from raw path %s", root, rawPath));
  //     return baseName;
  //   }
  //   // Construct the path that will be written to the zip file, including
  //   // removing any leading '/' characters
  //   String inZipPath = baseName + Path.SEPARATOR_CHAR + relativePath;
  //
  //   LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 1 %s",
  //       rawPath, baseName, root, inZipPath));
  //   if (inZipPath.startsWith(Path.SEPARATOR)) {
  //     inZipPath = inZipPath.substring(1);
  //   }
  //   LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 2 %s",
  //       rawPath, baseName, root, inZipPath));
  //
  //   return inZipPath;
  //
  // }
  //
  /*
  static boolean setLogLevel(String packageName, String level) {
    Log logger = LogFactory.getLog(packageName);
    if (logger == null) {
      return false;
    }
    // look for: org.apache.commons.logging.impl.SLF4JLocationAwareLog
    LOG.warn("logger class:"+logger.getClass().getName());
    if (logger instanceof Log4JLogger) {
      process(((Log4JLogger) logger).getLogger(), level);
      return true;
    }
    if (logger instanceof Jdk14Logger) {
      process(((Jdk14Logger) logger).getLogger(), level);
      return true;
    }
    return false;
  }

  public static void process(org.apache.log4j.Logger log, String level) {
    if (level != null) {
      log.setLevel(org.apache.log4j.Level.toLevel(level));
    }
  }

  public static void process(java.util.logging.Logger log, String level) {
    if (level != null) {
      log.setLevel(java.util.logging.Level.parse(level));
    }
  }
  */
}
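Because createEmbeddedSolrServer(...) is a public static helper, it can be exercised outside a live job, which is handy when debugging shard setup. A rough sketch under the assumption that an unpacked solr home (with a solrconfig.xml compatible with the HDFS directory factory the helper forces via system properties) and a writable shard directory already exist; both paths are placeholders:

package org.apache.solr.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class EmbeddedShardSmokeTest {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path solrHome = new Path("/tmp/solr-home-unpacked");  // placeholder
    Path shardDir = new Path("hdfs:///tmp/shard-00000");  // placeholder
    FileSystem fs = shardDir.getFileSystem(conf);

    EmbeddedSolrServer solr = SolrRecordWriter.createEmbeddedSolrServer(solrHome, fs, shardDir);
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("id", "smoke-1");
    solr.add(doc);   // index one document directly, bypassing the BatchWriter
    solr.commit();
    solr.close();
  }
}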
@@ -1,188 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.RetainMostRecentUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.UpdateConflictResolver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kitesdk.morphline.api.ExceptionHandler;
import org.kitesdk.morphline.base.FaultTolerance;
import com.google.common.base.Preconditions;

/**
 * This class loads the mapper's SolrInputDocuments into one EmbeddedSolrServer
 * per reducer. Each such reducer and Solr server can be seen as a (micro)
 * shard. The Solr servers store their data in HDFS.
 *
 * More specifically, this class consumes a list of <docId, SolrInputDocument>
 * pairs, sorted by docId, and sends them to an embedded Solr server to generate
 * a Solr index shard from the documents.
 */
public class SolrReducer extends Reducer<Text, SolrInputDocumentWritable, Text, SolrInputDocumentWritable> {

  private UpdateConflictResolver resolver;
  private HeartBeater heartBeater;
  private ExceptionHandler exceptionHandler;

  public static final String UPDATE_CONFLICT_RESOLVER = SolrReducer.class.getName() + ".updateConflictResolver";

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    verifyPartitionAssignment(context);
    SolrRecordWriter.addReducerContext(context);
    Class<? extends UpdateConflictResolver> resolverClass = context.getConfiguration().getClass(
        UPDATE_CONFLICT_RESOLVER, RetainMostRecentUpdateConflictResolver.class, UpdateConflictResolver.class);

    this.resolver = ReflectionUtils.newInstance(resolverClass, context.getConfiguration());
    /*
     * Note that ReflectionUtils.newInstance() above also implicitly calls
     * resolver.configure(context.getConfiguration()) if the resolver
     * implements org.apache.hadoop.conf.Configurable
     */

    this.exceptionHandler = new FaultTolerance(
        context.getConfiguration().getBoolean(FaultTolerance.IS_PRODUCTION_MODE, false),
        context.getConfiguration().getBoolean(FaultTolerance.IS_IGNORING_RECOVERABLE_EXCEPTIONS, false),
        context.getConfiguration().get(FaultTolerance.RECOVERABLE_EXCEPTION_CLASSES, SolrServerException.class.getName()));

    this.heartBeater = new HeartBeater(context);
  }

  @Override
  protected void reduce(Text key, Iterable<SolrInputDocumentWritable> values, Context context) throws IOException, InterruptedException {
    heartBeater.needHeartBeat();
    try {
      values = resolve(key, values, context);
      super.reduce(key, values, context);
    } catch (Exception e) {
      LOG.error("Unable to process key " + key, e);
      context.getCounter(getClass().getName() + ".errors", e.getClass().getName()).increment(1);
      exceptionHandler.handleException(e, null);
    } finally {
      heartBeater.cancelHeartBeat();
    }
  }

  private Iterable<SolrInputDocumentWritable> resolve(
      final Text key, final Iterable<SolrInputDocumentWritable> values, final Context context) {

    if (resolver instanceof NoChangeUpdateConflictResolver) {
      return values; // fast path
    }
    return new Iterable<SolrInputDocumentWritable>() {
      @Override
      public Iterator<SolrInputDocumentWritable> iterator() {
        return new WrapIterator(resolver.orderUpdates(key, new UnwrapIterator(values.iterator()), context));
      }
    };
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    heartBeater.close();
    super.cleanup(context);
  }

  /*
   * Verify that if a mapper's partitioner sends an item to partition X it implies that said item
   * is sent to the reducer with taskID == X. This invariant is currently required for Solr
   * documents to end up in the right Solr shard.
   */
  private void verifyPartitionAssignment(Context context) {
    if ("true".equals(System.getProperty("verifyPartitionAssignment", "true"))) {
      String partitionStr = context.getConfiguration().get("mapred.task.partition");
      if (partitionStr == null) {
        partitionStr = context.getConfiguration().get("mapreduce.task.partition");
      }
      int partition = Integer.parseInt(partitionStr);
      int taskId = context.getTaskAttemptID().getTaskID().getId();
      Preconditions.checkArgument(partition == taskId,
          "mapred.task.partition: " + partition + " not equal to reducer taskId: " + taskId);
    }
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class WrapIterator implements Iterator<SolrInputDocumentWritable> {

    private Iterator<SolrInputDocument> parent;

    private WrapIterator(Iterator<SolrInputDocument> parent) {
      this.parent = parent;
    }

    @Override
    public boolean hasNext() {
      return parent.hasNext();
    }

    @Override
    public SolrInputDocumentWritable next() {
      return new SolrInputDocumentWritable(parent.next());
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class UnwrapIterator implements Iterator<SolrInputDocument> {

    private Iterator<SolrInputDocumentWritable> parent;

    private UnwrapIterator(Iterator<SolrInputDocumentWritable> parent) {
      this.parent = parent;
    }

    @Override
    public boolean hasNext() {
      return parent.hasNext();
    }

    @Override
    public SolrInputDocument next() {
      return parent.next().getSolrInputDocument();
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

}
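The conflict resolver is pluggable purely through configuration. For example, a driver that prefers the cheap pass-through behavior over the default retain-most-recent policy could set, inside its job setup (a sketch, using only classes already referenced above; `job` is assumed to be the driver's Job instance):

// Replace the default RetainMostRecentUpdateConflictResolver with the no-op resolver,
// which lets resolve() take its fast path and skip the wrap/unwrap iteration entirely.
job.getConfiguration().setClass(SolrReducer.UPDATE_CONFLICT_RESOLVER,
    NoChangeUpdateConflictResolver.class, UpdateConflictResolver.class);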
@@ -1,90 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.helper.ASCIITextWidthCounter;
import net.sourceforge.argparse4j.helper.TextHelper;

import org.apache.hadoop.util.ToolRunner;

/**
 * Nicely formats the output of
 * {@link ToolRunner#printGenericCommandUsage(PrintStream)} with the same look and feel that argparse4j uses for help text.
 */
class ToolRunnerHelpFormatter {

  public static String getGenericCommandUsage() {
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    String msg;
    try {
      ToolRunner.printGenericCommandUsage(new PrintStream(bout, true, "UTF-8"));
      msg = new String(bout.toByteArray(), StandardCharsets.UTF_8);
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e); // unreachable
    }

    BufferedReader reader = new BufferedReader(new StringReader(msg));
    StringBuilder result = new StringBuilder();
    while (true) {
      String line;
      try {
        line = reader.readLine();
      } catch (IOException e) {
        throw new RuntimeException(e); // unreachable
      }

      if (line == null) {
        return result.toString(); // EOS
      }

      if (!line.startsWith("-")) {
        result.append(line + "\n");
      } else {
        line = line.trim();
        int i = line.indexOf(" ");
        if (i < 0) {
          i = line.indexOf('\t');
        }
        if (i < 0) {
          result.append(line + "\n");
        } else {
          String title = line.substring(0, i).trim();
          if (title.length() >= 3 && Character.isLetterOrDigit(title.charAt(1)) && Character.isLetterOrDigit(title.charAt(2))) {
            title = "-" + title; // prefer "--libjars" long arg style over "-libjars" style but retain "-D foo" short arg style
          }
          String help = line.substring(i, line.length()).trim();
          StringWriter strWriter = new StringWriter();
          PrintWriter writer = new PrintWriter(strWriter, true);
          TextHelper.printHelp(writer, title, help, new ASCIITextWidthCounter(), ArgumentParsers.getFormatWidth());
          result.append(strWriter.toString());
        }
      }
    }
  }
}
@@ -1,46 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * For the meat see {@link TreeMergeOutputFormat}.
 */
public class TreeMergeMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public static final String MAX_SEGMENTS_ON_TREE_MERGE = "maxSegmentsOnTreeMerge";

  public static final String SOLR_SHARD_NUMBER = "_solrShardNumber";

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    LOGGER.trace("map key: {}, value: {}", key, value);
    context.write(value, NullWritable.get());
  }

}
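This mapper simply forwards each input line, one HDFS shard index directory per line, so that TreeMergeOutputFormat can do the actual merging; the driver feeds it via NLineInputFormat with the fanout as the lines-per-split (see the comment in TreeMergeOutputFormat below). An input file for it would plausibly look like this, with invented paths, each ending in the part-m-NNNNN/data/index layout that writeShardNumberFile() expects:

hdfs://nn/outputs/partial/part-m-00000/data/index
hdfs://nn/outputs/partial/part-m-00001/data/index
hdfs://nn/outputs/partial/part-m-00002/data/index
hdfs://nn/outputs/partial/part-m-00003/data/index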
@@ -1,201 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import com.google.common.base.Preconditions;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.misc.IndexMergeTool;
import org.apache.lucene.store.Directory;
import org.apache.solr.store.hdfs.HdfsDirectory;
import org.apache.solr.update.SolrIndexWriter;
import org.apache.solr.util.RTimer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * See {@link IndexMergeTool}.
 */
public class TreeMergeOutputFormat extends FileOutputFormat<Text, NullWritable> {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
    Utils.getLogConfigFile(context.getConfiguration());
    Path workDir = getDefaultWorkFile(context, "");
    return new TreeMergeRecordWriter(context, workDir);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class TreeMergeRecordWriter extends RecordWriter<Text,NullWritable> {

    private final Path workDir;
    private final List<Path> shards = new ArrayList<>();
    private final HeartBeater heartBeater;
    private final TaskAttemptContext context;

    private static final Logger LOG = log;

    public TreeMergeRecordWriter(TaskAttemptContext context, Path workDir) {
      this.workDir = new Path(workDir, "data/index");
      this.heartBeater = new HeartBeater(context);
      this.context = context;
    }

    @Override
    public void write(Text key, NullWritable value) {
      LOG.info("map key: {}", key);
      heartBeater.needHeartBeat();
      try {
        Path path = new Path(key.toString());
        shards.add(path);
      } finally {
        heartBeater.cancelHeartBeat();
      }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException {
      LOG.debug("Task " + context.getTaskAttemptID() + " merging into dstDir: " + workDir + ", srcDirs: " + shards);
      writeShardNumberFile(context);
      heartBeater.needHeartBeat();
      try {
        Directory mergedIndex = new HdfsDirectory(workDir, context.getConfiguration());

        // TODO: shouldn't we pull the Version from the solrconfig.xml?
        IndexWriterConfig writerConfig = new IndexWriterConfig(null)
            .setOpenMode(OpenMode.CREATE).setUseCompoundFile(false)
            //.setMergePolicy(mergePolicy) // TODO: grab tuned MergePolicy from solrconfig.xml?
            //.setMergeScheduler(...) // TODO: grab tuned MergeScheduler from solrconfig.xml?
            ;

        if (LOG.isDebugEnabled()) {
          writerConfig.setInfoStream(System.out);
        }
        // writerConfig.setRAMBufferSizeMB(100); // improve performance
        // writerConfig.setMaxThreadStates(1);

        // disable compound file to improve performance
        // also see http://lucene.472066.n3.nabble.com/Questions-on-compound-file-format-td489105.html
        // also see defaults in SolrIndexConfig
        MergePolicy mergePolicy = writerConfig.getMergePolicy();
        LOG.debug("mergePolicy was: {}", mergePolicy);
        if (mergePolicy instanceof TieredMergePolicy) {
          ((TieredMergePolicy) mergePolicy).setNoCFSRatio(0.0);
          // ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnceExplicit(10000);
          // ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnce(10000);
          // ((TieredMergePolicy) mergePolicy).setSegmentsPerTier(10000);
        } else if (mergePolicy instanceof LogMergePolicy) {
          ((LogMergePolicy) mergePolicy).setNoCFSRatio(0.0);
        }
        LOG.info("Using mergePolicy: {}", mergePolicy);

        IndexWriter writer = new IndexWriter(mergedIndex, writerConfig);

        Directory[] indexes = new Directory[shards.size()];
        for (int i = 0; i < shards.size(); i++) {
          indexes[i] = new HdfsDirectory(shards.get(i), context.getConfiguration());
        }

        context.setStatus("Logically merging " + shards.size() + " shards into one shard");
        LOG.info("Logically merging " + shards.size() + " shards into one shard: " + workDir);
        RTimer timer = new RTimer();

        writer.addIndexes(indexes);
        // TODO: avoid intermediate copying of files into dst directory; rename the files into the dir instead (cp -> rename)
        // This can improve performance and turns this phase into a true "logical" merge, completing in constant time.
        // See https://issues.apache.org/jira/browse/LUCENE-4746

        timer.stop();
        if (LOG.isDebugEnabled()) {
          context.getCounter(SolrCounters.class.getName(), SolrCounters.LOGICAL_TREE_MERGE_TIME.toString()).increment((long) timer.getTime());
        }
        LOG.info("Logical merge took {}ms", timer.getTime());
        int maxSegments = context.getConfiguration().getInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, Integer.MAX_VALUE);
        context.setStatus("Optimizing Solr: forcing mtree merge down to " + maxSegments + " segments");
        LOG.info("Optimizing Solr: forcing tree merge down to {} segments", maxSegments);
        timer = new RTimer();
        if (maxSegments < Integer.MAX_VALUE) {
          writer.forceMerge(maxSegments);
          // TODO: consider perf enhancement for no-deletes merges: bulk-copy the postings data
          // see http://lucene.472066.n3.nabble.com/Experience-with-large-merge-factors-tp1637832p1647046.html
        }
        timer.stop();
        if (LOG.isDebugEnabled()) {
          context.getCounter(SolrCounters.class.getName(), SolrCounters.PHYSICAL_TREE_MERGE_TIME.toString()).increment((long) timer.getTime());
        }
        LOG.info("Optimizing Solr: done forcing tree merge down to {} segments in {}ms", maxSegments, timer.getTime());

        // Set Solr's commit data so the created index is usable by SolrCloud. E.g. Currently SolrCloud relies on
        // commitTimeMSec in the commit data to do replication.
        //TODO no commitUpdateCommand
        SolrIndexWriter.setCommitData(writer, -1);

        timer = new RTimer();
        LOG.info("Optimizing Solr: Closing index writer");
        writer.close();
        LOG.info("Optimizing Solr: Done closing index writer in {}ms", timer.getTime());
        context.setStatus("Done");
      } finally {
        heartBeater.cancelHeartBeat();
        heartBeater.close();
      }
    }

    /*
     * For background see MapReduceIndexerTool.renameTreeMergeShardDirs()
     *
     * Also see MapReduceIndexerTool.run() method where it uses
     * NLineInputFormat.setNumLinesPerSplit(job, options.fanout)
     */
    private void writeShardNumberFile(TaskAttemptContext context) throws IOException {
      Preconditions.checkArgument(shards.size() > 0);
      String shard = shards.get(0).getParent().getParent().getName(); // move up from "data/index"
      String taskId = shard.substring("part-m-".length(), shard.length()); // e.g. part-m-00001
      int taskNum = Integer.parseInt(taskId);
      int outputShardNum = taskNum / shards.size();
      LOG.debug("Merging into outputShardNum: " + outputShardNum + " from taskId: " + taskId);
      Path shardNumberFile = new Path(workDir.getParent().getParent(), TreeMergeMapper.SOLR_SHARD_NUMBER);
      OutputStream out = shardNumberFile.getFileSystem(context.getConfiguration()).create(shardNumberFile);
      Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);
      writer.write(String.valueOf(outputShardNum));
      writer.flush();
      writer.close();
    }
  }
}
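A quick worked example of the shard-number arithmetic in writeShardNumberFile(): with a fanout of 4, the merge task whose first input shard lives under part-m-00008 sees shards.size() == 4, extracts taskId "00008" so taskNum = 8, and computes outputShardNum = 8 / 4 = 2; all four of its inputs collapse into output shard 2. The numbers here are illustrative; the invariant itself comes from how MapReduceIndexerTool lays out the tree-merge input lines in contiguous fanout-sized groups.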
@@ -1,114 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class UnbufferedDataInputInputStream extends org.apache.solr.common.util.DataInputInputStream {
  private final DataInputStream in;

  public UnbufferedDataInputInputStream(DataInput in) {
    this.in = new DataInputStream(DataInputInputStream.constructInputStream(in));
  }

  @Override
  public void readFully(byte[] b) throws IOException {
    in.readFully(b);
  }

  @Override
  public void readFully(byte[] b, int off, int len) throws IOException {
    in.readFully(b, off, len);
  }

  @Override
  public int skipBytes(int n) throws IOException {
    return in.skipBytes(n);
  }

  @Override
  public boolean readBoolean() throws IOException {
    return in.readBoolean();
  }

  @Override
  public byte readByte() throws IOException {
    return in.readByte();
  }

  @Override
  public int readUnsignedByte() throws IOException {
    return in.readUnsignedByte();
  }

  @Override
  public short readShort() throws IOException {
    return in.readShort();
  }

  @Override
  public int readUnsignedShort() throws IOException {
    return in.readUnsignedShort();
  }

  @Override
  public char readChar() throws IOException {
    return in.readChar();
  }

  @Override
  public int readInt() throws IOException {
    return in.readInt();
  }

  @Override
  public long readLong() throws IOException {
    return in.readLong();
  }

  @Override
  public float readFloat() throws IOException {
    return in.readFloat();
  }

  @Override
  public double readDouble() throws IOException {
    return in.readDouble();
  }

  @Override
  public String readLine() throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    return reader.readLine();
  }

  @Override
  public String readUTF() throws IOException {
    return in.readUTF();
  }

  @Override
  public int read() throws IOException {
    return in.read();
  }

}
@@ -1,59 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.PropertyConfigurator;

import com.google.common.annotations.Beta;
import org.apache.solr.common.util.SuppressForbidden;


@Beta
public final class Utils {

  private static final String LOG_CONFIG_FILE = "hadoop.log4j.configuration";

  public static void setLogConfigFile(File file, Configuration conf) {
    conf.set(LOG_CONFIG_FILE, file.getName());
  }

  public static void getLogConfigFile(Configuration conf) {
    String log4jPropertiesFile = conf.get(LOG_CONFIG_FILE);
    configureLog4jProperties(log4jPropertiesFile);
  }

  @SuppressForbidden(reason = "method is specific to log4j")
  public static void configureLog4jProperties(String log4jPropertiesFile) {
    if (log4jPropertiesFile != null) {
      PropertyConfigurator.configure(log4jPropertiesFile);
    }
  }

  public static String getShortClassName(Class<?> clazz) {
    return getShortClassName(clazz.getName());
  }

  public static String getShortClassName(String className) {
    int i = className.lastIndexOf('.'); // regular class
    int j = className.lastIndexOf('$'); // inner class
    return className.substring(1 + Math.max(i, j));
  }

}
|
|
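
A quick illustration of the short-name logic above (strip everything up to the last '.' or '$'), with hypothetical inputs; this is a sketch of the expected behavior, not part of the removed code:

    // Illustrative only; results follow from the substring logic above.
    Utils.getShortClassName("org.apache.solr.hadoop.Utils");  // returns "Utils"
    Utils.getShortClassName("org.example.Outer$Inner");       // returns "Inner"
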
@@ -1,213 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.Aliases;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkConfigManager;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.StrUtils;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts SolrCloud information from ZooKeeper.
 */
final class ZooKeeperInspector {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public List<List<String>> extractShardUrls(String zkHost, String collection) {

    DocCollection docCollection = extractDocCollection(zkHost, collection);
    List<Slice> slices = getSortedSlices(docCollection.getSlices());
    List<List<String>> solrUrls = new ArrayList<>(slices.size());
    for (Slice slice : slices) {
      if (slice.getLeader() == null) {
        throw new IllegalArgumentException("Cannot find SolrCloud slice leader. " +
            "It looks like not all of your shards are registered in ZooKeeper yet");
      }
      Collection<Replica> replicas = slice.getReplicas();
      List<String> urls = new ArrayList<>(replicas.size());
      for (Replica replica : replicas) {
        ZkCoreNodeProps props = new ZkCoreNodeProps(replica);
        urls.add(props.getCoreUrl());
      }
      solrUrls.add(urls);
    }
    return solrUrls;
  }

  public DocCollection extractDocCollection(String zkHost, String collection) {
    if (collection == null) {
      throw new IllegalArgumentException("collection must not be null");
    }
    SolrZkClient zkClient = getZkClient(zkHost);

    try (ZkStateReader zkStateReader = new ZkStateReader(zkClient)) {
      try {
        // first check for alias
        collection = checkForAlias(zkClient, collection);
        zkStateReader.createClusterStateWatchersAndUpdate();
      } catch (Exception e) {
        throw new IllegalArgumentException("Cannot find expected information for SolrCloud in ZooKeeper: " + zkHost, e);
      }

      try {
        return zkStateReader.getClusterState().getCollection(collection);
      } catch (SolrException e) {
        throw new IllegalArgumentException("Cannot find collection '" + collection + "' in ZooKeeper: " + zkHost, e);
      }
    } finally {
      zkClient.close();
    }
  }

  public SolrZkClient getZkClient(String zkHost) {
    if (zkHost == null) {
      throw new IllegalArgumentException("zkHost must not be null");
    }

    SolrZkClient zkClient;
    try {
      zkClient = new SolrZkClient(zkHost, 30000);
    } catch (Exception e) {
      throw new IllegalArgumentException("Cannot connect to ZooKeeper: " + zkHost, e);
    }
    return zkClient;
  }

  public List<Slice> getSortedSlices(Collection<Slice> slices) {
    List<Slice> sorted = new ArrayList(slices);
    Collections.sort(sorted, (slice1, slice2) -> {
      Comparator c = new AlphaNumericComparator();
      return c.compare(slice1.getName(), slice2.getName());
    });
    LOG.trace("Sorted slices: {}", sorted);
    return sorted;
  }

  /**
   * Returns config value given collection name
   * Borrowed heavily from Solr's ZKController.
   */
  public String readConfigName(SolrZkClient zkClient, String collection)
      throws KeeperException, InterruptedException {
    if (collection == null) {
      throw new IllegalArgumentException("collection must not be null");
    }
    String configName = null;

    // first check for alias
    collection = checkForAlias(zkClient, collection);

    String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
    if (LOG.isInfoEnabled()) {
      LOG.info("Load collection config from:" + path);
    }
    byte[] data = zkClient.getData(path, null, null, true);

    if(data != null) {
      ZkNodeProps props = ZkNodeProps.load(data);
      configName = props.getStr(ZkController.CONFIGNAME_PROP);
    }

    if (configName != null && !zkClient.exists(ZkConfigManager.CONFIGS_ZKNODE + "/" + configName, true)) {
      LOG.error("Specified config does not exist in ZooKeeper:" + configName);
      throw new IllegalArgumentException("Specified config does not exist in ZooKeeper:"
          + configName);
    }

    return configName;
  }

  private String checkForAlias(SolrZkClient zkClient, String collection)
      throws KeeperException, InterruptedException {
    byte[] aliasData = zkClient.getData(ZkStateReader.ALIASES, null, null, true);
    Aliases aliases = ClusterState.load(aliasData);
    String alias = aliases.getCollectionAlias(collection);
    if (alias != null) {
      List<String> aliasList = StrUtils.splitSmart(alias, ",", true);
      if (aliasList.size() > 1) {
        throw new IllegalArgumentException("collection cannot be an alias that maps to multiple collections");
      }
      collection = aliasList.get(0);
    }
    return collection;
  }

  /**
   * Download and return the config directory from ZK
   */
  public File downloadConfigDir(SolrZkClient zkClient, String configName)
      throws IOException, InterruptedException, KeeperException {
    File dir = Files.createTempDir();
    dir.deleteOnExit();
    ZkConfigManager configManager = new ZkConfigManager(zkClient);
    configManager.downloadConfigDir(configName, dir.toPath());
    File confDir = new File(dir, "conf");
    if (!confDir.isDirectory()) {
      // create a temporary directory with "conf" subdir and mv the config in there.  This is
      // necessary because of CDH-11188; solrctl does not generate nor accept directories with e.g.
      // conf/solrconfig.xml which is necessary for proper solr operation.  This should work
      // even if solrctl changes.
      confDir = new File(Files.createTempDir().getAbsolutePath(), "conf");
      confDir.getParentFile().deleteOnExit();
      Files.move(dir, confDir);
      dir = confDir.getParentFile();
    }
    FileUtils.writeStringToFile(new File(dir, "solr.xml"), "<solr><solrcloud></solrcloud></solr>", "UTF-8");
    verifyConfigDir(confDir);
    return dir;
  }

  private void verifyConfigDir(File confDir) throws IOException {
    File solrConfigFile = new File(confDir, "solrconfig.xml");
    if (!solrConfigFile.exists()) {
      throw new IOException("Detected invalid Solr config dir in ZooKeeper - Reason: File not found: "
          + solrConfigFile.getName());
    }
    if (!solrConfigFile.isFile()) {
      throw new IOException("Detected invalid Solr config dir in ZooKeeper - Reason: Not a file: "
          + solrConfigFile.getName());
    }
    if (!solrConfigFile.canRead()) {
      throw new IOException("Insufficient permissions to read file: " + solrConfigFile);
    }
  }

}
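
A hedged usage sketch for the inspector above; the ZooKeeper address and collection name are placeholders, and since the class is package-private a real caller would sit in org.apache.solr.hadoop:

    // Sketch only: list replica core URLs per shard, assuming a reachable SolrCloud cluster.
    ZooKeeperInspector inspector = new ZooKeeperInspector();
    List<List<String>> shardUrls = inspector.extractShardUrls("127.0.0.1:2181/solr", "collection1");
    for (List<String> replicaUrls : shardUrls) {
      System.out.println("shard replicas: " + replicaUrls);
    }
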
@@ -1,36 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;

/**
 * UpdateConflictResolver implementation that returns the solr documents in the
 * same order as they are received on input, i.e. without change in order.
 */
public final class NoChangeUpdateConflictResolver implements UpdateConflictResolver {

  @Override
  public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
    return updates;
  }

}
@@ -1,48 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.util.Collections;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;

/**
 * UpdateConflictResolver implementation that rejects multiple documents with
 * the same key with an exception.
 */
public final class RejectingUpdateConflictResolver implements UpdateConflictResolver {

  @Override
  public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
    SolrInputDocument firstUpdate = null;
    while (updates.hasNext()) {
      if (firstUpdate == null) {
        firstUpdate = updates.next();
        assert firstUpdate != null;
      } else {
        throw new IllegalArgumentException("Update conflict! Documents with the same unique key are forbidden: "
            + key);
      }
    }
    assert firstUpdate != null;
    return Collections.singletonList(firstUpdate).iterator();
  }

}
@@ -1,114 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.HdfsFileFieldNames;
import org.apache.solr.hadoop.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * UpdateConflictResolver implementation that ignores all but the most recent
 * document version, based on a configurable numeric Solr field, which defaults
 * to the file_last_modified timestamp.
 */
public class RetainMostRecentUpdateConflictResolver implements UpdateConflictResolver, Configurable {

  private Configuration conf;
  private String orderByFieldName = ORDER_BY_FIELD_NAME_DEFAULT;

  public static final String ORDER_BY_FIELD_NAME_KEY =
      RetainMostRecentUpdateConflictResolver.class.getName() + ".orderByFieldName";

  public static final String ORDER_BY_FIELD_NAME_DEFAULT = HdfsFileFieldNames.FILE_LAST_MODIFIED;

  public static final String COUNTER_GROUP = Utils.getShortClassName(RetainMostRecentUpdateConflictResolver.class);
  public static final String DUPLICATES_COUNTER_NAME = "Number of documents ignored as duplicates";
  public static final String OUTDATED_COUNTER_NAME = "Number of documents ignored as outdated";

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.orderByFieldName = conf.get(ORDER_BY_FIELD_NAME_KEY, orderByFieldName);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  protected String getOrderByFieldName() {
    return orderByFieldName;
  }

  @Override
  public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
    return getMaximum(updates, getOrderByFieldName(), new SolrInputDocumentComparator.TimeStampComparator(), ctx);
  }

  /** Returns the most recent document among the colliding updates */
  protected Iterator<SolrInputDocument> getMaximum(Iterator<SolrInputDocument> updates, String fieldName,
      Comparator child, Context context) {

    SolrInputDocumentComparator comp = new SolrInputDocumentComparator(fieldName, child);
    SolrInputDocument max = null;
    long numDupes = 0;
    long numOutdated = 0;
    while (updates.hasNext()) {
      SolrInputDocument next = updates.next();
      assert next != null;
      if (max == null) {
        max = next;
      } else {
        int c = comp.compare(next, max);
        if (c == 0) {
          LOG.debug("Ignoring document version because it is a duplicate: {}", next);
          numDupes++;
        } else if (c > 0) {
          LOG.debug("Ignoring document version because it is outdated: {}", max);
          max = next;
          numOutdated++;
        } else {
          LOG.debug("Ignoring document version because it is outdated: {}", next);
          numOutdated++;
        }
      }
    }

    assert max != null;
    if (numDupes > 0) {
      context.getCounter(COUNTER_GROUP, DUPLICATES_COUNTER_NAME).increment(numDupes);
    }
    if (numOutdated > 0) {
      context.getCounter(COUNTER_GROUP, OUTDATED_COUNTER_NAME).increment(numOutdated);
    }
    return Collections.singletonList(max).iterator();
  }

}
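
A hedged sketch of overriding the resolver's order-by field through the Hadoop configuration; the key and the default field name come from the class above, while the wiring itself is illustrative:

    // Illustrative wiring only; the key constant is defined in the class above.
    Configuration conf = new Configuration();
    conf.set(RetainMostRecentUpdateConflictResolver.ORDER_BY_FIELD_NAME_KEY, "file_last_modified");
    RetainMostRecentUpdateConflictResolver resolver = new RetainMostRecentUpdateConflictResolver();
    resolver.setConf(conf); // resolver now orders colliding updates by the configured field
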
@@ -1,84 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.util.Comparator;

import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;

/**
 * Default mechanism of determining which of two Solr documents with the same
 * key is the more recent version.
 */
public final class SolrInputDocumentComparator implements Comparator<SolrInputDocument> {

  private Comparator child;
  private String fieldName;

  SolrInputDocumentComparator(String fieldName, Comparator child) {
    this.child = child;
    this.fieldName = fieldName;
  }

  @Override
  public int compare(SolrInputDocument doc1, SolrInputDocument doc2) {
    SolrInputField f1 = doc1.getField(fieldName);
    SolrInputField f2 = doc2.getField(fieldName);
    if (f1 == f2) {
      return 0;
    } else if (f1 == null) {
      return -1;
    } else if (f2 == null) {
      return 1;
    }

    Object v1 = f1.getFirstValue();
    Object v2 = f2.getFirstValue();
    return child.compare(v1, v2);
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  public static final class TimeStampComparator implements Comparator {

    @Override
    public int compare(Object v1, Object v2) {
      if (v1 == v2) {
        return 0;
      } else if (v1 == null) {
        return -1;
      } else if (v2 == null) {
        return 1;
      }
      long t1 = getLong(v1);
      long t2 = getLong(v2);
      return (t1 < t2 ? -1 : (t1==t2 ? 0 : 1));
    }

    private long getLong(Object v) {
      if (v instanceof Long) {
        return ((Long) v).longValue();
      } else {
        return Long.parseLong(v.toString());
      }
    }

  }

}
@@ -1,79 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.HdfsFileFieldNames;

/**
 * UpdateConflictResolver implementation that orders colliding updates ascending
 * from least recent to most recent (partial) update, based on a configurable
 * numeric Solr field, which defaults to the file_last_modified timestamp.
 */
public class SortingUpdateConflictResolver implements UpdateConflictResolver, Configurable {

  private Configuration conf;
  private String orderByFieldName = ORDER_BY_FIELD_NAME_DEFAULT;

  public static final String ORDER_BY_FIELD_NAME_KEY =
      SortingUpdateConflictResolver.class.getName() + ".orderByFieldName";

  public static final String ORDER_BY_FIELD_NAME_DEFAULT = HdfsFileFieldNames.FILE_LAST_MODIFIED;

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.orderByFieldName = conf.get(ORDER_BY_FIELD_NAME_KEY, orderByFieldName);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  protected String getOrderByFieldName() {
    return orderByFieldName;
  }

  @Override
  public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
    return sort(updates, getOrderByFieldName(), new SolrInputDocumentComparator.TimeStampComparator());
  }

  protected Iterator<SolrInputDocument> sort(Iterator<SolrInputDocument> updates, String fieldName, Comparator child) {
    // TODO: use an external merge sort in the pathological case where there are a huge amount of collisions
    List<SolrInputDocument> sortedUpdates = new ArrayList(1);
    while (updates.hasNext()) {
      sortedUpdates.add(updates.next());
    }
    if (sortedUpdates.size() > 1) { // conflicts are rare
      Collections.sort(sortedUpdates, new SolrInputDocumentComparator(fieldName, child));
    }
    return sortedUpdates.iterator();
  }

}
@@ -1,71 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.dedup;

import java.util.Iterator;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;

/**
 * Interface that enables deduplication and ordering of a series of document
 * updates for the same unique document key.
 *
 * For example, a MapReduce batch job might index multiple files in the same job
 * where some of the files contain old and new versions of the very same
 * document, using the same unique document key.
 *
 * Typically, implementations of this interface forbid collisions by throwing an
 * exception, or ignore all but the most recent document version, or, in the
 * general case, order colliding updates ascending from least recent to most
 * recent (partial) update.
 *
 * The caller of this interface (i.e. the Hadoop Reducer) will then apply the
 * updates to Solr in the order returned by the orderUpdates() method.
 *
 * Configuration: If an UpdateConflictResolver implementation also implements
 * {@link Configurable} then the Hadoop Reducer will call
 * {@link Configurable#setConf(org.apache.hadoop.conf.Configuration)} on
 * instance construction and pass the standard Hadoop configuration information.
 */
public interface UpdateConflictResolver {

  /**
   * Given a list of all colliding document updates for the same unique document
   * key, this method returns zero or more documents in an application specific
   * order.
   *
   * The caller will then apply the updates for this key to Solr in the order
   * returned by the orderUpdate() method.
   *
   * @param uniqueKey
   *          the document key common to all collidingUpdates mentioned below
   * @param collidingUpdates
   *          all updates in the MapReduce job that have a key equal to
   *          {@code uniqueKey} mentioned above. The input order is unspecified.
   * @param context
   *          The <code>Context</code> passed from the {@link Reducer}
   *          implementations.
   * @return the order in which the updates shall be applied to Solr
   */
  Iterator<SolrInputDocument> orderUpdates(
      Text uniqueKey, Iterator<SolrInputDocument> collidingUpdates, Context context);

}
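
A minimal sketch of a custom resolver against the interface above; the class name and its first-wins policy are purely illustrative, not part of the removed code (the shipped variants are the three implementations shown earlier):

    package org.apache.solr.hadoop.dedup;

    import java.util.Collections;
    import java.util.Iterator;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer.Context;
    import org.apache.solr.common.SolrInputDocument;

    /** Illustrative only: keeps the first update seen per key (hypothetical policy). */
    public final class FirstWinsUpdateConflictResolver implements UpdateConflictResolver {

      @Override
      public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
        if (!updates.hasNext()) {
          return Collections.<SolrInputDocument>emptyIterator();
        }
        // drop everything after the first colliding update for this key
        return Collections.singletonList(updates.next()).iterator();
      }

    }
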
@@ -1,25 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Dedupe related code.
 */
package org.apache.solr.hadoop.dedup;
@@ -1,47 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.morphline;

import org.apache.solr.hadoop.Utils;

public enum MorphlineCounters {

  FILES_READ (getClassName(MorphlineMapper.class) + ": Number of files read"),

  FILE_BYTES_READ (getClassName(MorphlineMapper.class) + ": Number of file bytes read"),

  DOCS_READ (getClassName(MorphlineMapper.class) + ": Number of documents read"),

  PARSER_OUTPUT_BYTES (getClassName(MorphlineMapper.class) + ": Number of document bytes generated by Tika parser"),

  ERRORS (getClassName(MorphlineMapper.class) + ": Number of errors");

  private final String label;

  private MorphlineCounters(String label) {
    this.label = label;
  }

  public String toString() {
    return label;
  }

  private static String getClassName(Class clazz) {
    return Utils.getShortClassName(clazz);
  }

}
@@ -1,268 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.morphline;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.hadoop.HdfsFileFieldNames;
import org.apache.solr.hadoop.PathParts;
import org.apache.solr.hadoop.Utils;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.morphlines.solr.SolrMorphlineContext;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Compiler;
import org.kitesdk.morphline.base.FaultTolerance;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Metrics;
import org.kitesdk.morphline.base.Notifications;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.annotations.Beta;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

/**
 * Internal helper for {@link MorphlineMapper} and dryRun mode; This API is for *INTERNAL* use only
 * and should not be considered public.
 */
@Beta
public final class MorphlineMapRunner {

  private MorphlineContext morphlineContext;
  private Command morphline;
  private IndexSchema schema;
  private Map<String, String> commandLineMorphlineHeaders;
  private boolean disableFileOpen;
  private String morphlineFileAndId;
  private final Timer elapsedTime;

  public static final String MORPHLINE_FILE_PARAM = "morphlineFile";
  public static final String MORPHLINE_ID_PARAM = "morphlineId";

  /**
   * Morphline variables can be passed from the CLI to the Morphline, e.g.:
   * hadoop ... -D morphlineVariable.zkHost=127.0.0.1:2181/solr
   */
  public static final String MORPHLINE_VARIABLE_PARAM = "morphlineVariable";

  /**
   * Headers, including MIME types, can also explicitly be passed by force from the CLI to Morphline, e.g:
   * hadoop ... -D morphlineField._attachment_mimetype=text/csv
   */
  public static final String MORPHLINE_FIELD_PREFIX = "morphlineField.";

  /**
   * Flag to disable reading of file contents if indexing just file metadata is sufficient.
   * This improves performance and confidentiality.
   */
  public static final String DISABLE_FILE_OPEN = "morphlineDisableFileOpen";

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  MorphlineContext getMorphlineContext() {
    return morphlineContext;
  }

  IndexSchema getSchema() {
    return schema;
  }

  public MorphlineMapRunner(Configuration configuration, DocumentLoader loader, String solrHomeDir) throws IOException {
    if (LOG.isTraceEnabled()) {
      LOG.trace("CWD is {}", new File(".").getCanonicalPath());
      TreeMap map = new TreeMap();
      for (Map.Entry<String,String> entry : configuration) {
        map.put(entry.getKey(), entry.getValue());
      }
      LOG.trace("Configuration:\n" +
          map.entrySet().stream().map(Object::toString).collect(Collectors.joining("\n")));
    }

    FaultTolerance faultTolerance = new FaultTolerance(
        configuration.getBoolean(FaultTolerance.IS_PRODUCTION_MODE, false),
        configuration.getBoolean(FaultTolerance.IS_IGNORING_RECOVERABLE_EXCEPTIONS, false),
        configuration.get(FaultTolerance.RECOVERABLE_EXCEPTION_CLASSES, SolrServerException.class.getName())
        );

    morphlineContext = new SolrMorphlineContext.Builder()
        .setDocumentLoader(loader)
        .setExceptionHandler(faultTolerance)
        .setMetricRegistry(new MetricRegistry())
        .build();

    class MySolrLocator extends SolrLocator { // trick to access protected ctor
      public MySolrLocator(MorphlineContext ctx) {
        super(ctx);
      }
    }

    SolrLocator locator = new MySolrLocator(morphlineContext);
    locator.setSolrHomeDir(solrHomeDir);
    schema = locator.getIndexSchema();

    // rebuild context, now with schema
    morphlineContext = new SolrMorphlineContext.Builder()
        .setIndexSchema(schema)
        .setDocumentLoader(loader)
        .setExceptionHandler(faultTolerance)
        .setMetricRegistry(morphlineContext.getMetricRegistry())
        .build();

    String morphlineFile = configuration.get(MORPHLINE_FILE_PARAM);
    String morphlineId = configuration.get(MORPHLINE_ID_PARAM);
    if (morphlineFile == null || morphlineFile.trim().length() == 0) {
      throw new MorphlineCompilationException("Missing parameter: " + MORPHLINE_FILE_PARAM, null);
    }
    Map morphlineVariables = new HashMap();
    for (Map.Entry<String, String> entry : configuration) {
      String variablePrefix = MORPHLINE_VARIABLE_PARAM + ".";
      if (entry.getKey().startsWith(variablePrefix)) {
        morphlineVariables.put(entry.getKey().substring(variablePrefix.length()), entry.getValue());
      }
    }
    Config override = ConfigFactory.parseMap(morphlineVariables);
    morphline = new Compiler().compile(new File(morphlineFile), morphlineId, morphlineContext, null, override);
    morphlineFileAndId = morphlineFile + "@" + morphlineId;

    disableFileOpen = configuration.getBoolean(DISABLE_FILE_OPEN, false);
    LOG.debug("disableFileOpen: {}", disableFileOpen);

    commandLineMorphlineHeaders = new HashMap();
    for (Map.Entry<String,String> entry : configuration) {
      if (entry.getKey().startsWith(MORPHLINE_FIELD_PREFIX)) {
        commandLineMorphlineHeaders.put(entry.getKey().substring(MORPHLINE_FIELD_PREFIX.length()), entry.getValue());
      }
    }
    LOG.debug("Headers, including MIME types, passed by force from the CLI to morphline: {}", commandLineMorphlineHeaders);

    String metricName = MetricRegistry.name(Utils.getShortClassName(getClass()), Metrics.ELAPSED_TIME);
    this.elapsedTime = morphlineContext.getMetricRegistry().timer(metricName);
    Notifications.notifyBeginTransaction(morphline);
  }

  /**
   * Extract content from the path specified in the value. Key is useless.
   */
  public void map(String value, Configuration configuration, Context context) throws IOException {
    LOG.info("Processing file {}", value);
    InputStream in = null;
    Record record = null;
    Timer.Context timerContext = elapsedTime.time();
    try {
      PathParts parts = new PathParts(value.toString(), configuration);
      record = getRecord(parts);
      if (record == null) {
        return; // ignore
      }
      for (Map.Entry<String, String> entry : commandLineMorphlineHeaders.entrySet()) {
        record.replaceValues(entry.getKey(), entry.getValue());
      }
      long fileLength = parts.getFileStatus().getLen();
      if (disableFileOpen) {
        in = new ByteArrayInputStream(new byte[0]);
      } else {
        in = new BufferedInputStream(parts.getFileSystem().open(parts.getUploadPath()));
      }
      record.put(Fields.ATTACHMENT_BODY, in);
      Notifications.notifyStartSession(morphline);
      if (!morphline.process(record)) {
        LOG.warn("Morphline {} failed to process record: {}", morphlineFileAndId, record);
      }
      if (context != null) {
        context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILES_READ.toString()).increment(1);
        context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILE_BYTES_READ.toString()).increment(fileLength);
      }
    } catch (Exception e) {
      LOG.error("Unable to process file " + value, e);
      if (context != null) {
        context.getCounter(getClass().getName() + ".errors", e.getClass().getName()).increment(1);
      }
      morphlineContext.getExceptionHandler().handleException(e, record);
    } finally {
      timerContext.stop();
      if (in != null) {
        in.close();
      }
    }
  }

  protected Record getRecord(PathParts parts) {
    FileStatus stats;
    try {
      stats = parts.getFileStatus();
    } catch (IOException e) {
      stats = null;
    }
    if (stats == null) {
      LOG.warn("Ignoring file that somehow has become unavailable since the job was submitted: {}",
          parts.getUploadURL());
      return null;
    }

    Record headers = new Record();
    //headers.put(getSchema().getUniqueKeyField().getName(), parts.getId()); // use HDFS file path as docId if no docId is specified
    headers.put(Fields.BASE_ID, parts.getId()); // with sanitizeUniqueKey command, use HDFS file path as docId if no docId is specified
    headers.put(Fields.ATTACHMENT_NAME, parts.getName()); // Tika can use the file name in guessing the right MIME type

    // enable indexing and storing of file meta data in Solr
    headers.put(HdfsFileFieldNames.FILE_UPLOAD_URL, parts.getUploadURL());
    headers.put(HdfsFileFieldNames.FILE_DOWNLOAD_URL, parts.getDownloadURL());
    headers.put(HdfsFileFieldNames.FILE_SCHEME, parts.getScheme());
    headers.put(HdfsFileFieldNames.FILE_HOST, parts.getHost());
    headers.put(HdfsFileFieldNames.FILE_PORT, String.valueOf(parts.getPort()));
    headers.put(HdfsFileFieldNames.FILE_PATH, parts.getURIPath());
    headers.put(HdfsFileFieldNames.FILE_NAME, parts.getName());
    headers.put(HdfsFileFieldNames.FILE_LAST_MODIFIED, String.valueOf(stats.getModificationTime())); // FIXME also add in SpoolDirectorySource
    headers.put(HdfsFileFieldNames.FILE_LENGTH, String.valueOf(stats.getLen())); // FIXME also add in SpoolDirectorySource
    headers.put(HdfsFileFieldNames.FILE_OWNER, stats.getOwner());
    headers.put(HdfsFileFieldNames.FILE_GROUP, stats.getGroup());
    headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_USER, stats.getPermission().getUserAction().SYMBOL);
    headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_GROUP, stats.getPermission().getGroupAction().SYMBOL);
    headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_OTHER, stats.getPermission().getOtherAction().SYMBOL);
    headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_STICKYBIT, String.valueOf(stats.getPermission().getStickyBit()));
    // TODO: consider to add stats.getAccessTime(), stats.getReplication(), stats.isSymlink(), stats.getBlockSize()

    return headers;
  }

  public void cleanup() {
    Notifications.notifyCommitTransaction(morphline);
    Notifications.notifyShutdown(morphline);
  }

}
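
A brief sketch of the configuration keys the runner above consumes, set programmatically rather than via -D flags; all values are placeholders, while the key constants are the ones defined in the class:

    // Placeholder values throughout; key names are the constants defined above.
    Configuration conf = new Configuration();
    conf.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, "/path/to/morphline.conf");
    conf.set(MorphlineMapRunner.MORPHLINE_ID_PARAM, "morphline1");
    conf.set(MorphlineMapRunner.MORPHLINE_VARIABLE_PARAM + ".zkHost", "127.0.0.1:2181/solr");
    conf.set(MorphlineMapRunner.MORPHLINE_FIELD_PREFIX + "_attachment_mimetype", "text/csv");
    conf.setBoolean(MorphlineMapRunner.DISABLE_FILE_OPEN, true); // index file metadata only
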
@@ -1,193 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.morphline;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Map;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.hadoop.HeartBeater;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.SolrMapper;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Counter;
import com.codahale.metrics.Counting;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;

/**
 * This class takes the input files, extracts the relevant content, transforms
 * it and hands SolrInputDocuments to a set of reducers.
 *
 * More specifically, it consumes a list of <offset, hdfsFilePath> input pairs.
 * For each such pair extracts a set of zero or more SolrInputDocuments and
 * sends them to a downstream Reducer. The key for the reducer is the unique id
 * of the SolrInputDocument specified in Solr schema.xml.
 */
public class MorphlineMapper extends SolrMapper<LongWritable, Text> {

  private Context context;
  private MorphlineMapRunner runner;
  private HeartBeater heartBeater;

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  protected IndexSchema getSchema() {
    return runner.getSchema();
  }

  protected Context getContext() {
    return context;
  }

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    this.context = context;
    heartBeater = new HeartBeater(context);
    this.runner = new MorphlineMapRunner(
        context.getConfiguration(), new MyDocumentLoader(), getSolrHomeDir().toString());
  }

  /**
   * Extract content from the path specified in the value. Key is useless.
   */
  @Override
  public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    heartBeater.needHeartBeat();
    try {
      runner.map(value.toString(), context.getConfiguration(), context);
    } finally {
      heartBeater.cancelHeartBeat();
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    heartBeater.close();
    runner.cleanup();
    addMetricsToMRCounters(runner.getMorphlineContext().getMetricRegistry(), context);
    super.cleanup(context);
  }

  private void addMetricsToMRCounters(MetricRegistry metricRegistry, Context context) {
    for (Map.Entry<String, Counter> entry : metricRegistry.getCounters().entrySet()) {
      addCounting(entry.getKey(), entry.getValue(), 1);
    }
    for (Map.Entry<String, Histogram> entry : metricRegistry.getHistograms().entrySet()) {
      addCounting(entry.getKey(), entry.getValue(), 1);
    }
    for (Map.Entry<String, Meter> entry : metricRegistry.getMeters().entrySet()) {
      addCounting(entry.getKey(), entry.getValue(), 1);
    }
    for (Map.Entry<String, Timer> entry : metricRegistry.getTimers().entrySet()) {
      long nanosPerMilliSec = 1000 * 1000;
      addCounting(entry.getKey(), entry.getValue(), nanosPerMilliSec);
    }
  }

  private void addCounting(String metricName, Counting value, long scale) {
    context.getCounter("morphline", metricName).increment(value.getCount() / scale);
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private final class MyDocumentLoader implements DocumentLoader {

    @Override
    public void beginTransaction() {
    }

    @Override
    public void load(SolrInputDocument doc) throws IOException, SolrServerException {
      String uniqueKeyFieldName = getSchema().getUniqueKeyField().getName();
      Object id = doc.getFieldValue(uniqueKeyFieldName);
      if (id == null) {
        throw new IllegalArgumentException("Missing value for (required) unique document key: " + uniqueKeyFieldName
            + " (see Solr schema.xml)");
      }
      try {
        context.write(new Text(id.toString()), new SolrInputDocumentWritable(doc));
      } catch (InterruptedException e) {
        throw new IOException("Interrupted while writing " + doc, e);
      }

      if (LOG.isDebugEnabled()) {
        long numParserOutputBytes = 0;
        for (SolrInputField field : doc.values()) {
          numParserOutputBytes += sizeOf(field.getValue());
        }
        context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.PARSER_OUTPUT_BYTES.toString()).increment(numParserOutputBytes);
      }
      context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.DOCS_READ.toString()).increment(1);
    }

    // just an approximation
    private long sizeOf(Object value) {
      if (value instanceof CharSequence) {
        return ((CharSequence) value).length();
      } else if (value instanceof Integer) {
        return 4;
      } else if (value instanceof Long) {
        return 8;
      } else if (value instanceof Collection) {
        long size = 0;
        for (Object val : (Collection) value) {
          size += sizeOf(val);
        }
        return size;
      } else {
        return String.valueOf(value).length();
      }
    }

    @Override
    public void commitTransaction() {
    }

    @Override
    public UpdateResponse rollbackTransaction() throws SolrServerException, IOException {
      return new UpdateResponse();
    }

    @Override
    public void shutdown() {
    }

    @Override
    public SolrPingResponse ping() throws SolrServerException, IOException {
      return new SolrPingResponse();
    }

  }

}
@@ -1,25 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Morphlines related code.
 */
package org.apache.solr.hadoop.morphline;
@@ -1,25 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * {@link org.apache.solr.hadoop.MapReduceIndexerTool} and related code.
 */
package org.apache.solr.hadoop;
@@ -1,21 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<body>
Apache Solr Search Server: Solr MapReduce contrib
</body>
</html>
@@ -1 +0,0 @@
The test-files used by this module are located in the morphlines-core module.
@@ -1,46 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.util.Comparator;

import org.junit.Assert;
import org.junit.Test;

public class AlphaNumericComparatorTest extends Assert {

  @Test
  public void testBasic() {
    Comparator c = new AlphaNumericComparator();
    assertTrue(c.compare("a", "b") < 0);
    assertTrue(c.compare("shard1", "shard1") == 0);
    //assertTrue(c.compare("shard01", "shard1") == 0);
    assertTrue(c.compare("shard10", "shard10") == 0);
    assertTrue(c.compare("shard1", "shard2") < 0);
    assertTrue(c.compare("shard9", "shard10") < 0);
    assertTrue(c.compare("shard09", "shard10") < 0);
    assertTrue(c.compare("shard019", "shard10") > 0);
    assertTrue(c.compare("shard10", "shard11") < 0);
    assertTrue(c.compare("shard10z", "shard10z") == 0);
    assertTrue(c.compare("shard10z", "shard11z") < 0);
    assertTrue(c.compare("shard10a", "shard10z") < 0);
    assertTrue(c.compare("shard10z", "shard10a") > 0);
    assertTrue(c.compare("shard1z", "shard1z") == 0);
    assertTrue(c.compare("shard2", "shard1") > 0);
  }

}
@@ -1,38 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IdentityMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    LOGGER.info("map key: {}, value: {}", key, value);
    context.write(value, NullWritable.get());
  }
}
@@ -1,37 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IdentityReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
    LOGGER.info("reduce key: {}, value: {}", key, values);
    context.write(key, NullWritable.get());
  }
}
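
Editor's note: the IdentityMapper and IdentityReducer removed above are trivial test helpers. As a usage illustration only (the driver class, paths, and configuration below are assumptions, not code from the removed contrib), this is how such a mapper/reducer pair is typically wired into a Hadoop job:

// Illustrative sketch only: wiring the pair above into a job. IdentityJobSketch
// itself is hypothetical; the Hadoop APIs used are the standard mapreduce ones.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IdentityJobSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "identity");
    job.setJarByClass(IdentityJobSketch.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    // output types match the mapper/reducer signatures: Text key, NullWritable value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));   // text input: (LongWritable offset, Text line)
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}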
@@ -1,94 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

public class LineRandomizerMapperReducerTest extends Assert {

  private MapReduceDriver<LongWritable, Text, LongWritable, Text, Text, NullWritable> mapReduceDriver;

  @Before
  public void setUp() {
    LineRandomizerMapper mapper = new LineRandomizerMapper();
    LineRandomizerReducer reducer = new LineRandomizerReducer();
    mapReduceDriver = MapReduceDriver.newMapReduceDriver(mapper, reducer);
  }

  @Test
  public void testMapReduce1Item() throws IOException {
    mapReduceDriver.withInput(new LongWritable(0), new Text("hello"));
    mapReduceDriver.withOutput(new Text("hello"), NullWritable.get());
    mapReduceDriver.runTest();
  }

  @Test
  public void testMapReduce2Items() throws IOException {
    mapReduceDriver.withAll(Arrays.asList(
        new Pair<>(new LongWritable(0), new Text("hello")),
        new Pair<>(new LongWritable(1), new Text("world"))
    ));
    mapReduceDriver.withAllOutput(Arrays.asList(
        new Pair<>(new Text("world"), NullWritable.get()),
        new Pair<>(new Text("hello"), NullWritable.get())
    ));
    mapReduceDriver.runTest();
  }

  @Test
  public void testMapReduce3Items() throws IOException {
    mapReduceDriver.withAll(Arrays.asList(
        new Pair<>(new LongWritable(0), new Text("hello")),
        new Pair<>(new LongWritable(1), new Text("world")),
        new Pair<>(new LongWritable(2), new Text("nadja"))
    ));
    mapReduceDriver.withAllOutput(Arrays.asList(
        new Pair<>(new Text("nadja"), NullWritable.get()),
        new Pair<>(new Text("world"), NullWritable.get()),
        new Pair<>(new Text("hello"), NullWritable.get())
    ));
    mapReduceDriver.runTest();
  }

  @Test
  public void testMapReduce4Items() throws IOException {
    mapReduceDriver.withAll(Arrays.asList(
        new Pair<>(new LongWritable(0), new Text("hello")),
        new Pair<>(new LongWritable(1), new Text("world")),
        new Pair<>(new LongWritable(2), new Text("nadja")),
        new Pair<>(new LongWritable(3), new Text("basti"))
    ));
    mapReduceDriver.withAllOutput(Arrays.asList(
        new Pair<>(new Text("nadja"), NullWritable.get()),
        new Pair<>(new Text("world"), NullWritable.get()),
        new Pair<>(new Text("basti"), NullWritable.get()),
        new Pair<>(new Text("hello"), NullWritable.get())
    ));
    mapReduceDriver.runTest();
  }

}
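
Editor's note: the LineRandomizerMapper and LineRandomizerReducer under test are removed in hunks not shown here. A plausible minimal sketch consistent with the driver's type parameters above -- an assumption, not the removed implementation -- keys each line with a pseudo-random long so the sort/shuffle phase scrambles line order:

// Illustrative sketch only -- NOT the removed LineRandomizerMapper/Reducer.
import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Mapper: replace the file offset key with a pseudo-random long.
class RandomizeMapperSketch extends Mapper<LongWritable, Text, LongWritable, Text> {
  private final Random random = new Random(42); // fixed seed keeps the sketch deterministic
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.write(new LongWritable(random.nextLong()), value);
  }
}

// Reducer: strip the synthetic key, emitting lines in their new (shuffled) order.
class RandomizeReducerSketch extends Reducer<LongWritable, Text, Text, NullWritable> {
  @Override
  protected void reduce(LongWritable key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      context.write(value, NullWritable.get());
    }
  }
}

The fixed seed here is only for reproducibility of the sketch; the removed classes may derive their randomness differently.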
@@ -1,64 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public abstract class MRUnitBase extends SolrTestCaseJ4 {

  protected static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
  protected static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
  protected static File solrHomeZip;

  @BeforeClass
  public static void setupClass() throws Exception {
    assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
        new Locale("tr").getLanguage().equals(Locale.getDefault().getLanguage()));
    solrHomeZip = SolrOutputFormat.createSolrHomeZip(new File(RESOURCES_DIR + "/solr/mrunit"));
    assertNotNull(solrHomeZip);
  }

  @AfterClass
  public static void teardownClass() throws Exception {
    if (solrHomeZip != null) Files.delete(solrHomeZip.toPath());
    solrHomeZip = null;
  }

  protected void setupHadoopConfig(Configuration config) throws IOException {

    String tempDir = createTempDir().toFile().getAbsolutePath();

    FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));

    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);

    config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf");
    config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName());
  }

}
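
Editor's note: MRUnitBase is the shared fixture for the MRUnit-based tests in this contrib. For context, a minimal sketch of how an MRUnit MapDriver exercises a single mapper in isolation (reusing the IdentityMapper shown earlier; MRUnit 1.x driver API assumed, and the test class name is hypothetical):

// Illustrative sketch only: single-mapper testing with MRUnit's MapDriver.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;

public class IdentityMapperSketchTest {
  @org.junit.Test
  public void testIdentity() throws Exception {
    MapDriver<LongWritable, Text, Text, NullWritable> driver =
        MapDriver.newMapDriver(new IdentityMapper());
    driver.withInput(new LongWritable(0), new Text("hello"));
    driver.withOutput(new Text("hello"), NullWritable.get());
    driver.runTest(); // passes iff the mapper emits its input line unchanged
  }
}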
@@ -1,468 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.util.Constants;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.RetainMostRecentUpdateConflictResolver;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

public class MapReduceIndexerToolArgumentParserTest extends SolrTestCaseJ4 {

  private Configuration conf;
  private MapReduceIndexerTool.MyArgumentParser parser;
  private MapReduceIndexerTool.Options opts;
  private PrintStream oldSystemOut;
  private PrintStream oldSystemErr;
  private ByteArrayOutputStream bout;
  private ByteArrayOutputStream berr;

  private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
  private static final File MINIMR_INSTANCE_DIR = new File(RESOURCES_DIR + "/solr/minimr");

  private static final String MORPHLINE_FILE = RESOURCES_DIR + "/test-morphlines/solrCellDocumentTypes.conf";

  private final File solrHomeDirectory = createTempDir().toFile();

  @BeforeClass
  public static void beforeClass() {
    assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);
    assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
        new Locale("tr").getLanguage().equals(Locale.getDefault().getLanguage()));
  }

  @Before
  public void setUp() throws Exception {
    super.setUp();
    AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
    FileUtils.copyDirectory(MINIMR_INSTANCE_DIR, solrHomeDirectory);

    conf = new Configuration();
    parser = new MapReduceIndexerTool.MyArgumentParser();
    opts = new MapReduceIndexerTool.Options();
    oldSystemOut = System.out;
    bout = new ByteArrayOutputStream();
    System.setOut(new PrintStream(bout, true, "UTF-8"));
    oldSystemErr = System.err;
    berr = new ByteArrayOutputStream();
    System.setErr(new PrintStream(berr, true, "UTF-8"));
  }

  @After
  public void tearDown() throws Exception {
    super.tearDown();
    System.setOut(oldSystemOut);
    System.setErr(oldSystemErr);
  }

  @Test
  public void testArgsParserTypicalUse() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--morphline-id", "morphline_xyz",
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--mappers", "10",
        "--reducers", "9",
        "--fanout", "8",
        "--max-segments", "7",
        "--shards", "1",
        "--verbose",
        "file:///home",
        "file:///dev",
    };
    Integer res = parser.parseArgs(args, conf, opts);
    assertNull(res != null ? res.toString() : "", res);
    assertEquals(Collections.singletonList(new Path("file:///tmp")), opts.inputLists);
    assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
    assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
    assertEquals(10, opts.mappers);
    assertEquals(9, opts.reducers);
    assertEquals(8, opts.fanout);
    assertEquals(7, opts.maxSegments);
    assertEquals(new Integer(1), opts.shards);
    assertEquals(null, opts.fairSchedulerPool);
    assertTrue(opts.isVerbose);
    assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
    assertEquals(RetainMostRecentUpdateConflictResolver.class.getName(), opts.updateConflictResolver);
    assertEquals(MORPHLINE_FILE, opts.morphlineFile.getPath());
    assertEquals("morphline_xyz", opts.morphlineId);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserMultipleSpecsOfSameKind() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--input-list", "file:///",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "file:///home",
        "file:///dev",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(Arrays.asList(new Path("file:///tmp"), new Path("file:///")), opts.inputLists);
    assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
    assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
    assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserTypicalUseWithEqualsSign() {
    String[] args = new String[] {
        "--input-list=file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir=file:/tmp/foo",
        "--solr-home-dir=" + MINIMR_INSTANCE_DIR.getPath(),
        "--mappers=10",
        "--shards", "1",
        "--verbose",
        "file:///home",
        "file:///dev",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(Collections.singletonList(new Path("file:///tmp")), opts.inputLists);
    assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
    assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
    assertEquals(10, opts.mappers);
    assertEquals(new Integer(1), opts.shards);
    assertEquals(null, opts.fairSchedulerPool);
    assertTrue(opts.isVerbose);
    assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserMultipleSpecsOfSameKindWithEqualsSign() {
    String[] args = new String[] {
        "--input-list=file:///tmp",
        "--input-list=file:///",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir=file:/tmp/foo",
        "--solr-home-dir=" + MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "file:///home",
        "file:///dev",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(Arrays.asList(new Path("file:///tmp"), new Path("file:///")), opts.inputLists);
    assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
    assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
    assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserHelp() throws UnsupportedEncodingException {
    String[] args = new String[] { "--help" };
    assertEquals(new Integer(0), parser.parseArgs(args, conf, opts));
    String helpText = new String(bout.toByteArray(), StandardCharsets.UTF_8);
    assertTrue(helpText.contains("MapReduce batch job driver that "));
    assertTrue(helpText.contains("bin/hadoop command"));
    assertEquals(0, berr.toByteArray().length);
  }

  @Test
  public void testArgsParserOk() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(new Integer(1), opts.shards);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserUpdateConflictResolver() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "--update-conflict-resolver", NoChangeUpdateConflictResolver.class.getName(),
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(NoChangeUpdateConflictResolver.class.getName(), opts.updateConflictResolver);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsParserUnknownArgName() throws Exception {
    String[] args = new String[] {
        "--xxxxxxxxinputlist", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsParserFileNotFound1() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/fileNotFound/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsParserFileNotFound2() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", "/fileNotFound",
        "--shards", "1",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsParserIntOutOfRange() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "--mappers", "-20"
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsParserIllegalFanout() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "--fanout", "1" // must be >= 2
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsParserSolrHomeMustContainSolrConfigFile() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--shards", "1",
        "--solr-home-dir", "/",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsShardUrlOk() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shard-url", "http://localhost:8983/solr/collection1",
        "--shard-url", "http://localhost:8983/solr/collection2",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEquals(Arrays.asList(
        Collections.singletonList("http://localhost:8983/solr/collection1"),
        Collections.singletonList("http://localhost:8983/solr/collection2")),
        opts.shardUrls);
    assertEquals(new Integer(2), opts.shards);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsShardUrlMustHaveAParam() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shard-url",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsShardUrlAndShardsSucceeds() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shards", "1",
        "--shard-url", "http://localhost:8983/solr/collection1",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsShardUrlNoGoLive() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shard-url", "http://localhost:8983/solr/collection1"
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEmptySystemErrAndEmptySystemOut();
    assertEquals(new Integer(1), opts.shards);
  }

  @Test
  public void testArgsShardUrlsAndZkhostAreMutuallyExclusive() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shard-url", "http://localhost:8983/solr/collection1",
        "--shard-url", "http://localhost:8983/solr/collection1",
        "--zk-host", "http://localhost:2185",
        "--go-live"
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsGoLiveAndSolrUrl() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--shard-url", "http://localhost:8983/solr/collection1",
        "--shard-url", "http://localhost:8983/solr/collection1",
        "--go-live"
    };
    Integer result = parser.parseArgs(args, conf, opts);
    assertNull(result);
    assertEmptySystemErrAndEmptySystemOut();
  }

  @Test
  public void testArgsZkHostNoGoLive() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--zk-host", "http://localhost:2185",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsGoLiveZkHostNoCollection() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--zk-host", "http://localhost:2185",
        "--go-live"
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testArgsGoLiveNoZkHostOrSolrUrl() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
        "--go-live"
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testNoSolrHomeDirOrZKHost() throws Exception {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--shards", "1",
    };
    assertArgumentParserException(args);
  }

  @Test
  public void testZKHostNoSolrHomeDirOk() {
    String[] args = new String[] {
        "--input-list", "file:///tmp",
        "--morphline-file", MORPHLINE_FILE,
        "--output-dir", "file:/tmp/foo",
        "--zk-host", "http://localhost:2185",
        "--collection", "collection1",
    };
    assertNull(parser.parseArgs(args, conf, opts));
    assertEmptySystemErrAndEmptySystemOut();
  }

  private void assertEmptySystemErrAndEmptySystemOut() {
    assertEquals(0, bout.toByteArray().length);
    assertEquals(0, berr.toByteArray().length);
  }

  private void assertArgumentParserException(String[] args) throws UnsupportedEncodingException {
    assertEquals("should have returned fail code", new Integer(1), parser.parseArgs(args, conf, opts));
    assertEquals("no sys out expected:" + new String(bout.toByteArray(), StandardCharsets.UTF_8), 0, bout.toByteArray().length);
    String usageText;
    usageText = new String(berr.toByteArray(), StandardCharsets.UTF_8);

    assertTrue("should start with usage msg \"usage: hadoop \":" + usageText, usageText.startsWith("usage: hadoop "));
  }

}
@@ -1,415 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.Array;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.authorize.ProxyUsers;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.hack.MiniMRCluster;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.apache.solr.util.BadMrClusterThreadsFilter;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;

@ThreadLeakAction({Action.WARN})
@ThreadLeakLingering(linger = 0)
@ThreadLeakZombies(Consequence.CONTINUE)
@ThreadLeakFilters(defaultFilters = true, filters = {
    BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
})
@Slow
@Nightly
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {

  private static final boolean ENABLE_LOCAL_JOB_RUNNER = false; // for debugging only
  private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
  private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
  private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");

  private static String SEARCH_ARCHIVES_JAR;

  private static MiniDFSCluster dfsCluster = null;
  private static MiniMRCluster mrCluster = null;
  private static int numRuns = 0;

  private final String inputAvroFile;
  private final int count;

  private static String tempDir;

  private static File solrHomeDirectory;

  protected MapReduceIndexerTool createTool() {
    return new MapReduceIndexerTool();
  }

  public MorphlineBasicMiniMRTest() {
    int data = random().nextInt(3);
    switch (data) {
      case 0:
        this.inputAvroFile = "sample-statuses-20120906-141433.avro";
        this.count = 2;
        break;
      case 1:
        this.inputAvroFile = "sample-statuses-20120521-100919.avro";
        this.count = 20;
        break;
      case 2:
        this.inputAvroFile = "sample-statuses-20120906-141433-medium.avro";
        this.count = 2104;
        break;
      default:
        throw new RuntimeException("Test setup is broken");
    }

  }

  @BeforeClass
  public static void setupClass() throws Exception {
    solrHomeDirectory = createTempDir().toFile();

    assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
        Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));

    assumeFalse("FIXME: This test does not work with Windows because of native library requirements", Constants.WINDOWS);

    AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
    FileUtils.copyDirectory(MINIMR_CONF_DIR, solrHomeDirectory);
    File dataDir = createTempDir().toFile();
    tempDir = dataDir.getAbsolutePath();
    new File(tempDir).mkdirs();
    FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));

    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);

    System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());

    int taskTrackers = 1;
    int dataNodes = 2;
//    String proxyUser = System.getProperty("user.name");
//    String proxyGroup = "g";
//    StringBuilder sb = new StringBuilder();
//    sb.append("127.0.0.1,localhost");
//    for (InetAddress i : InetAddress.getAllByName(InetAddress.getLocalHost().getHostName())) {
//      sb.append(",").append(i.getCanonicalHostName());
//    }

    new File(dataDir, "nm-local-dirs").mkdirs();

    System.setProperty("solr.hdfs.blockcache.enabled", "false");

    System.setProperty("test.build.dir", dataDir + File.separator + "hdfs" + File.separator + "test-build-dir");
    System.setProperty("test.build.data", dataDir + File.separator + "hdfs" + File.separator + "build");
    System.setProperty("test.cache.data", dataDir + File.separator + "hdfs" + File.separator + "cache");

    // Initialize AFTER test.build.dir is set, JarFinder uses it.
    SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);

    JobConf conf = new JobConf();
    conf.set("dfs.block.access.token.enable", "false");
    conf.set("dfs.permissions", "true");
    conf.set("hadoop.security.authentication", "simple");
    conf.set(YarnConfiguration.NM_LOCAL_DIRS, dataDir.getPath() + File.separator + "nm-local-dirs");
    conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, dataDir + File.separator + "nm-logs");
    conf.set("testWorkDir", dataDir.getPath() + File.separator + "testWorkDir");
    conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
    conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");

    dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
    FileSystem fileSystem = dfsCluster.getFileSystem();
    fileSystem.mkdirs(new Path("/tmp"));
    fileSystem.mkdirs(new Path("/user"));
    fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
    fileSystem.setPermission(new Path("/tmp"), FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/user"), FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/hadoop/mapred/system"), FsPermission.valueOf("-rwx------"));
    String nnURI = fileSystem.getUri().toString();
    int numDirs = 1;
    String[] racks = null;
    String[] hosts = null;

    mrCluster = new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks, hosts, null, conf);
    ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
  }

  @AfterClass
  public static void teardownClass() throws Exception {
    System.clearProperty("solr.hdfs.blockcache.enabled");
    System.clearProperty("test.build.dir");
    System.clearProperty("test.build.data");
    System.clearProperty("test.cache.data");

    if (mrCluster != null) {
      mrCluster.shutdown();
      mrCluster = null;
    }
    if (dfsCluster != null) {
      dfsCluster.shutdown();
      dfsCluster = null;
    }

    FileSystem.closeAll();
  }

  @After
  public void tearDown() throws Exception {
    System.clearProperty("hadoop.log.dir");
    System.clearProperty("solr.hdfs.blockcache.enabled");

    super.tearDown();
  }

  private JobConf getJobConf() {
    return mrCluster.createJobConf();
  }

  @Test
  public void testPathParts() throws Exception { // see PathParts
    FileSystem fs = dfsCluster.getFileSystem();
    int dfsClusterPort = fs.getWorkingDirectory().toUri().getPort();
    assertTrue(dfsClusterPort > 0);
    JobConf jobConf = getJobConf();
    Configuration simpleConf = new Configuration();

    for (Configuration conf : Arrays.asList(jobConf, simpleConf)) {
      for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
        for (String up : Arrays.asList("", "../")) {
          String down = up.length() == 0 ? "foo/" : "";
          String uploadURL = "hdfs://localhost:12345/user/foo/" + up + "bar.txt" + queryAndFragment;
          PathParts parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertEquals("localhost", parts.getHost());
          assertEquals(12345, parts.getPort());
          assertEquals("hdfs://localhost:12345/user/" + down + "bar.txt", parts.getId());
          assertEquals(parts.getId(), parts.getDownloadURL());
          assertFileNotFound(parts);

          uploadURL = "hdfs://localhost/user/foo/" + up + "bar.txt" + queryAndFragment;
          parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertEquals("localhost", parts.getHost());
          assertEquals(8020, parts.getPort());
          assertEquals("hdfs://localhost:8020/user/" + down + "bar.txt", parts.getId());
          assertEquals(parts.getId(), parts.getDownloadURL());
          assertFileNotFound(parts);
        }
      }
    }

    for (Configuration conf : Arrays.asList(jobConf)) {
      for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
        for (String up : Arrays.asList("", "../")) {
          // verify using absolute path
          String down = up.length() == 0 ? "foo/" : "";
          String uploadURL = "/user/foo/" + up + "bar.txt" + queryAndFragment;
          PathParts parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
          assertEquals(dfsClusterPort, parts.getPort());
          assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + "/user/" + down + "bar.txt")
              || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + "/user/" + down + "bar.txt")
          );
          assertFileNotFound(parts);

          // verify relative path is interpreted to be relative to user's home dir and resolved to an absolute path
          uploadURL = "xuser/foo/" + up + "bar.txt" + queryAndFragment;
          parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          String homeDir = "/user/" + System.getProperty("user.name");
          assertEquals(homeDir + "/xuser/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
          assertEquals(dfsClusterPort, parts.getPort());
          assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
              || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
          );
          assertFileNotFound(parts);
        }
      }
    }

    try {
      new PathParts("/user/foo/bar.txt", simpleConf);
      fail("host/port resolution requires minimr conf, not a simple conf");
    } catch (IllegalArgumentException e) {
      ; // expected
    }
  }

  private void assertFileNotFound(PathParts parts) {
    try {
      parts.getFileSystem().getFileStatus(parts.getUploadPath());
      fail();
    } catch (IOException e) {
      ; // expected
    }
  }

  @Test
  public void mrRun() throws Exception {
    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = new Path(inDir, "input.txt");
    OutputStream os = fs.create(INPATH);
    Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
    wr.write(DATADIR + "/" + inputAvroFile);
    wr.close();

    assertTrue(fs.mkdirs(dataDir));
    fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);

    JobConf jobConf = getJobConf();
    jobConf.set("jobclient.output.filter", "ALL");
    if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this enables to run in debugger and set breakpoints
      jobConf.set("mapred.job.tracker", "local");
    }
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);

    int shards = 2;
    int maxReducers = Integer.MAX_VALUE;
    if (ENABLE_LOCAL_JOB_RUNNER) {
      // local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work.
      // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
      maxReducers = 1;
      shards = 1;
    }

    String[] args = new String[] {
        "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
        "--morphline-id=morphline1",
        "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
        "--output-dir=" + outDir.toString(),
        "--shards=" + shards,
        "--verbose",
        numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
        numRuns % 3 == 0 ? "--reducers=" + shards : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers))
    };
    if (numRuns % 3 == 2) {
      args = concat(args, new String[] {"--fanout=2"});
    }
    if (numRuns == 0) {
      // force (slow) MapReduce based randomization to get coverage for that as well
      args = concat(new String[] {"-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1"}, args);
    }
    MapReduceIndexerTool tool = createTool();
    int res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);
    Job job = tool.job;
    assertTrue(job.isComplete());
    assertTrue(job.isSuccessful());

    if (numRuns % 3 != 2) {
      // Only run this check if mtree merge is disabled.
      // With mtree merge enabled the BatchWriter counters aren't available anymore because
      // variable "job" now refers to the merge job rather than the indexing job
      assertEquals("Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN,
          count, job.getCounters().findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString()).getValue());
    }

    // Check the output is as expected
    outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));

    System.out.println("outputfiles:" + Arrays.toString(outputFiles));

    UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);

    // run again with --dryrun mode:
    tool = createTool();
    args = concat(args, new String[] {"--dry-run"});
    res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);

    numRuns++;
  }

  protected static <T> T[] concat(T[]... arrays) {
    if (arrays.length <= 0) {
      throw new IllegalArgumentException();
    }
    Class clazz = null;
    int length = 0;
    for (T[] array : arrays) {
      clazz = array.getClass();
      length += array.length;
    }
    T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
    int pos = 0;
    for (T[] array : arrays) {
      System.arraycopy(array, 0, result, pos, array.length);
      pos += array.length;
    }
    return result;
  }

}
@ -1,881 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.hadoop;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.io.Writer;
|
||||
import java.lang.reflect.Array;
|
||||
import java.net.URI;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.mapred.JobConf;
|
||||
import org.apache.hadoop.security.authorize.ProxyUsers;
|
||||
import org.apache.hadoop.util.JarFinder;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.lucene.util.Constants;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
|
||||
import org.apache.solr.client.solrj.SolrClient;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrQuery.ORDER;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.cloud.AbstractFullDistribZkTestBase;
|
||||
import org.apache.solr.cloud.AbstractZkTestCase;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.cloud.Replica;
|
||||
import org.apache.solr.common.cloud.Slice;
|
||||
import org.apache.solr.common.cloud.SolrZkClient;
|
||||
import org.apache.solr.common.cloud.ZkCoreNodeProps;
|
||||
import org.apache.solr.common.params.CollectionParams.CollectionAction;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.hadoop.hack.MiniMRClientCluster;
|
||||
import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;
|
||||
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
|
||||
import org.apache.solr.util.BadHdfsThreadsFilter;
|
||||
import org.apache.solr.util.BadMrClusterThreadsFilter;
|
||||
import org.apache.solr.util.TimeOut;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.annotations.Nightly;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
|
||||
|
||||
@ThreadLeakAction({Action.WARN})
|
||||
@ThreadLeakLingering(linger = 0)
|
||||
@ThreadLeakZombies(Consequence.CONTINUE)
|
||||
@ThreadLeakFilters(defaultFilters = true, filters = {
|
||||
BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
|
||||
})
|
||||
@SuppressSSL // SSL does not work with this test for currently unknown reasons
|
||||
@Slow
|
||||
@Nightly
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
|
||||
public class MorphlineGoLiveMiniMRTest extends AbstractFullDistribZkTestBase {
|
||||
|
||||
private static final int RECORD_COUNT = 2104;
|
||||
private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
|
||||
private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
|
||||
private static final File MINIMR_INSTANCE_DIR = new File(RESOURCES_DIR + "/solr/minimr");
|
||||
private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");
|
||||
|
||||
private static String SEARCH_ARCHIVES_JAR;
|
||||
|
||||
private static MiniDFSCluster dfsCluster = null;
|
||||
private static MiniMRClientCluster mrCluster = null;
|
||||
private static String tempDir;
|
||||
|
||||
private final String inputAvroFile1;
|
||||
private final String inputAvroFile2;
|
||||
private final String inputAvroFile3;
|
||||
|
||||
private static File solrHomeDirectory;
|
||||
|
||||
@Override
|
||||
public String getSolrHome() {
|
||||
return solrHomeDirectory.getPath();
|
||||
}
|
||||
|
||||
public MorphlineGoLiveMiniMRTest() {
|
||||
this.inputAvroFile1 = "sample-statuses-20120521-100919.avro";
|
||||
this.inputAvroFile2 = "sample-statuses-20120906-141433.avro";
|
||||
this.inputAvroFile3 = "sample-statuses-20120906-141433-medium.avro";
|
||||
|
||||
sliceCount = TEST_NIGHTLY ? 5 : 3;
|
||||
fixShardCount(TEST_NIGHTLY ? 5 : 3);
|
||||
}
  @BeforeClass
  public static void setupClass() throws Exception {
    System.setProperty("solr.hdfs.blockcache.global", Boolean.toString(LuceneTestCase.random().nextBoolean()));
    System.setProperty("solr.hdfs.blockcache.enabled", Boolean.toString(LuceneTestCase.random().nextBoolean()));
    System.setProperty("solr.hdfs.blockcache.blocksperbank", "2048");

    solrHomeDirectory = createTempDir().toFile();

    assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
        Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));

    assumeFalse("FIXME: This test does not work with Windows because of native library requirements", Constants.WINDOWS);

    AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
    FileUtils.copyDirectory(MINIMR_INSTANCE_DIR, AbstractZkTestCase.SOLRHOME);
    tempDir = createTempDir().toFile().getAbsolutePath();

    new File(tempDir).mkdirs();

    FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));

    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);

    System.setProperty("hadoop.log.dir", new File(tempDir, "logs").getAbsolutePath());

    int dataNodes = 2;

    JobConf conf = new JobConf();
    conf.set("dfs.block.access.token.enable", "false");
    conf.set("dfs.permissions", "true");
    conf.set("hadoop.security.authentication", "simple");
    conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
    conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");

    conf.set(YarnConfiguration.NM_LOCAL_DIRS, tempDir + File.separator + "nm-local-dirs");
    conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, tempDir + File.separator + "nm-logs");

    new File(tempDir + File.separator + "nm-local-dirs").mkdirs();

    System.setProperty("test.build.dir", tempDir + File.separator + "hdfs" + File.separator + "test-build-dir");
    System.setProperty("test.build.data", tempDir + File.separator + "hdfs" + File.separator + "build");
    System.setProperty("test.cache.data", tempDir + File.separator + "hdfs" + File.separator + "cache");

    // Initialize AFTER test.build.dir is set, JarFinder uses it.
    SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);

    dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
    FileSystem fileSystem = dfsCluster.getFileSystem();
    fileSystem.mkdirs(new Path("/tmp"));
    fileSystem.mkdirs(new Path("/user"));
    fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
    fileSystem.setPermission(new Path("/tmp"),
        FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/user"),
        FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/hadoop/mapred/system"),
        FsPermission.valueOf("-rwx------"));

    mrCluster = MiniMRClientClusterFactory.create(MorphlineGoLiveMiniMRTest.class, 1, conf, new File(tempDir, "mrCluster"));

    //new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks,
    //hosts, null, conf);

    ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
  }

  @Override
  public void distribSetUp() throws Exception {
    super.distribSetUp();
    System.setProperty("host", "127.0.0.1");
    System.setProperty("numShards", Integer.toString(sliceCount));
    URI uri = dfsCluster.getFileSystem().getUri();
    System.setProperty("solr.hdfs.home", uri.toString() + "/" + this.getClass().getName());
    uploadConfFiles();
  }

  @Override
  public void distribTearDown() throws Exception {
    super.distribTearDown();
    System.clearProperty("host");
    System.clearProperty("numShards");
    System.clearProperty("solr.hdfs.home");
  }

  @AfterClass
  public static void teardownClass() throws Exception {
    System.clearProperty("solr.hdfs.blockcache.global");
    System.clearProperty("solr.hdfs.blockcache.blocksperbank");
    System.clearProperty("solr.hdfs.blockcache.enabled");
    System.clearProperty("hadoop.log.dir");
    System.clearProperty("test.build.dir");
    System.clearProperty("test.build.data");
    System.clearProperty("test.cache.data");

    if (mrCluster != null) {
      mrCluster.stop();
      mrCluster = null;
    }
    if (dfsCluster != null) {
      dfsCluster.shutdown();
      dfsCluster = null;
    }
    FileSystem.closeAll();
  }

  private JobConf getJobConf() throws IOException {
    JobConf jobConf = new JobConf(mrCluster.getConfig());
    return jobConf;
  }
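
  // buildShardUrls partitions a flat list of shard URLs into numShards groups,
  // one inner list per target shard. The cases below pin down the contract:
  // 6 urls / 2 shards -> two groups of 3; 6 urls / 6 shards -> six singletons;
  // a null numShards means one group per url; and with 5 urls across 2 shards
  // the split is uneven (one group of 2, one of 3).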
  @Test
  public void testBuildShardUrls() throws Exception {
    // 2x3
    Integer numShards = 2;
    List<Object> urls = new ArrayList<>();
    urls.add("shard1");
    urls.add("shard2");
    urls.add("shard3");
    urls.add("shard4");
    urls.add("shard5");
    urls.add("shard6");
    List<List<String>> shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 2, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(3, u.size());
    }

    // 1x6
    numShards = 1;
    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 1, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(6, u.size());
    }

    // 6x1
    numShards = 6;
    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 6, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(1, u.size());
    }

    // 3x2
    numShards = 3;
    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 3, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(2, u.size());
    }

    // null shards, 6x1
    numShards = null;
    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 6, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(1, u.size());
    }

    // null shards, 3x1
    numShards = null;

    urls = new ArrayList<>();
    urls.add("shard1");
    urls.add("shard2");
    urls.add("shard3");

    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 3, shardUrls.size());

    for (List<String> u : shardUrls) {
      assertEquals(1, u.size());
    }

    // 2x(2,3) off balance
    numShards = 2;
    urls = new ArrayList<>();
    urls.add("shard1");
    urls.add("shard2");
    urls.add("shard3");
    urls.add("shard4");
    urls.add("shard5");
    shardUrls = MapReduceIndexerTool.buildShardUrls(urls, numShards);

    assertEquals(shardUrls.toString(), 2, shardUrls.size());

    Set<Integer> counts = new HashSet<>();
    counts.add(shardUrls.get(0).size());
    counts.add(shardUrls.get(1).size());

    assertTrue(counts.contains(2));
    assertTrue(counts.contains(3));
  }

  private String[] prependInitialArgs(String[] args) {
    String[] head = new String[] {
        "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
        "--morphline-id=morphline1",
    };
    return concat(head, args);
  }
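
  // For reference, the assembled invocation behaves roughly like the command
  // line below (sketch only; the jar name is illustrative and the flags are
  // exactly those built in the args arrays that follow):
  //
  //   hadoop jar solr-map-reduce-*.jar org.apache.solr.hadoop.MapReduceIndexerTool \
  //       --morphline-file <tempDir>/test-morphlines/solrCellDocumentTypes.conf \
  //       --morphline-id morphline1 --output-dir hdfs://.../output --mappers 3 \
  //       --go-live --shard-url http://127.0.0.1:<port>/solr/collection1 <input>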
  @Nightly
  @Test
  public void test() throws Exception {

    waitForRecoveriesToFinish(false);

    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path(
        "/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path(
        "/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile1);

    JobConf jobConf = getJobConf();
    jobConf.set("jobclient.output.filter", "ALL");
    // enable mapred.job.tracker = local to run in debugger and set breakpoints
    // jobConf.set("mapred.job.tracker", "local");
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);

    MapReduceIndexerTool tool;
    int res;
    QueryResponse results;
    String[] args = new String[]{};
    List<String> argList = new ArrayList<>();

    try (HttpSolrClient server = getHttpSolrClient(cloudJettys.get(0).url)) {

      args = new String[]{
          "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
          "--output-dir=" + outDir.toString(),
          "--log4j=" + getFile("log4j.properties").getAbsolutePath(),
          "--mappers=3",
          random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
          "--go-live-threads", Integer.toString(random().nextInt(15) + 1),
          "--verbose",
          "--go-live"
      };
      args = prependInitialArgs(args);
      getShardUrlArgs(argList);
      args = concat(args, argList.toArray(new String[0]));

      if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());
        results = server.query(new SolrQuery("*:*"));
        assertEquals(20, results.getResults().getNumFound());
      }

      fs.delete(inDir, true);
      fs.delete(outDir, true);
      fs.delete(dataDir, true);
      assertTrue(fs.mkdirs(inDir));
      INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile2);

      args = new String[]{
          "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
          "--output-dir=" + outDir.toString(),
          "--mappers=3",
          "--verbose",
          "--go-live",
          random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
          "--go-live-threads", Integer.toString(random().nextInt(15) + 1)
      };
      args = prependInitialArgs(args);

      getShardUrlArgs(argList);
      args = concat(args, argList.toArray(new String[0]));

      if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());
        results = server.query(new SolrQuery("*:*"));

        assertEquals(22, results.getResults().getNumFound());
      }

      // try using zookeeper
      String collection = "collection1";
      if (random().nextBoolean()) {
        // sometimes, use an alias
        createAlias("updatealias", "collection1");
        collection = "updatealias";
      }

      fs.delete(inDir, true);
      fs.delete(outDir, true);
      fs.delete(dataDir, true);
      INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

      cloudClient.deleteByQuery("*:*");
      cloudClient.commit();
      assertEquals(0, cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound());

      args = new String[]{
          "--output-dir=" + outDir.toString(),
          "--mappers=3",
          "--reducers=6",
          "--fanout=2",
          "--verbose",
          "--go-live",
          random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
          "--zk-host", zkServer.getZkAddress(),
          "--collection", collection
      };
      args = prependInitialArgs(args);

      if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());

        SolrDocumentList resultDocs = executeSolrQuery(cloudClient, "*:*");
        assertEquals(RECORD_COUNT, resultDocs.getNumFound());
        assertEquals(RECORD_COUNT, resultDocs.size());

        // perform updates
        for (int i = 0; i < RECORD_COUNT; i++) {
          SolrDocument doc = resultDocs.get(i);
          SolrInputDocument update = new SolrInputDocument();
          for (Map.Entry<String, Object> entry : doc.entrySet()) {
            update.setField(entry.getKey(), entry.getValue());
          }
          update.setField("user_screen_name", "Nadja" + i);
          update.removeField("_version_");
          cloudClient.add(update);
        }
        cloudClient.commit();

        // verify updates
        SolrDocumentList resultDocs2 = executeSolrQuery(cloudClient, "*:*");
        assertEquals(RECORD_COUNT, resultDocs2.getNumFound());
        assertEquals(RECORD_COUNT, resultDocs2.size());
        for (int i = 0; i < RECORD_COUNT; i++) {
          SolrDocument doc = resultDocs.get(i);
          SolrDocument doc2 = resultDocs2.get(i);
          assertEquals(doc.getFirstValue("id"), doc2.getFirstValue("id"));
          assertEquals("Nadja" + i, doc2.getFirstValue("user_screen_name"));
          assertEquals(doc.getFirstValue("text"), doc2.getFirstValue("text"));

          // perform delete
          cloudClient.deleteById((String) doc.getFirstValue("id"));
        }
        cloudClient.commit();

        // verify deletes
        assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
      }

      cloudClient.deleteByQuery("*:*");
      cloudClient.commit();
      assertEquals(0, cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound());
    }
    // try using zookeeper with replication
    String replicatedCollection = "replicated_collection";
    if (TEST_NIGHTLY) {
      createCollection(replicatedCollection, 3, 3, 3);
    } else {
      createCollection(replicatedCollection, 2, 3, 2);
    }
    waitForRecoveriesToFinish(false);
    cloudClient.setDefaultCollection(replicatedCollection);
    fs.delete(inDir, true);
    fs.delete(outDir, true);
    fs.delete(dataDir, true);
    assertTrue(fs.mkdirs(dataDir));
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

    args = new String[] {
        "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
        "--output-dir=" + outDir.toString(),
        "--mappers=3",
        "--reducers=12",
        "--fanout=2",
        "--verbose",
        "--go-live",
        "--zk-host", zkServer.getZkAddress(),
        "--collection", replicatedCollection, dataDir.toString()
    };
    args = prependInitialArgs(args);

    if (true) {
      tool = new MapReduceIndexerTool();
      res = ToolRunner.run(jobConf, tool, args);
      assertEquals(0, res);
      assertTrue(tool.job.isComplete());
      assertTrue(tool.job.isSuccessful());

      SolrDocumentList resultDocs = executeSolrQuery(cloudClient, "*:*");
      assertEquals(RECORD_COUNT, resultDocs.getNumFound());
      assertEquals(RECORD_COUNT, resultDocs.size());

      checkConsistency(replicatedCollection);

      // perform updates
      for (int i = 0; i < RECORD_COUNT; i++) {
        SolrDocument doc = resultDocs.get(i);
        SolrInputDocument update = new SolrInputDocument();
        for (Map.Entry<String, Object> entry : doc.entrySet()) {
          update.setField(entry.getKey(), entry.getValue());
        }
        update.setField("user_screen_name", "@Nadja" + i);
        update.removeField("_version_");
        cloudClient.add(update);
      }
      cloudClient.commit();

      // verify updates
      SolrDocumentList resultDocs2 = executeSolrQuery(cloudClient, "*:*");
      assertEquals(RECORD_COUNT, resultDocs2.getNumFound());
      assertEquals(RECORD_COUNT, resultDocs2.size());
      for (int i = 0; i < RECORD_COUNT; i++) {
        SolrDocument doc = resultDocs.get(i);
        SolrDocument doc2 = resultDocs2.get(i);
        assertEquals(doc.getFieldValues("id"), doc2.getFieldValues("id"));
        assertEquals(1, doc.getFieldValues("id").size());
        assertEquals(Arrays.asList("@Nadja" + i), doc2.getFieldValues("user_screen_name"));
        assertEquals(doc.getFieldValues("text"), doc2.getFieldValues("text"));

        // perform delete
        cloudClient.deleteById((String) doc.getFirstValue("id"));
      }
      cloudClient.commit();

      // verify deletes
      assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
    }

    // try using solr_url with replication
    cloudClient.deleteByQuery("*:*");
    cloudClient.commit();
    assertEquals(0, executeSolrQuery(cloudClient, "*:*").getNumFound());
    assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
    fs.delete(inDir, true);
    fs.delete(dataDir, true);
    assertTrue(fs.mkdirs(dataDir));
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

    args = new String[] {
        "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
        "--output-dir=" + outDir.toString(),
        "--shards", "2",
        "--mappers=3",
        "--verbose",
        "--go-live",
        "--go-live-threads", Integer.toString(random().nextInt(15) + 1), dataDir.toString()
    };
    args = prependInitialArgs(args);

    argList = new ArrayList<>();
    getShardUrlArgs(argList, replicatedCollection);
    args = concat(args, argList.toArray(new String[0]));

    if (true) {
      tool = new MapReduceIndexerTool();
      res = ToolRunner.run(jobConf, tool, args);
      assertEquals(0, res);
      assertTrue(tool.job.isComplete());
      assertTrue(tool.job.isSuccessful());

      checkConsistency(replicatedCollection);

      assertEquals(RECORD_COUNT, executeSolrQuery(cloudClient, "*:*").size());
    }

    // delete collection
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("action", CollectionAction.DELETE.toString());
    params.set(CoreAdminParams.DELETE_INSTANCE_DIR, true);
    params.set(CoreAdminParams.DELETE_DATA_DIR, true);
    params.set(CoreAdminParams.DELETE_INDEX, true);
    params.set("name", replicatedCollection);
    QueryRequest request = new QueryRequest(params);
    request.setPath("/admin/collections");
    cloudClient.request(request);

    final TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS);
    while (cloudClient.getZkStateReader().getClusterState().hasCollection(replicatedCollection)) {
      if (timeout.hasTimedOut()) {
        throw new AssertionError("Timeout waiting to see removed collection leave clusterstate");
      }

      Thread.sleep(200);
    }

    if (TEST_NIGHTLY) {
      createCollection(replicatedCollection, 3, 3, 3);
    } else {
      createCollection(replicatedCollection, 2, 3, 2);
    }

    waitForRecoveriesToFinish(replicatedCollection, false);
    printLayout();
    assertEquals(0, executeSolrQuery(cloudClient, "*:*").getNumFound());

    args = new String[] {
        "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
        "--output-dir=" + outDir.toString(),
        "--shards", "2",
        "--mappers=3",
        "--verbose",
        "--go-live",
        "--go-live-threads", Integer.toString(random().nextInt(15) + 1), dataDir.toString()
    };
    args = prependInitialArgs(args);

    argList = new ArrayList<>();
    getShardUrlArgs(argList, replicatedCollection);
    args = concat(args, argList.toArray(new String[0]));

    tool = new MapReduceIndexerTool();
    res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);
    assertTrue(tool.job.isComplete());
    assertTrue(tool.job.isSuccessful());

    checkConsistency(replicatedCollection);

    assertEquals(RECORD_COUNT, executeSolrQuery(cloudClient, "*:*").size());
  }

  private void getShardUrlArgs(List<String> args) {
    for (int i = 0; i < getShardCount(); i++) {
      args.add("--shard-url");
      args.add(cloudJettys.get(i).url);
    }
  }

  private SolrDocumentList executeSolrQuery(SolrClient collection, String queryString) throws SolrServerException, IOException {
    SolrQuery query = new SolrQuery(queryString).setRows(2 * RECORD_COUNT).addSort("id", ORDER.asc);
    QueryResponse response = collection.query(query);
    return response.getResults();
  }
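
  // checkConsistency below verifies replica agreement directly: for every
  // slice of the collection it queries each replica core with distrib=false
  // and asserts that all replicas report the same *:* document count.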
  private void checkConsistency(String replicatedCollection)
      throws Exception {
    Collection<Slice> slices = cloudClient.getZkStateReader().getClusterState()
        .getSlices(replicatedCollection);
    for (Slice slice : slices) {
      Collection<Replica> replicas = slice.getReplicas();
      long found = -1;
      for (Replica replica : replicas) {
        try (HttpSolrClient client = getHttpSolrClient(new ZkCoreNodeProps(replica).getCoreUrl())) {
          SolrQuery query = new SolrQuery("*:*");
          query.set("distrib", false);
          QueryResponse replicaResults = client.query(query);
          long count = replicaResults.getResults().getNumFound();
          if (found != -1) {
            assertEquals(slice.getName() + " is inconsistent "
                + new ZkCoreNodeProps(replica).getCoreUrl(), found, count);
          }
          found = count;
        }
      }
    }
  }

  private void getShardUrlArgs(List<String> args, String replicatedCollection) {
    Collection<Slice> slices = cloudClient.getZkStateReader().getClusterState().getSlices(replicatedCollection);
    for (Slice slice : slices) {
      Collection<Replica> replicas = slice.getReplicas();
      for (Replica replica : replicas) {
        args.add("--shard-url");
        args.add(new ZkCoreNodeProps(replica).getCoreUrl());
      }
    }
  }

  private Path upAvroFile(FileSystem fs, Path inDir, String DATADIR,
      Path dataDir, String localFile) throws IOException, UnsupportedEncodingException {
    Path INPATH = new Path(inDir, "input.txt");
    OutputStream os = fs.create(INPATH);
    Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
    wr.write(DATADIR + File.separator + localFile);
    wr.close();

    assertTrue(fs.mkdirs(dataDir));
    fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, localFile), dataDir);
    return INPATH;
  }

  @Override
  public JettySolrRunner createJetty(File solrHome, String dataDir,
      String shardList, String solrConfigOverride, String schemaOverride)
      throws Exception {

    Properties props = new Properties();
    if (solrConfigOverride != null)
      props.setProperty("solrconfig", solrConfigOverride);
    if (schemaOverride != null)
      props.setProperty("schema", schemaOverride);
    if (shardList != null)
      props.setProperty("shards", shardList);

    String collection = System.getProperty("collection");
    if (collection == null)
      collection = "collection1";
    props.setProperty("collection", collection);

    JettySolrRunner jetty = new JettySolrRunner(solrHome.getAbsolutePath(), props, buildJettyConfig(context));
    jetty.start();

    return jetty;
  }

  private static void putConfig(SolrZkClient zkClient, File solrhome, String name) throws Exception {
    putConfig(zkClient, solrhome, name, name);
  }

  private static void putConfig(SolrZkClient zkClient, File solrhome, String srcName, String destName)
      throws Exception {

    File file = new File(solrhome, "conf" + File.separator + srcName);
    if (!file.exists()) {
      // LOG.info("skipping " + file.getAbsolutePath() +
      // " because it doesn't exist");
      return;
    }

    String destPath = "/configs/conf1/" + destName;
    // LOG.info("put " + file.getAbsolutePath() + " to " + destPath);
    zkClient.makePath(destPath, file, false, true);
  }

  private void uploadConfFiles() throws Exception {
    // upload our own config files
    SolrZkClient zkClient = new SolrZkClient(zkServer.getZkAddress(), 10000);
    putConfig(zkClient, new File(RESOURCES_DIR + "/solr/solrcloud"),
        "solrconfig.xml");
    putConfig(zkClient, MINIMR_CONF_DIR, "schema.xml");
    putConfig(zkClient, MINIMR_CONF_DIR, "elevate.xml");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_en.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ar.txt");

    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_bg.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ca.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_cz.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_da.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_el.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_es.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_eu.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_de.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fa.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fi.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fr.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ga.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_gl.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hi.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hu.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hy.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_id.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_it.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ja.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_lv.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_nl.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_no.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_pt.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ro.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ru.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_sv.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_th.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_tr.txt");

    putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_ca.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_fr.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_ga.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_it.txt");

    putConfig(zkClient, MINIMR_CONF_DIR, "lang/stemdict_nl.txt");

    putConfig(zkClient, MINIMR_CONF_DIR, "lang/hyphenations_ga.txt");

    putConfig(zkClient, MINIMR_CONF_DIR, "stopwords.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "protwords.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "currency.xml");
    putConfig(zkClient, MINIMR_CONF_DIR, "open-exchange-rates.json");
    putConfig(zkClient, MINIMR_CONF_DIR, "mapping-ISOLatin1Accent.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "old_synonyms.txt");
    putConfig(zkClient, MINIMR_CONF_DIR, "synonyms.txt");
    zkClient.close();
  }

  protected static <T> T[] concat(T[]... arrays) {
    if (arrays.length <= 0) {
      throw new IllegalArgumentException();
    }
    Class clazz = null;
    int length = 0;
    for (T[] array : arrays) {
      clazz = array.getClass();
      length += array.length;
    }
    T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
    int pos = 0;
    for (T[] array : arrays) {
      System.arraycopy(array, 0, result, pos, array.length);
      pos += array.length;
    }
    return result;
  }
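
  // Example: concat(new String[] {"-v"}, new String[] {"in", "out"}) yields
  // {"-v", "in", "out"}. Array.newInstance is used so the result keeps a
  // runtime component type taken from the (last) input array rather than
  // plain Object[].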

  private NamedList<Object> createAlias(String alias, String collections) throws SolrServerException, IOException {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("collections", collections);
    params.set("name", alias);
    params.set("action", CollectionAction.CREATEALIAS.toString());
    QueryRequest request = new QueryRequest(params);
    request.setPath("/admin/collections");
    return cloudClient.request(request);
  }

}

@ -1,76 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.net.URLEncoder;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.morphline.MorphlineMapper;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.junit.BeforeClass;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;

@ThreadLeakFilters(defaultFilters = true, filters = {
    BadHdfsThreadsFilter.class // hdfs currently leaks thread(s)
})
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
public class MorphlineMapperTest extends MRUnitBase {

  @BeforeClass
  public static void beforeClass() {
    assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);
  }
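
  // The test below is a minimal MRUnit flow: wrap the mapper in a MapDriver,
  // feed it a single (offset, HDFS path) input pair, ship the solr home zip
  // as a cache archive, then run() and inspect the emitted
  // (key, SolrInputDocumentWritable) pairs.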
  @Test
  public void testMapper() throws Exception {
    MorphlineMapper mapper = new MorphlineMapper();
    MapDriver<LongWritable, Text, Text, SolrInputDocumentWritable> mapDriver = MapDriver.newMapDriver(mapper);

    Configuration config = mapDriver.getConfiguration();
    setupHadoopConfig(config);

    mapDriver.withInput(new LongWritable(0L), new Text("hdfs://localhost/" +
        URLEncoder.encode(DOCUMENTS_DIR, "UTF-8").replace("+", "%20") +
        "/sample-statuses-20120906-141433.avro"));

    SolrInputDocument sid = new SolrInputDocument();
    sid.addField("id", "uniqueid1");
    sid.addField("user_name", "user1");
    sid.addField("text", "content of record one");
    SolrInputDocumentWritable sidw = new SolrInputDocumentWritable(sid);

    mapDriver
        .withCacheArchive(solrHomeZip.getAbsolutePath())
        .withOutput(new Text("0"), sidw);
    //mapDriver.runTest();
    List<Pair<Text, SolrInputDocumentWritable>> result = mapDriver.run();
    for (Pair<Text, SolrInputDocumentWritable> p : result) {
      System.out.println(p.getFirst());
      System.out.println(p.getSecond());
    }
  }
}

@ -1,131 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

import com.google.common.collect.Lists;

@Ignore("This test cannot currently work because it uses a local filesystem output path for the indexes and Solr requires hdfs output paths")
public class MorphlineReducerTest extends MRUnitBase {

  @BeforeClass
  public static void beforeClass2() {
    assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);

    System.setProperty("verifyPartitionAssignment", "false");
  }

  @AfterClass
  public static void afterClass2() {
    System.clearProperty("verifyPartitionAssignment");
  }

  public static class MySolrReducer extends SolrReducer {
    Context context;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      this.context = context;

      // handle a bug in MRUnit - should be fixed in MRUnit 1.0.0
      when(context.getTaskAttemptID()).thenAnswer(new Answer<TaskAttemptID>() {
        @Override
        public TaskAttemptID answer(final InvocationOnMock invocation) {
          // FIXME: MRUnit seems to pass the task id to the reduce task as mapred.TaskID rather than mapreduce.TaskID
          return new TaskAttemptID(new TaskID("000000000000", 0, true, 0), 0);
        }
      });

      super.setup(context);
    }

  }

  public static class NullInputFormat<K, V> extends InputFormat<K, V> {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException,
        InterruptedException {
      return Lists.newArrayList();
    }

    @Override
    public RecordReader<K, V> createRecordReader(InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
      return null;
    }

  }
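
  // NullInputFormat above is a stub: MRUnit's withOutputFormat() takes an
  // InputFormat class alongside the OutputFormat, but this test only
  // exercises the output side, so the stub reports no splits and never
  // creates a record reader.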

  @Test
  public void testReducer() throws Exception {
    MySolrReducer myReducer = new MySolrReducer();
    try {
      ReduceDriver<Text, SolrInputDocumentWritable, Text, SolrInputDocumentWritable> reduceDriver = ReduceDriver
          .newReduceDriver(myReducer);

      Configuration config = reduceDriver.getConfiguration();
      setupHadoopConfig(config);

      List<SolrInputDocumentWritable> values = new ArrayList<>();
      SolrInputDocument sid = new SolrInputDocument();
      String id = "myid1";
      sid.addField("id", id);
      sid.addField("text", "some unique text");
      SolrInputDocumentWritable sidw = new SolrInputDocumentWritable(sid);
      values.add(sidw);
      reduceDriver.withInput(new Text(id), values);

      reduceDriver.withCacheArchive(solrHomeZip.getAbsolutePath());

      reduceDriver.withOutputFormat(SolrOutputFormat.class,
          NullInputFormat.class);

      reduceDriver.run();

      assertEquals("Expected 1 counter increment", 1,
          reduceDriver.getCounters().findCounter(SolrCounters.class.getName(),
              SolrCounters.DOCUMENTS_WRITTEN.toString()).getValue());
    } finally {
      myReducer.cleanup(myReducer.context);
    }
  }

}

@ -1,57 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertEquals;

public class UtilsForTests {
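
  // Validates MapReduceIndexerTool output without a live Solr instance: each
  // "part*" directory under outDir holds one shard's index, so open an
  // EmbeddedSolrServer over it, count the *:* hits, and compare the summed
  // totals against the expected shard and document counts.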
  public static void validateSolrServerDocumentCount(File solrHomeDir, FileSystem fs, Path outDir, int expectedDocs, int expectedShards)
      throws IOException, SolrServerException {

    long actualDocs = 0;
    int actualShards = 0;
    for (FileStatus dir : fs.listStatus(outDir)) { // for each shard
      if (dir.getPath().getName().startsWith("part") && dir.isDirectory()) {
        actualShards++;
        try (EmbeddedSolrServer solr
            = SolrRecordWriter.createEmbeddedSolrServer(new Path(solrHomeDir.getAbsolutePath()), fs, dir.getPath())) {
          SolrQuery query = new SolrQuery();
          query.setQuery("*:*");
          QueryResponse resp = solr.query(query);
          long numDocs = resp.getResults().getNumFound();
          actualDocs += numDocs;
        }
      }
    }
    assertEquals(expectedShards, actualShards);
    assertEquals(expectedDocs, actualDocs);
  }

}

@ -1,41 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.hack;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

/*
 * A simple interface for a client MR cluster used for testing. This interface
 * provides basic methods which are independent of the underlying Mini Cluster
 * (either through MR1 or MR2).
 */
public interface MiniMRClientCluster {

  public void start() throws IOException;

  /**
   * Stop and start back the cluster using the same configuration.
   */
  public void restart() throws IOException;

  public void stop() throws IOException;

  public Configuration getConfig() throws IOException;

}

@ -1,88 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.hack;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.JarFinder;

/**
 * A MiniMRCluster factory. In MR2, it provides a wrapper MiniMRClientCluster
 * interface around the MiniMRYarnCluster. While in MR1, it provides such
 * wrapper around MiniMRCluster. This factory should be used in tests to provide
 * an easy migration of tests across MR1 and MR2.
 */
public class MiniMRClientClusterFactory {

  public static MiniMRClientCluster create(Class<?> caller, int noOfNMs,
      Configuration conf, File testWorkDir) throws IOException {
    return create(caller, caller.getSimpleName(), noOfNMs, conf, testWorkDir);
  }
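
  // The overload below does the real work: it stages the MR app-master jar
  // and the caller's own jar onto the test FileSystem, adds both to the job
  // classpath, then initializes and starts a MiniMRYarnCluster wrapped in the
  // MiniMRClientCluster adapter.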
  public static MiniMRClientCluster create(Class<?> caller, String identifier,
      int noOfNMs, Configuration conf, File testWorkDir) throws IOException {

    if (conf == null) {
      conf = new Configuration();
    }

    FileSystem fs = FileSystem.get(conf);

    Path testRootDir = new Path(testWorkDir.getPath(), identifier + "-tmpDir")
        .makeQualified(fs);
    Path appJar = new Path(testRootDir, "MRAppJar.jar");

    // Copy MRAppJar and make it private.
    Path appMasterJar = new Path(MiniMRYarnCluster.APPJAR);

    fs.copyFromLocalFile(appMasterJar, appJar);
    fs.setPermission(appJar, new FsPermission("744"));

    Job job = Job.getInstance(conf);

    job.addFileToClassPath(appJar);

    Path callerJar = new Path(JarFinder.getJar(caller));
    Path remoteCallerJar = new Path(testRootDir, callerJar.getName());
    fs.copyFromLocalFile(callerJar, remoteCallerJar);
    fs.setPermission(remoteCallerJar, new FsPermission("744"));
    job.addFileToClassPath(remoteCallerJar);

    MiniMRYarnCluster miniMRYarnCluster;
    try {
      miniMRYarnCluster = new MiniMRYarnCluster(identifier,
          noOfNMs, testWorkDir);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    job.getConfiguration().set("minimrclientcluster.caller.name",
        identifier);
    job.getConfiguration().setInt("minimrclientcluster.nodemanagers.number",
        noOfNMs);
    miniMRYarnCluster.init(job.getConfiguration());
    miniMRYarnCluster.start();

    return new MiniMRYarnClusterAdapter(miniMRYarnCluster, testWorkDir);
  }

}

@ -1,266 +0,0 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.hack;

import java.io.File;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.JobPriority;
import org.apache.hadoop.mapred.MapTaskCompletionEventsUpdate;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.lucene.util.LuceneTestCase;

/**
 * This class is an MR2 replacement for the older MR1 MiniMRCluster that was used
 * by tests prior to MR2. This replacement class uses the new MiniMRYarnCluster
 * in MR2 but provides the same old MR1 interface, so tests can be migrated from
 * MR1 to MR2 with minimal changes.
 *
 * Due to major differences between MR1 and MR2, a number of methods are either
 * unimplemented/unsupported or were re-implemented to provide wrappers around
 * MR2 functionality.
 *
 * @deprecated Use {@link org.apache.hadoop.mapred.MiniMRClientClusterFactory}
 * instead
 */
@Deprecated
public class MiniMRCluster {
  private static final Log LOG = LogFactory.getLog(MiniMRCluster.class);

  private MiniMRClientCluster mrClientCluster;

  public String getTaskTrackerLocalDir(int taskTracker) {
    throw new UnsupportedOperationException();
  }

  public String[] getTaskTrackerLocalDirs(int taskTracker) {
    throw new UnsupportedOperationException();
  }

  class JobTrackerRunner {
    // Mock class
  }

  class TaskTrackerRunner {
    // Mock class
  }

  public JobTrackerRunner getJobTrackerRunner() {
    throw new UnsupportedOperationException();
  }

  TaskTrackerRunner getTaskTrackerRunner(int id) {
    throw new UnsupportedOperationException();
  }

  public int getNumTaskTrackers() {
    throw new UnsupportedOperationException();
  }

  public void setInlineCleanupThreads() {
    throw new UnsupportedOperationException();
  }

  public void waitUntilIdle() {
    throw new UnsupportedOperationException();
  }

  private void waitTaskTrackers() {
    throw new UnsupportedOperationException();
  }

  public int getJobTrackerPort() {
    throw new UnsupportedOperationException();
  }

  public JobConf createJobConf() {
    JobConf jobConf = null;
    try {
      jobConf = new JobConf(mrClientCluster.getConfig());
    } catch (IOException e) {
      LOG.error(e);
    }
    return jobConf;
  }

  public JobConf createJobConf(JobConf conf) {
    JobConf jobConf = null;
    try {
      jobConf = new JobConf(mrClientCluster.getConfig());
    } catch (IOException e) {
      LOG.error(e);
    }
    return jobConf;
  }

  static JobConf configureJobConf(JobConf conf, String namenode,
      int jobTrackerPort, int jobTrackerInfoPort, UserGroupInformation ugi) {
    throw new UnsupportedOperationException();
  }
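
  // Every MR1-style constructor below funnels into the final overload, which
  // ignores the jobtracker/tasktracker specifics and simply spins up a
  // YARN-based MiniMRClientCluster under the configured test work dir.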
  public MiniMRCluster(int numTaskTrackers, String namenode, int numDir,
      String[] racks, String[] hosts) throws Exception {
    this(0, 0, numTaskTrackers, namenode, numDir, racks, hosts);
  }

  public MiniMRCluster(int numTaskTrackers, String namenode, int numDir,
      String[] racks, String[] hosts, JobConf conf) throws Exception {
    this(0, 0, numTaskTrackers, namenode, numDir, racks, hosts, null, conf);
  }

  public MiniMRCluster(int numTaskTrackers, String namenode, int numDir)
      throws Exception {
    this(0, 0, numTaskTrackers, namenode, numDir);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir) throws Exception {
    this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
        null);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir, String[] racks)
      throws Exception {
    this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
        racks, null);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir, String[] racks,
      String[] hosts) throws Exception {
    this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
        racks, hosts, null);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir, String[] racks,
      String[] hosts, UserGroupInformation ugi) throws Exception {
    this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
        racks, hosts, ugi, null);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir, String[] racks,
      String[] hosts, UserGroupInformation ugi, JobConf conf)
      throws Exception {
    this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
        racks, hosts, ugi, conf, 0);
  }

  public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
      int numTaskTrackers, String namenode, int numDir, String[] racks,
      String[] hosts, UserGroupInformation ugi, JobConf conf,
      int numTrackerToExclude) throws Exception {
    if (conf == null) conf = new JobConf();
    FileSystem.setDefaultUri(conf, namenode);
    String identifier = this.getClass().getSimpleName() + "_"
        + Integer.toString(LuceneTestCase.random().nextInt(Integer.MAX_VALUE));
    mrClientCluster = MiniMRClientClusterFactory.create(this.getClass(),
        identifier, numTaskTrackers, conf, new File(conf.get("testWorkDir")));
  }

  public UserGroupInformation getUgi() {
    throw new UnsupportedOperationException();
  }

  public TaskCompletionEvent[] getTaskCompletionEvents(JobID id, int from,
      int max) throws IOException {
    throw new UnsupportedOperationException();
  }

  public void setJobPriority(JobID jobId, JobPriority priority)
      throws AccessControlException, IOException {
    throw new UnsupportedOperationException();
  }

  public JobPriority getJobPriority(JobID jobId) {
    throw new UnsupportedOperationException();
  }

  public long getJobFinishTime(JobID jobId) {
    throw new UnsupportedOperationException();
  }

  public void initializeJob(JobID jobId) throws IOException {
    throw new UnsupportedOperationException();
  }

  public MapTaskCompletionEventsUpdate getMapTaskCompletionEventsUpdates(
      int index, JobID jobId, int max) throws IOException {
    throw new UnsupportedOperationException();
  }

  public JobConf getJobTrackerConf() {
    JobConf jobConf = null;
    try {
      jobConf = new JobConf(mrClientCluster.getConfig());
    } catch (IOException e) {
      LOG.error(e);
    }
    return jobConf;
  }

  public int getFaultCount(String hostName) {
    throw new UnsupportedOperationException();
  }

  public void startJobTracker() {
    // Do nothing
  }

  public void startJobTracker(boolean wait) {
    // Do nothing
  }

  public void stopJobTracker() {
    // Do nothing
  }

  public void stopTaskTracker(int id) {
    // Do nothing
  }

  public void startTaskTracker(String host, String rack, int idx, int numDir)
      throws IOException {
    // Do nothing
  }

  void addTaskTracker(TaskTrackerRunner taskTracker) {
    throw new UnsupportedOperationException();
  }

  int getTaskTrackerID(String trackerName) {
    throw new UnsupportedOperationException();
  }

  public void shutdown() {
    try {
      mrClientCluster.stop();
    } catch (IOException e) {
      LOG.error(e);
    }
  }
}

@ -1,205 +0,0 @@

/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.hadoop.hack;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.mapred.LocalContainerLauncher;
|
||||
import org.apache.hadoop.mapred.ShuffleHandler;
|
||||
import org.apache.hadoop.mapreduce.MRConfig;
|
||||
import org.apache.hadoop.mapreduce.MRJobConfig;
|
||||
import org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer;
|
||||
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
|
||||
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
|
||||
import org.apache.hadoop.service.AbstractService;
|
||||
import org.apache.hadoop.service.Service;
|
||||
import org.apache.hadoop.util.JarFinder;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||
|
||||
/**
|
||||
* Configures and starts the MR-specific components in the YARN cluster.
|
||||
*
|
||||
*/
|
||||
public class MiniMRYarnCluster extends MiniYARNCluster {
|
||||
|
||||
public static final String APPJAR = JarFinder.getJar(LocalContainerLauncher.class);
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(MiniMRYarnCluster.class);
|
||||
private JobHistoryServer historyServer;
|
||||
private JobHistoryServerWrapper historyServerWrapper;
|
||||
|
||||
public MiniMRYarnCluster(String testName, File testWorkDir) {
|
||||
this(testName, 1, testWorkDir);
|
||||
}
|
||||
|
||||
public MiniMRYarnCluster(String testName, int noOfNMs, File testWorkDir) {
|
||||
super(testName, noOfNMs, 4, 4, testWorkDir);
|
||||
//TODO: add the history server
|
||||
historyServerWrapper = new JobHistoryServerWrapper();
|
||||
addService(historyServerWrapper);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void serviceInit(Configuration conf) throws Exception {
|
||||
conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME);
|
||||
if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
|
||||
conf.set(MRJobConfig.MR_AM_STAGING_DIR, new File(getTestWorkDir(),
|
||||
"apps_staging_dir/").getAbsolutePath());
|
||||
}
|
||||
|
||||
// By default, VMEM monitoring disabled, PMEM monitoring enabled.
|
||||
if (!conf.getBoolean(
|
||||
MRConfig.MAPREDUCE_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
|
||||
MRConfig.DEFAULT_MAPREDUCE_MINICLUSTER_CONTROL_RESOURCE_MONITORING)) {
|
||||
conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
||||
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
||||
}
|
||||
|
||||
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
|
||||
|
||||
try {
|
||||
Path stagingPath = FileContext.getFileContext(conf).makeQualified(
|
||||
new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
|
||||
/*
|
||||
* Re-configure the staging path on Windows if the file system is localFs.
|
||||
* We need to use a absolute path that contains the drive letter. The unit
|
||||
* test could run on a different drive than the AM. We can run into the
|
||||
* issue that job files are localized to the drive where the test runs on,
|
||||
* while the AM starts on a different drive and fails to find the job
|
||||
* metafiles. Using absolute path can avoid this ambiguity.
       */
      if (Path.WINDOWS) {
        if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
          conf.set(MRJobConfig.MR_AM_STAGING_DIR,
              new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR))
                  .getAbsolutePath());
        }
      }
      FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
      if (fc.util().exists(stagingPath)) {
        LOG.info(stagingPath + " exists! deleting...");
        fc.delete(stagingPath, true);
      }
      LOG.info("mkdir: " + stagingPath);
      // mkdir the staging directory so that right permissions are set while running as proxy user
      fc.mkdir(stagingPath, null, true);
      // mkdir done directory as well
      String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
      Path doneDirPath = fc.makeQualified(new Path(doneDir));
      fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
      throw new YarnRuntimeException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test"); // The default is local because of
                                               // which shuffle doesn't happen
    // configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
        new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(String.format(Locale.ENGLISH, YarnConfiguration.NM_AUX_SERVICE_FMT,
        ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID), ShuffleHandler.class,
        Service.class);

    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
        DefaultContainerExecutor.class, ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);

    super.serviceInit(conf);
  }

  private class JobHistoryServerWrapper extends AbstractService {
    public JobHistoryServerWrapper() {
      super(JobHistoryServerWrapper.class.getName());
    }

    @Override
    public synchronized void serviceStart() throws Exception {
      try {
        if (!getConfig().getBoolean(
            JHAdminConfig.MR_HISTORY_MINICLUSTER_FIXED_PORTS,
            JHAdminConfig.DEFAULT_MR_HISTORY_MINICLUSTER_FIXED_PORTS)) {
          // pick free random ports.
          getConfig().set(JHAdminConfig.MR_HISTORY_ADDRESS,
              MiniYARNCluster.getHostname() + ":0");
          getConfig().set(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS,
              MiniYARNCluster.getHostname() + ":0");
        }
        historyServer = new JobHistoryServer();
        historyServer.init(getConfig());
        new Thread() {
          public void run() {
            historyServer.start();
          }
        }.start();
        while (historyServer.getServiceState() == STATE.INITED) {
          LOG.info("Waiting for HistoryServer to start...");
          Thread.sleep(1500);
        }
        //TODO Add a timeout. State.STOPPED check ?
        if (historyServer.getServiceState() != STATE.STARTED) {
          throw new IOException("HistoryServer failed to start");
        }
        super.serviceStart();
      } catch (Throwable t) {
        throw new YarnRuntimeException(t);
      }
      // need to do this because historyServer.init creates a new Configuration
      getConfig().set(JHAdminConfig.MR_HISTORY_ADDRESS,
          historyServer.getConfig().get(JHAdminConfig.MR_HISTORY_ADDRESS));
      getConfig().set(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS,
          historyServer.getConfig().get(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS));

      LOG.info("MiniMRYARN ResourceManager address: " +
          getConfig().get(YarnConfiguration.RM_ADDRESS));
      LOG.info("MiniMRYARN ResourceManager web address: " +
          getConfig().get(YarnConfiguration.RM_WEBAPP_ADDRESS));
      LOG.info("MiniMRYARN HistoryServer address: " +
          getConfig().get(JHAdminConfig.MR_HISTORY_ADDRESS));
      LOG.info("MiniMRYARN HistoryServer web address: " +
          getConfig().get(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS));
    }

    @Override
    public synchronized void serviceStop() throws Exception {
      if (historyServer != null) {
        historyServer.stop();
      }
      super.serviceStop();
    }
  }

  public JobHistoryServer getHistoryServer() {
    return this.historyServer;
  }
}
@ -1,78 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.hack;

import java.io.File;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.service.Service.STATE;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

/**
 * An adapter for MiniMRYarnCluster providing a MiniMRClientCluster interface.
 * This interface could be used by tests across both MR1 and MR2.
 */
public class MiniMRYarnClusterAdapter implements MiniMRClientCluster {

  private MiniMRYarnCluster miniMRYarnCluster;

  private File testWorkDir;

  private static final Log LOG = LogFactory.getLog(MiniMRYarnClusterAdapter.class);

  public MiniMRYarnClusterAdapter(MiniMRYarnCluster miniMRYarnCluster, File testWorkDir) {
    this.miniMRYarnCluster = miniMRYarnCluster;
    this.testWorkDir = testWorkDir;
  }

  @Override
  public Configuration getConfig() {
    return miniMRYarnCluster.getConfig();
  }

  @Override
  public void start() {
    miniMRYarnCluster.start();
  }

  @Override
  public void stop() {
    miniMRYarnCluster.stop();
  }

  @Override
  public void restart() {
    if (!miniMRYarnCluster.getServiceState().equals(STATE.STARTED)) {
      LOG.warn("Cannot restart the mini cluster, start it first");
      return;
    }
    Configuration oldConf = new Configuration(getConfig());
    String callerName = oldConf.get("minimrclientcluster.caller.name",
        this.getClass().getName());
    int noOfNMs = oldConf.getInt("minimrclientcluster.nodemanagers.number", 1);
    oldConf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS, true);
    oldConf.setBoolean(JHAdminConfig.MR_HISTORY_MINICLUSTER_FIXED_PORTS, true);
    stop();
    miniMRYarnCluster = new MiniMRYarnCluster(callerName, noOfNMs, testWorkDir);
    miniMRYarnCluster.init(oldConf);
    miniMRYarnCluster.start();
  }

}
@ -1,409 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop.hack;

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Locale;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService;

public class MiniYARNCluster extends CompositeService {

  private static final Log LOG = LogFactory.getLog(MiniYARNCluster.class);

  // temp fix until metrics system can auto-detect itself running in unit test:
  static {
    DefaultMetricsSystem.setMiniClusterMode(true);
  }

  private NodeManager[] nodeManagers;
  private ResourceManager resourceManager;

  private ResourceManagerWrapper resourceManagerWrapper;

  private File testWorkDir;

  // Number of nm-local-dirs per nodemanager
  private int numLocalDirs;
  // Number of nm-log-dirs per nodemanager
  private int numLogDirs;

  /**
   * @param testName name of the test
   * @param noOfNodeManagers the number of node managers in the cluster
   * @param numLocalDirs the number of nm-local-dirs per nodemanager
   * @param numLogDirs the number of nm-log-dirs per nodemanager
   */
  public MiniYARNCluster(String testName, int noOfNodeManagers,
      int numLocalDirs, int numLogDirs, File testWorkDir) {
    super(testName.replace("$", ""));
    this.numLocalDirs = numLocalDirs;
    this.numLogDirs = numLogDirs;
    String testSubDir = testName.replace("$", "");
    File targetWorkDir = new File(testWorkDir, testSubDir);
    try {
      FileContext.getLocalFSFileContext().delete(
          new Path(targetWorkDir.getAbsolutePath()), true);
    } catch (Exception e) {
      LOG.warn("COULD NOT CLEANUP", e);
      throw new YarnRuntimeException("could not cleanup test dir: " + e, e);
    }

    if (Shell.WINDOWS) {
      // The test working directory can exceed the maximum path length supported
      // by some Windows APIs and cmd.exe (260 characters). To work around this,
      // create a symlink in temporary storage with a much shorter path,
      // targeting the full path to the test working directory. Then, use the
      // symlink as the test working directory.
      String targetPath = targetWorkDir.getAbsolutePath();
      File link = new File(System.getProperty("java.io.tmpdir"),
          String.valueOf(System.nanoTime()));
      String linkPath = link.getAbsolutePath();

      try {
        FileContext.getLocalFSFileContext().delete(new Path(linkPath), true);
      } catch (IOException e) {
        throw new YarnRuntimeException("could not cleanup symlink: " + linkPath, e);
      }

      // Guarantee target exists before creating symlink.
      targetWorkDir.mkdirs();

      ShellCommandExecutor shexec = new ShellCommandExecutor(
          Shell.getSymlinkCommand(targetPath, linkPath));
      try {
        shexec.execute();
      } catch (IOException e) {
        throw new YarnRuntimeException(String.format(Locale.ENGLISH,
            "failed to create symlink from %s to %s, shell output: %s", linkPath,
            targetPath, shexec.getOutput()), e);
      }

      this.testWorkDir = link;
    } else {
      this.testWorkDir = targetWorkDir;
    }

    resourceManagerWrapper = new ResourceManagerWrapper();
    addService(resourceManagerWrapper);
    nodeManagers = new CustomNodeManager[noOfNodeManagers];
    for (int index = 0; index < noOfNodeManagers; index++) {
      addService(new NodeManagerWrapper(index));
      nodeManagers[index] = new CustomNodeManager();
    }
  }

  @Override
  public void serviceInit(Configuration conf) throws Exception {
    super.serviceInit(conf instanceof YarnConfiguration ? conf
        : new YarnConfiguration(conf));
  }

  public File getTestWorkDir() {
    return testWorkDir;
  }

  public ResourceManager getResourceManager() {
    return this.resourceManager;
  }

  public NodeManager getNodeManager(int i) {
    return this.nodeManagers[i];
  }

  public static String getHostname() {
    try {
      return InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException ex) {
      throw new RuntimeException(ex);
    }
  }

  private class ResourceManagerWrapper extends AbstractService {
    public ResourceManagerWrapper() {
      super(ResourceManagerWrapper.class.getName());
    }

    @Override
    public synchronized void serviceStart() throws Exception {
      try {
        getConfig().setBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, true);
        if (!getConfig().getBoolean(
            YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS,
            YarnConfiguration.DEFAULT_YARN_MINICLUSTER_FIXED_PORTS)) {
          // pick free random ports.
          String hostname = MiniYARNCluster.getHostname();
          getConfig().set(YarnConfiguration.RM_ADDRESS, hostname + ":0");
          getConfig().set(YarnConfiguration.RM_ADMIN_ADDRESS, hostname + ":0");
          getConfig().set(YarnConfiguration.RM_SCHEDULER_ADDRESS, hostname + ":0");
          getConfig().set(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS, hostname + ":0");
          getConfig().set(YarnConfiguration.RM_WEBAPP_ADDRESS, hostname + ":0");
        }
        resourceManager = new ResourceManager() {
          @Override
          protected void doSecureLogin() throws IOException {
            // Don't try to login using keytab in the testcase.
          }
        };
        resourceManager.init(getConfig());
        new Thread() {
          public void run() {
            resourceManager.start();
          }
        }.start();
        int waitCount = 0;
        while (resourceManager.getServiceState() == STATE.INITED
            && waitCount++ < 60) {
          LOG.info("Waiting for RM to start...");
          Thread.sleep(1500);
        }
        if (resourceManager.getServiceState() != STATE.STARTED) {
          // RM could have failed.
          throw new IOException(
              "ResourceManager failed to start. Final state is "
                  + resourceManager.getServiceState());
        }
        super.serviceStart();
      } catch (Throwable t) {
        throw new YarnRuntimeException(t);
      }
      LOG.info("MiniYARN ResourceManager address: " +
          getConfig().get(YarnConfiguration.RM_ADDRESS));
      LOG.info("MiniYARN ResourceManager web address: " +
          getConfig().get(YarnConfiguration.RM_WEBAPP_ADDRESS));
    }

    @Override
    public synchronized void serviceStop() throws Exception {
      if (resourceManager != null) {
        resourceManager.stop();
      }
      super.serviceStop();

      if (Shell.WINDOWS) {
        // On Windows, clean up the short temporary symlink that was created to
        // work around path length limitation.
        String testWorkDirPath = testWorkDir.getAbsolutePath();
        try {
          FileContext.getLocalFSFileContext().delete(new Path(testWorkDirPath),
              true);
        } catch (IOException e) {
          LOG.warn("could not cleanup symlink: " +
              testWorkDir.getAbsolutePath());
        }
      }
    }
  }

  private class NodeManagerWrapper extends AbstractService {
    int index = 0;

    public NodeManagerWrapper(int i) {
      super(NodeManagerWrapper.class.getName() + "_" + i);
      index = i;
    }

    public synchronized void serviceInit(Configuration conf) throws Exception {
      Configuration config = new YarnConfiguration(conf);
      super.serviceInit(config);
    }

    /**
     * Create local/log directories
     * @param dirType type of directories i.e. local dirs or log dirs
     * @param numDirs number of directories
     * @return the created directories as a comma delimited String
     */
    private String prepareDirs(String dirType, int numDirs) {
      File[] dirs = new File[numDirs];
      String dirsString = "";
      for (int i = 0; i < numDirs; i++) {
        dirs[i] = new File(testWorkDir, MiniYARNCluster.this.getName()
            + "-" + dirType + "Dir-nm-" + index + "_" + i);
        dirs[i].mkdirs();
        LOG.info("Created " + dirType + "Dir in " + dirs[i].getAbsolutePath());
        String delimiter = (i > 0) ? "," : "";
        dirsString = dirsString.concat(delimiter + dirs[i].getAbsolutePath());
      }
      return dirsString;
    }

    public synchronized void serviceStart() throws Exception {
      try {
        // create nm-local-dirs and configure them for the nodemanager
        String localDirsString = prepareDirs("local", numLocalDirs);
        getConfig().set(YarnConfiguration.NM_LOCAL_DIRS, localDirsString);
        // create nm-log-dirs and configure them for the nodemanager
        String logDirsString = prepareDirs("log", numLogDirs);
        getConfig().set(YarnConfiguration.NM_LOG_DIRS, logDirsString);

        File remoteLogDir =
            new File(testWorkDir, MiniYARNCluster.this.getName()
                + "-remoteLogDir-nm-" + index);
        remoteLogDir.mkdir();
        getConfig().set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
            remoteLogDir.getAbsolutePath());
        // By default AM + 2 containers
        getConfig().setInt(YarnConfiguration.NM_PMEM_MB, 4*1024);
        getConfig().set(YarnConfiguration.NM_ADDRESS,
            MiniYARNCluster.getHostname() + ":0");
        getConfig().set(YarnConfiguration.NM_LOCALIZER_ADDRESS,
            MiniYARNCluster.getHostname() + ":0");
        getConfig().set(YarnConfiguration.NM_WEBAPP_ADDRESS,
            MiniYARNCluster.getHostname() + ":0");

        // Disable resource checks by default
        if (!getConfig().getBoolean(
            YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
            YarnConfiguration.DEFAULT_YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING)) {
          getConfig().setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
          getConfig().setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
        }

        LOG.info("Starting NM: " + index);
        nodeManagers[index].init(getConfig());
        new Thread() {
          public void run() {
            nodeManagers[index].start();
          }
        }.start();
        int waitCount = 0;
        while (nodeManagers[index].getServiceState() == STATE.INITED
            && waitCount++ < 60) {
          LOG.info("Waiting for NM " + index + " to start...");
          Thread.sleep(1000);
        }
        if (nodeManagers[index].getServiceState() != STATE.STARTED) {
          // NM could have failed.
          throw new IOException("NodeManager " + index + " failed to start");
        }
        super.serviceStart();
      } catch (Throwable t) {
        throw new YarnRuntimeException(t);
      }
    }

    @Override
    public synchronized void serviceStop() throws Exception {
      if (nodeManagers[index] != null) {
        nodeManagers[index].stop();
      }
      super.serviceStop();
    }
  }

  private class CustomNodeManager extends NodeManager {
    @Override
    protected void doSecureLogin() throws IOException {
      // Don't try to login using keytab in the testcase.
    }

    @Override
    protected NodeStatusUpdater createNodeStatusUpdater(Context context,
        Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
      return new NodeStatusUpdaterImpl(context, dispatcher,
          healthChecker, metrics) {
        @Override
        protected ResourceTracker getRMClient() {
          final ResourceTrackerService rt = resourceManager
              .getResourceTrackerService();
          final RecordFactory recordFactory =
              RecordFactoryProvider.getRecordFactory(null);

          // For in-process communication without RPC
          return new ResourceTracker() {

            @Override
            public NodeHeartbeatResponse nodeHeartbeat(
                NodeHeartbeatRequest request) throws YarnException,
                IOException {
              NodeHeartbeatResponse response = recordFactory.newRecordInstance(
                  NodeHeartbeatResponse.class);
              try {
                response = rt.nodeHeartbeat(request);
              } catch (YarnException e) {
                LOG.info("Exception in heartbeat from node " +
                    request.getNodeStatus().getNodeId(), e);
                throw e;
              }
              return response;
            }

            @Override
            public RegisterNodeManagerResponse registerNodeManager(
                RegisterNodeManagerRequest request)
                throws YarnException, IOException {
              RegisterNodeManagerResponse response = recordFactory.
                  newRecordInstance(RegisterNodeManagerResponse.class);
              try {
                response = rt.registerNodeManager(request);
              } catch (YarnException e) {
                LOG.info("Exception in node registration from "
                    + request.getNodeId().toString(), e);
                throw e;
              }
              return response;
            }
          };
        }

        @Override
        protected void stopRMProxy() {
          return;
        }
      };
    }
  }
}
@ -1,6 +0,0 @@
Apache Solr Morphlines-Cell

*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.

This contrib provides a variety of Kite Morphlines features for Solr Cell-style content extraction.
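
For orientation, the sketch below shows how the solrCell command provided here typically sits in a morphline between MIME type detection and a loadSolr sink. It is a minimal, hypothetical example: the SOLR_LOCATOR substitution and the TXTParser choice are illustrative placeholders, and the detectMimeType and loadSolr commands come from the Kite Tika and morphlines-core command libraries rather than from this contrib.

    morphlines : [
      {
        id : morphline1
        importCommands : ["org.kitesdk.**", "org.apache.solr.**"]
        commands : [
          { detectMimeType {} }
          {
            solrCell {
              solrLocator : ${SOLR_LOCATOR}
              parsers : [
                { parser : org.apache.tika.parser.txt.TXTParser }
              ]
            }
          }
          { loadSolr { solrLocator : ${SOLR_LOCATOR} } }
        ]
      }
    ]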
@ -1,144 +0,0 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project name="solr-morphlines-cell" default="default">

  <description>
    Solr Cell Morphline commands.
  </description>

  <import file="../contrib-build.xml"/>

  <solr-contrib-uptodate name="extraction"
                         property="solr-extraction.uptodate"
                         classpath.property="solr-cell.jar"/>

  <target name="compile-solr-extraction" unless="solr-extraction.uptodate">
    <ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <solr-contrib-uptodate name="morphlines-core"
                         property="solr-morphlines-core.uptodate"/>

  <target name="compile-morphlines-core" unless="solr-morphlines-core.uptodate">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-test" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <solr-contrib-uptodate name="map-reduce"
                         property="solr-map-reduce.uptodate"
                         classpath.property="MapReduceIndexerTool.jar"/>

  <target name="compile-map-reduce" unless="solr-map-reduce.uptodate">
    <ant dir="${common-solr.dir}/contrib/map-reduce" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-extraction-libs">
    <ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-morphlines-core-libs">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-map-reduce-libs">
    <ant dir="${common-solr.dir}/contrib/map-reduce" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <path id="classpath.additions">
    <pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/morphlines-core/lib" excludes="${common.classpath.excludes}"/>
    <!-- <pathelement location="${common-solr.dir}/build/contrib/solr-map-reduce/classes/java"/> -->
    <!-- <fileset dir="${common-solr.dir}/contrib/map-reduce/lib" excludes="${common.classpath.excludes}"/> -->
  </path>

  <path id="classpath">
    <path refid="solr.base.classpath"/>
    <path refid="classpath.additions"/>
  </path>

  <path id="test.classpath">
    <path refid="solr.test.base.classpath"/>
    <path refid="classpath.additions"/>
    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
    <pathelement location="${common-solr.dir}/contrib/morphlines-core/src/test-files"/>
    <fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
  </path>

  <path id="javadoc.classpath">
    <path refid="junit-path"/>
    <path refid="classpath"/>
    <pathelement location="${ant.home}/lib/ant.jar"/>
    <fileset dir=".">
      <exclude name="build/**/*.jar"/>
      <include name="**/lib/*.jar"/>
    </fileset>
  </path>

  <!-- TODO: make this nicer like lucene? -->
  <target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,javadocs-morphlines-core,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
    <sequential>
      <mkdir dir="${javadoc.dir}/${name}"/>
      <solr-invoke-javadoc>
        <solrsources>
          <packageset dir="${src.dir}"/>
        </solrsources>
        <links>
          <link href="../solr-solrj"/>
          <link href="../solr-core"/>
          <link href="../solr-cell"/>
          <link href="../solr-morphlines-core"/>
        </links>
      </solr-invoke-javadoc>
      <solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
    </sequential>
  </target>

  <target name="javadocs-extraction">
    <ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="javadocs-morphlines-core">
    <ant dir="${common-solr.dir}/contrib/morphlines-core" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="compile-core" depends="resolve-extraction-libs, resolve-morphlines-core-libs, resolve-map-reduce-libs, compile-solr-extraction, compile-morphlines-core, solr-contrib-build.compile-core"/>
  <target name="dist" depends="common-solr.dist"/>

</project>
@ -1,35 +0,0 @@
<!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
-->
<ivy-module version="2.0">
  <info organisation="org.apache.solr" module="morphlines-cell" />
  <configurations defaultconfmapping="compile->master;test->master">
    <conf name="compile" transitive="false" />
    <conf name="test" transitive="false" />
  </configurations>

  <dependencies>
    <dependency org="org.kitesdk" name="kite-morphlines-tika-core" rev="${/org.kitesdk/kite-morphlines-tika-core}" conf="compile" />
    <dependency org="org.kitesdk" name="kite-morphlines-tika-decompress" rev="${/org.kitesdk/kite-morphlines-tika-decompress}" conf="compile" />
    <dependency org="org.kitesdk" name="kite-morphlines-json" rev="${/org.kitesdk/kite-morphlines-json}" conf="compile" />
    <dependency org="org.kitesdk" name="kite-morphlines-twitter" rev="${/org.kitesdk/kite-morphlines-twitter}" conf="compile" />

    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />

  </dependencies>
</ivy-module>
@ -1,348 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.cell;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.IllformedLocaleException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.stream.Collectors;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.stdio.AbstractParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Command that pipes the first attachment of a record into one of the given Tika parsers, then maps
 * the Tika output back to a record using SolrCell.
 * <p>
 * The Tika parser is chosen from the configurable list of parsers, depending on the MIME type
 * specified in the input record. Typically, this requires an upstream DetectMimeTypeBuilder
 * in a prior command.
 */
public final class SolrCellBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("solrCell");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new SolrCell(this, config, parent, child, context);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class SolrCell extends AbstractParser {

    private final IndexSchema schema;
    private final List<String> dateFormats;
    private final String xpathExpr;
    private final List<Parser> parsers = new ArrayList<>();
    private final SolrContentHandlerFactory solrContentHandlerFactory;
    private final Locale locale;

    private final SolrParams solrParams;
    private final Map<MediaType, Parser> mediaTypeToParserMap;

    private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML);

    public static final String ADDITIONAL_SUPPORTED_MIME_TYPES = "additionalSupportedMimeTypes";

    public SolrCell(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);

      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      this.schema = Objects.requireNonNull(locator.getIndexSchema());
      if (LOG.isTraceEnabled()) {
        LOG.trace("Solr schema: \n" + schema.getFields().entrySet().stream()
            .sorted(Map.Entry.comparingByKey()).map(Map.Entry::getValue).map(Object::toString)
            .collect(Collectors.joining("\n")));
      }

      ListMultimap<String, String> cellParams = ArrayListMultimap.create();
      String uprefix = getConfigs().getString(config, ExtractingParams.UNKNOWN_FIELD_PREFIX, null);
      if (uprefix != null) {
        cellParams.put(ExtractingParams.UNKNOWN_FIELD_PREFIX, uprefix);
      }
      for (String capture : getConfigs().getStringList(config, ExtractingParams.CAPTURE_ELEMENTS, Collections.<String>emptyList())) {
        cellParams.put(ExtractingParams.CAPTURE_ELEMENTS, capture);
      }
      Config fmapConfig = getConfigs().getConfig(config, "fmap", null);
      if (fmapConfig != null) {
        for (Map.Entry<String, Object> entry : new Configs().getEntrySet(fmapConfig)) {
          cellParams.put(ExtractingParams.MAP_PREFIX + entry.getKey(), entry.getValue().toString());
        }
      }
      String captureAttributes = getConfigs().getString(config, ExtractingParams.CAPTURE_ATTRIBUTES, null);
      if (captureAttributes != null) {
        cellParams.put(ExtractingParams.CAPTURE_ATTRIBUTES, captureAttributes);
      }
      String lowerNames = getConfigs().getString(config, ExtractingParams.LOWERNAMES, null);
      if (lowerNames != null) {
        cellParams.put(ExtractingParams.LOWERNAMES, lowerNames);
      }
      String defaultField = getConfigs().getString(config, ExtractingParams.DEFAULT_FIELD, null);
      if (defaultField != null) {
        cellParams.put(ExtractingParams.DEFAULT_FIELD, defaultField);
      }
      xpathExpr = getConfigs().getString(config, ExtractingParams.XPATH_EXPRESSION, null);
      if (xpathExpr != null) {
        cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
      }

      this.dateFormats = getConfigs().getStringList(config, "dateFormats", new ArrayList<>(ExtractionDateUtil.DEFAULT_DATE_FORMATS));

      String handlerStr = getConfigs().getString(config, "solrContentHandlerFactory", TrimSolrContentHandlerFactory.class.getName());
      Class<? extends SolrContentHandlerFactory> factoryClass;
      try {
        factoryClass = Class.forName(handlerStr).asSubclass(SolrContentHandlerFactory.class);
      } catch (ClassNotFoundException cnfe) {
        throw new MorphlineCompilationException("Could not find class "
            + handlerStr + " to use for " + "solrContentHandlerFactory", config, cnfe);
      }
      this.solrContentHandlerFactory = getSolrContentHandlerFactory(factoryClass, dateFormats, config);

      this.locale = getLocale(getConfigs().getString(config, "locale", null));

      this.mediaTypeToParserMap = new HashMap<>();
      //MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); // FIXME getMediaTypeRegistry.normalize()

      List<? extends Config> parserConfigs = getConfigs().getConfigList(config, "parsers");
      for (Config parserConfig : parserConfigs) {
        String parserClassName = getConfigs().getString(parserConfig, "parser");

        Object obj;
        try {
          obj = Class.forName(parserClassName).newInstance();
        } catch (Throwable e) {
          throw new MorphlineCompilationException("Cannot instantiate Tika parser: " + parserClassName, config, e);
        }
        if (!(obj instanceof Parser)) {
          throw new MorphlineCompilationException("Tika parser " + obj.getClass().getName()
              + " must be an instance of class " + Parser.class.getName(), config);
        }
        Parser parser = (Parser) obj;
        this.parsers.add(parser);

        List<String> mediaTypes = getConfigs().getStringList(parserConfig, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
        for (String mediaTypeStr : mediaTypes) {
          MediaType mediaType = parseMediaType(mediaTypeStr);
          addSupportedMimeType(mediaTypeStr);
          this.mediaTypeToParserMap.put(mediaType, parser);
        }

        if (!parserConfig.hasPath(SUPPORTED_MIME_TYPES)) {
          for (MediaType mediaType : parser.getSupportedTypes(new ParseContext())) {
            mediaType = mediaType.getBaseType();
            addSupportedMimeType(mediaType.toString());
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
          List<String> extras = getConfigs().getStringList(parserConfig, ADDITIONAL_SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
          for (String mediaTypeStr : extras) {
            MediaType mediaType = parseMediaType(mediaTypeStr);
            addSupportedMimeType(mediaTypeStr);
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
        }
      }
      //LOG.info("mediaTypeToParserMap="+mediaTypeToParserMap);

      Map<String, String[]> tmp = new HashMap<>();
      for (Map.Entry<String,Collection<String>> entry : cellParams.asMap().entrySet()) {
        tmp.put(entry.getKey(), entry.getValue().toArray(new String[entry.getValue().size()]));
      }
      this.solrParams = new MultiMapSolrParams(tmp);
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      Parser parser = detectParser(record);
      if (parser == null) {
        return false;
      }

      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException | TikaException | SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }

      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }

    private Parser detectParser(Record record) {
      if (!hasAtLeastOneMimeType(record)) {
        return null;
      }
      String mediaTypeStr = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); // ExtractingParams.STREAM_TYPE
      assert mediaTypeStr != null;

      MediaType mediaType = parseMediaType(mediaTypeStr).getBaseType();
      Parser parser = mediaTypeToParserMap.get(mediaType); // fast path
      if (parser != null) {
        return parser;
      }
      // wildcard matching
      for (Map.Entry<MediaType, Parser> entry : mediaTypeToParserMap.entrySet()) {
        if (isMediaTypeMatch(mediaType, entry.getKey())) {
          return entry.getValue();
        }
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mediaTypeStr);
      }
      return null;
    }

    private boolean hasAtLeastOneMimeType(Record record) {
      if (!record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
        LOG.debug("Command failed because of missing MIME type for record: {}", record);
        return false;
      }
      return true;
    }

    private MediaType parseMediaType(String mediaTypeStr) {
      MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
      return mediaType.getBaseType();
    }

    /** Returns true if mediaType falls within the given range (pattern), false otherwise */
    private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
      String WILDCARD = "*";
      String rangePatternType = rangePattern.getType();
      String rangePatternSubtype = rangePattern.getSubtype();
      return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
          && (rangePatternSubtype.equals(WILDCARD) || rangePatternSubtype.equals(mediaType.getSubtype()));
    }

    private static SolrContentHandlerFactory getSolrContentHandlerFactory(
        Class<? extends SolrContentHandlerFactory> factoryClass, Collection<String> dateFormats, Config config) {
      try {
        return factoryClass.getConstructor(Collection.class).newInstance(dateFormats);
      } catch (NoSuchMethodException nsme) {
        throw new MorphlineCompilationException("Unable to find valid constructor of type "
            + factoryClass.getName() + " for creating SolrContentHandler", config, nsme);
      } catch (Exception e) {
        throw new MorphlineCompilationException("Unexpected exception when trying to create SolrContentHandlerFactory of type "
            + factoryClass.getName(), config, e);
      }
    }

    private Record toRecord(SolrInputDocument doc) {
      Record record = new Record();
      for (Entry<String, SolrInputField> entry : doc.entrySet()) {
        record.getFields().putAll(entry.getKey(), entry.getValue().getValues());
      }
      return record;
    }

    @SuppressForbidden(reason = "Usage of outdated locale parsing with Locale#toString() because of backwards compatibility")
    private Locale getLocale(String name) {
      if (name == null) {
        return Locale.ROOT;
      }
      for (Locale locale : Locale.getAvailableLocales()) {
        if (locale.toString().equals(name)) {
          return locale;
        }
      }
      try {
        return new Locale.Builder().setLanguageTag(name).build();
      } catch (IllformedLocaleException ex) {
        throw new MorphlineCompilationException("Malformed / non-existent locale: " + name, getConfig(), ex);
      }
    }
  }

}
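
As a reading aid, the hypothetical sketch below assembles the configuration surface the constructor above actually reads; all concrete values are illustrative only, and the supportedMimeTypes key is handled by the AbstractParser base class rather than shown in this file:

    solrCell {
      solrLocator : ${SOLR_LOCATOR}
      # optional SolrCell passthrough params also read above:
      # uprefix, capture, captureAttr, lowernames, defaultField, xpath
      fmap : { content : text }
      dateFormats : [ "yyyy-MM-dd" ]
      # defaults to TrimSolrContentHandlerFactory when omitted
      solrContentHandlerFactory : org.apache.solr.morphlines.cell.TrimSolrContentHandlerFactory
      parsers : [
        # each entry names a Tika parser class; supportedMimeTypes or
        # additionalSupportedMimeTypes can override parser.getSupportedTypes()
        { parser : org.apache.tika.parser.html.HtmlParser }
      ]
    }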
@ -1,81 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.cell;

import java.util.Collection;

import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.tika.metadata.Metadata;

/**
 * {@link SolrContentHandler} and associated factory that strips non-characters and trims on output.
 * This prevents exceptions on parsing integer fields inside Solr server.
 */
public class StripNonCharSolrContentHandlerFactory extends SolrContentHandlerFactory {

  public StripNonCharSolrContentHandlerFactory(Collection<String> dateFormats) {
    super(dateFormats);
  }

  @Override
  public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
    return new StripNonCharSolrContentHandler(metadata, params, schema, dateFormats);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class StripNonCharSolrContentHandler extends SolrContentHandler {

    public StripNonCharSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) {
      super(metadata, params, schema, dateFormats);
    }

    /**
     * Strip all non-characters, which can cause SolrReducer problems if present.
     * This is borrowed from Apache Nutch.
     */
    private static String stripNonCharCodepoints(String input) {
      StringBuilder stripped = new StringBuilder(input.length());
      char ch;
      for (int i = 0; i < input.length(); i++) {
        ch = input.charAt(i);
        // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
        // and non-printable control characters except tabulator, new line and carriage return
        if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
            ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
            (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
            (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
          stripped.append(ch);
        }
      }
      return stripped.toString();
    }

    @Override
    protected String transformValue(String val, SchemaField schemaField) {
      String ret = super.transformValue(val, schemaField).trim();
      ret = stripNonCharCodepoints(ret);
      return ret;
    }
  }
}
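
A morphline opts into this stripping behavior by naming the factory in the solrCell command shown earlier; a minimal hypothetical snippet, with SOLR_LOCATOR and the parser choice as placeholders:

    solrCell {
      solrLocator : ${SOLR_LOCATOR}
      solrContentHandlerFactory : org.apache.solr.morphlines.cell.StripNonCharSolrContentHandlerFactory
      parsers : [ { parser : org.apache.tika.parser.txt.TXTParser } ]
    }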
@ -1,58 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.cell;

import java.util.Collection;

import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.tika.metadata.Metadata;

/**
 * {@link SolrContentHandler} and associated factory that trims field values on output.
 * This prevents exceptions on parsing integer fields inside Solr server.
 */
public class TrimSolrContentHandlerFactory extends SolrContentHandlerFactory {

  public TrimSolrContentHandlerFactory(Collection<String> dateFormats) {
    super(dateFormats);
  }

  @Override
  public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
    return new TrimSolrContentHandler(metadata, params, schema, dateFormats);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class TrimSolrContentHandler extends SolrContentHandler {

    public TrimSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) {
      super(metadata, params, schema, dateFormats);
    }

    @Override
    protected String transformValue(String val, SchemaField schemaField) {
      return super.transformValue(val, schemaField).trim();
    }
  }
}
@ -1,25 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Morphlines Solr Cell related code.
 */
package org.apache.solr.morphlines.cell;
@ -1,21 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<body>
Apache Solr Search Server: Solr Cell Morphline Commands
</body>
</html>
@ -1 +0,0 @@
The test-files used by this module are located in the morphlines-core module.
@ -1,292 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.cell;

import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {

  private Map<String,Integer> expectedRecords = new HashMap<>();
  private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<>();

  @BeforeClass
  public static void beforeClass2() {
    assumeFalse("FIXME: Morphlines currently has issues with Windows paths", Constants.WINDOWS);
  }

  @Before
  public void setUp() throws Exception {
    super.setUp();

    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    expectedRecords.put(path + "sample-statuses-20120906-141433.avro", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.gz", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.bz2", 2);
    expectedRecords.put(path + "cars.csv", 6);
    expectedRecords.put(path + "cars.csv.gz", 6);
    expectedRecords.put(path + "cars.tar.gz", 4);
    expectedRecords.put(path + "cars.tsv", 6);
    expectedRecords.put(path + "cars.ssv", 6);
    expectedRecords.put(path + "test-documents.7z", 9);
    expectedRecords.put(path + "test-documents.cpio", 9);
    expectedRecords.put(path + "test-documents.tar", 9);
    expectedRecords.put(path + "test-documents.tbz2", 9);
    expectedRecords.put(path + "test-documents.tgz", 9);
    expectedRecords.put(path + "test-documents.zip", 9);
    expectedRecords.put(path + "multiline-stacktrace.log", 4);

    {
      Map<String, Object> record = new LinkedHashMap<>();
      record.put("ignored__attachment_mimetype", "image/jpeg");
      record.put("ignored_exif_isospeedratings", "400");
      record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
      record.put("ignored_tiff_model", "Canon EOS 40D");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "testWORD_various.doc";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/msword");
|
||||
record.put("ignored_author", "Michael McCandless");
|
||||
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
|
||||
record.put("ignored_title", "");
|
||||
record.put("ignored_keywords", "Keyword1 Keyword2");
|
||||
record.put("ignored_subject", "Subject is here");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "testPDF.pdf";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/pdf");
|
||||
record.put("ignored_author", "Bertrand Delacrétaz");
|
||||
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
|
||||
record.put("ignored_title", "Apache Tika - Apache Tika");
|
||||
record.put("ignored_xmp_creatortool", "Firefox");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "email.eml";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
String name = "Patrick Foo <foo@cloudera.com>";
|
||||
record.put("ignored__attachment_mimetype", "message/rfc822");
|
||||
record.put("ignored_author", name);
|
||||
//record.put("ignored_content_length", "1068");
|
||||
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_message_from", name);
|
||||
record.put("ignored_message_to", name);
|
||||
record.put("ignored_creator", name);
|
||||
record.put("ignored_dc_creator", name);
|
||||
record.put("ignored_dc_title", "Test EML");
|
||||
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_meta_author", name);
|
||||
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_subject", "Test EML");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "testEXCEL.xlsx";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
|
||||
record.put("ignored_author", "Keith Bennett");
|
||||
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
|
||||
record.put("ignored_title", "Simple Excel document");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
|
||||
public void testSolrCellJPGCompressed() throws Exception {
|
||||
morphline = createMorphline("test-morphlines" + File.separator + "solrCellJPGCompressed");
|
||||
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
|
||||
String[] files = new String[] {
|
||||
path + "testJPEG_EXIF.jpg",
|
||||
path + "testJPEG_EXIF.jpg.gz",
|
||||
path + "testJPEG_EXIF.jpg.tar.gz",
|
||||
//path + "jpeg2000.jp2",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSolrCellXML() throws Exception {
|
||||
morphline = createMorphline("test-morphlines" + File.separator + "solrCellXML");
|
||||
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
|
||||
String[] files = new String[] {
|
||||
path + "testXML2.xml",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
|
||||
public void testSolrCellDocumentTypes() throws Exception {
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
|
||||
|
||||
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
|
||||
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
|
||||
String[] files = new String[] {
|
||||
path + "testBMPfp.txt",
|
||||
path + "boilerplate.html",
|
||||
path + "NullHeader.docx",
|
||||
path + "testWORD_various.doc",
|
||||
path + "testPDF.pdf",
|
||||
path + "testJPEG_EXIF.jpg",
|
||||
path + "testJPEG_EXIF.jpg.gz",
|
||||
path + "testJPEG_EXIF.jpg.tar.gz",
|
||||
path + "testXML.xml",
|
||||
path + "cars.csv",
|
||||
// path + "cars.tsv",
|
||||
// path + "cars.ssv",
|
||||
path + "cars.csv.gz",
|
||||
path + "cars.tar.gz",
|
||||
path + "sample-statuses-20120906-141433.avro",
|
||||
path + "sample-statuses-20120906-141433",
|
||||
path + "sample-statuses-20120906-141433.gz",
|
||||
path + "sample-statuses-20120906-141433.bz2",
|
||||
path + "email.eml",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
|
||||
public void testSolrCellDocumentTypes2() throws Exception {
|
||||
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
|
||||
|
||||
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
|
||||
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
|
||||
String[] files = new String[] {
|
||||
path + "testPPT_various.ppt",
|
||||
path + "testPPT_various.pptx",
|
||||
path + "testEXCEL.xlsx",
|
||||
path + "testEXCEL.xls",
|
||||
path + "testPages.pages",
|
||||
//path + "testNumbers.numbers",
|
||||
//path + "testKeynote.key",
|
||||
|
||||
path + "testRTFVarious.rtf",
|
||||
path + "complex.mbox",
|
||||
path + "test-outlook.msg",
|
||||
path + "testEMLX.emlx",
|
||||
path + "testRFC822",
|
||||
path + "rsstest.rss",
|
||||
// path + "testDITA.dita",
|
||||
|
||||
path + "testMP3i18n.mp3",
|
||||
path + "testAIFF.aif",
|
||||
path + "testFLAC.flac",
|
||||
// path + "testFLAC.oga",
|
||||
// path + "testVORBIS.ogg",
|
||||
path + "testMP4.m4a",
|
||||
path + "testWAV.wav",
|
||||
// path + "testWMA.wma",
|
||||
|
||||
path + "testFLV.flv",
|
||||
// path + "testWMV.wmv",
|
||||
|
||||
path + "testBMP.bmp",
|
||||
path + "testPNG.png",
|
||||
path + "testPSD.psd",
|
||||
path + "testSVG.svg",
|
||||
path + "testTIFF.tif",
|
||||
|
||||
// path + "test-documents.7z",
|
||||
// path + "test-documents.cpio",
|
||||
// path + "test-documents.tar",
|
||||
// path + "test-documents.tbz2",
|
||||
// path + "test-documents.tgz",
|
||||
// path + "test-documents.zip",
|
||||
// path + "test-zip-of-zip.zip",
|
||||
// path + "testJAR.jar",
|
||||
|
||||
// path + "testKML.kml",
|
||||
// path + "testRDF.rdf",
|
||||
path + "testVISIO.vsd",
|
||||
// path + "testWAR.war",
|
||||
// path + "testWindows-x86-32.exe",
|
||||
// path + "testWINMAIL.dat",
|
||||
// path + "testWMF.wmf",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that the ContentHandler properly strips the illegal characters
|
||||
*/
|
||||
@Test
|
||||
public void testTransformValue() {
|
||||
String fieldName = "user_name";
|
||||
assertFalse("foobar".equals(getFoobarWithNonChars()));
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
// load illegal char string into a metadata field and generate a new document,
|
||||
// which will cause the ContentHandler to be invoked.
|
||||
metadata.set(fieldName, getFoobarWithNonChars());
|
||||
StripNonCharSolrContentHandlerFactory contentHandlerFactory =
|
||||
new StripNonCharSolrContentHandlerFactory(ExtractionDateUtil.DEFAULT_DATE_FORMATS);
|
||||
IndexSchema schema = h.getCore().getLatestSchema();
|
||||
SolrContentHandler contentHandler =
|
||||
contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
|
||||
SolrInputDocument doc = contentHandler.newDocument();
|
||||
String foobar = doc.getFieldValue(fieldName).toString();
|
||||
assertTrue("foobar".equals(foobar));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns string "foobar" with illegal characters interspersed.
|
||||
*/
|
||||
private String getFoobarWithNonChars() {
|
||||
char illegalChar = '\uffff';
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
|
||||
.append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
}

@@ -1,6 +0,0 @@
Apache Solr Morphlines-Core

*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.

This contrib provides a variety of Kite Morphlines features for Solr.

@@ -1,105 +0,0 @@
<?xml version="1.0"?>

<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<project name="solr-morphlines-core" default="default" xmlns:ivy="antlib:org.apache.ivy.ant">

  <description>
    Solr Morphlines commands.
  </description>

  <import file="../contrib-build.xml"/>

  <solr-contrib-uptodate name="extraction"
                         property="solr-extraction.uptodate"
                         classpath.property="solr-cell.jar"/>

  <target name="compile-solr-extraction" unless="solr-extraction.uptodate">
    <ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve-extraction-libs">
    <ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <path id="classpath.additions">
    <pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
    <fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
  </path>

  <path id="classpath">
    <path refid="solr.base.classpath"/>
    <path refid="classpath.additions"/>
  </path>

  <path id="test.classpath">
    <path refid="solr.test.base.classpath"/>
    <path refid="classpath.additions"/>
    <fileset dir="${test.lib.dir}" includes="*.jar"/>
  </path>

  <path id="javadoc.classpath">
    <path refid="junit-path"/>
    <path refid="classpath"/>
    <pathelement location="${ant.home}/lib/ant.jar"/>
    <fileset dir=".">
      <exclude name="build/**/*.jar"/>
      <include name="**/lib/*.jar"/>
    </fileset>
  </path>

  <!-- TODO: make this nicer like lucene? -->
  <target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
    <sequential>
      <mkdir dir="${javadoc.dir}/${name}"/>
      <solr-invoke-javadoc>
        <solrsources>
          <packageset dir="${src.dir}"/>
        </solrsources>
        <links>
          <link href="../solr-solrj"/>
          <link href="../solr-core"/>
          <link href="../solr-cell"/>
        </links>
      </solr-invoke-javadoc>
      <solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
    </sequential>
  </target>

  <target name="javadocs-extraction">
    <ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
      <propertyset refid="uptodate.and.compiled.properties"/>
    </ant>
  </target>

  <target name="resolve" depends="ivy-availability-check,ivy-fail,ivy-configure">
    <sequential>
      <ivy:retrieve conf="compile" type="jar,bundle" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"/>
      <ivy:retrieve conf="test,test.DfsMiniCluster" type="jar,bundle,test" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"
                    pattern="${test.lib.dir}/[artifact]-[revision](-[classifier]).[ext]"/>
    </sequential>
  </target>

  <target name="compile-core" depends="resolve-extraction-libs, compile-solr-extraction, solr-contrib-build.compile-core"/>
  <target name="dist" depends="common-solr.dist"/>

</project>

@@ -1,128 +0,0 @@
<!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements. See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership. The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied. See the License for the
   specific language governing permissions and limitations
   under the License.
-->
<ivy-module version="2.0" xmlns:maven="http://ant.apache.org/ivy/maven">
  <info organisation="org.apache.solr" module="morphlines-core" />
  <configurations defaultconfmapping="compile->master;test->master;test.DfsMiniCluster->master">
    <!-- artifacts in the "compile" configuration will go into morphlines-core/lib/ -->
    <conf name="compile" transitive="false" />
    <!-- artifacts in the "test" and "test.DfsMiniCluster" configuration will go into morphlines-core/test-lib/ -->
    <conf name="test" transitive="false" />
    <conf name="test.DfsMiniCluster" transitive="false" />
  </configurations>

  <dependencies>
    <dependency org="org.kitesdk" name="kite-morphlines-core" rev="${/org.kitesdk/kite-morphlines-core}" conf="compile;test">
      <artifact name="kite-morphlines-core" ext="jar" />
      <artifact name="kite-morphlines-core" type="test" ext="jar" maven:classifier="tests" />
    </dependency>

    <dependency org="org.kitesdk" name="kite-morphlines-avro" rev="${/org.kitesdk/kite-morphlines-avro}" conf="compile" />

    <dependency org="io.dropwizard.metrics" name="metrics-core" rev="${/io.dropwizard.metrics/metrics-core}" conf="compile" />
    <dependency org="io.dropwizard.metrics" name="metrics-healthchecks" rev="${/io.dropwizard.metrics/metrics-healthchecks}" conf="compile" />
    <dependency org="com.typesafe" name="config" rev="${/com.typesafe/config}" conf="compile" />

    <!-- Test Dependencies -->

    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="${/org.apache.hadoop/hadoop-mapreduce-client-core}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-common" rev="${/org.apache.hadoop/hadoop-yarn-common}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-api" rev="${/org.apache.hadoop/hadoop-yarn-api}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-client" rev="${/org.apache.hadoop/hadoop-yarn-client}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-tests" rev="${/org.apache.hadoop/hadoop-yarn-server-tests}" conf="test">
      <artifact name="hadoop-yarn-server-tests" type="test" ext="jar" maven:classifier="tests" />
    </dependency>
    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-common" rev="${/org.apache.hadoop/hadoop-yarn-server-common}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-nodemanager" rev="${/org.apache.hadoop/hadoop-yarn-server-nodemanager}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-app" rev="${/org.apache.hadoop/hadoop-mapreduce-client-app}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-common" rev="${/org.apache.hadoop/hadoop-mapreduce-client-common}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-hs" rev="${/org.apache.hadoop/hadoop-mapreduce-client-hs}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-resourcemanager" rev="${/org.apache.hadoop/hadoop-yarn-server-resourcemanager}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-shuffle" rev="${/org.apache.hadoop/hadoop-mapreduce-client-shuffle}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-web-proxy" rev="${/org.apache.hadoop/hadoop-yarn-server-web-proxy}" conf="test" />
    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="${/org.apache.hadoop/hadoop-mapreduce-client-jobclient}" conf="test">
      <artifact name="hadoop-mapreduce-client-jobclient" type="jar" ext="jar" />
      <artifact name="hadoop-mapreduce-client-jobclient" type="test" ext="jar" maven:classifier="tests" />
    </dependency>

    <dependency org="org.apache.hadoop" name="hadoop-yarn-server-applicationhistoryservice" rev="${/org.apache.hadoop/hadoop-yarn-server-applicationhistoryservice}" conf="test"/>
    <dependency org="org.fusesource.leveldbjni" name="leveldbjni" rev="${/org.fusesource.leveldbjni/leveldbjni}" conf="test"/>
    <dependency org="org.iq80.leveldb" name="leveldb" rev="${/org.iq80.leveldb/leveldb}" conf="test.DfsMiniCluster"/>
    <dependency org="org.iq80.leveldb" name="leveldb-api" rev="${/org.iq80.leveldb/leveldb-api}" conf="test.DfsMiniCluster"/>
    <dependency org="org.apache.curator" name="curator-framework" rev="${/org.apache.curator/curator-framework}" conf="test"/>
    <dependency org="org.apache.curator" name="curator-client" rev="${/org.apache.curator/curator-client}" conf="test"/>

    <dependency org="aopalliance" name="aopalliance" rev="${/aopalliance/aopalliance}" conf="test" />
    <dependency org="com.sun.xml.bind" name="jaxb-impl" rev="${/com.sun.xml.bind/jaxb-impl}" conf="test" />
    <dependency org="io.netty" name="netty-all" rev="${/io.netty/netty-all}" conf="test" />
    <dependency org="org.apache.mrunit" name="mrunit" rev="${/org.apache.mrunit/mrunit}" conf="test">
      <artifact name="mrunit" maven:classifier="hadoop2" />
      <exclude org="log4j" module="log4j" />
    </dependency>

    <!-- Mocking -->
    <dependency org="org.mockito" name="mockito-core" rev="${/org.mockito/mockito-core}" conf="test"/>
    <dependency org="net.bytebuddy" name="byte-buddy" rev="${/net.bytebuddy/byte-buddy}" conf="test"/>
    <dependency org="org.objenesis" name="objenesis" rev="${/org.objenesis/objenesis}" conf="test"/>

    <dependency org="commons-collections" name="commons-collections" rev="${/commons-collections/commons-collections}" conf="test" />

    <!-- FasterXml Jackson Dependencies -->
    <dependency org="com.fasterxml.jackson.core" name="jackson-core" rev="${/com.fasterxml.jackson.core/jackson-core}" conf="test" />
    <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="${/com.fasterxml.jackson.core/jackson-databind}" conf="test" />
    <dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="${/com.fasterxml.jackson.core/jackson-annotations}" conf="test" />

    <!-- CodeHaus Jackson Dependencies -->
    <dependency org="org.codehaus.jackson" name="jackson-jaxrs" rev="${/org.codehaus.jackson/jackson-jaxrs}" conf="test" />
    <dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="${/org.codehaus.jackson/jackson-mapper-asl}" conf="test" />
    <dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="${/org.codehaus.jackson/jackson-core-asl}" conf="test" />

    <!-- Jersey Dependencies -->
    <dependency org="com.sun.jersey.contribs" name="jersey-guice" rev="${/com.sun.jersey.contribs/jersey-guice}" conf="test" />
    <dependency org="com.sun.jersey" name="jersey-core" rev="${/com.sun.jersey/jersey-core}" conf="test" />
    <dependency org="com.sun.jersey" name="jersey-json" rev="${/com.sun.jersey/jersey-json}" conf="test" />
    <dependency org="com.sun.jersey" name="jersey-server" rev="${/com.sun.jersey/jersey-server}" conf="test" />
    <dependency org="com.sun.jersey" name="jersey-bundle" rev="${/com.sun.jersey/jersey-bundle}" conf="test" />

    <!-- Guice Dependencies -->
    <dependency org="com.google.inject" name="guice" rev="${/com.google.inject/guice}" conf="test" />
    <dependency org="com.google.inject.extensions" name="guice-servlet" rev="${/com.google.inject.extensions/guice-servlet}" conf="test" />
    <dependency org="javax.inject" name="javax.inject" rev="${/javax.inject/javax.inject}" conf="test" />

    <!-- Avro Dependencies -->
    <dependency org="org.apache.avro" name="avro" rev="${/org.apache.avro/avro}" conf="test" />
    <dependency org="com.thoughtworks.paranamer" name="paranamer" rev="${/com.thoughtworks.paranamer/paranamer}" conf="test" />
    <dependency org="org.xerial.snappy" name="snappy-java" rev="${/org.xerial.snappy/snappy-java}" conf="test" />

    <!-- Hadoop DfsMiniCluster Dependencies -->
    <dependency org="org.apache.hadoop" name="hadoop-common" rev="${/org.apache.hadoop/hadoop-common}" conf="test.DfsMiniCluster">
      <artifact name="hadoop-common" type="jar" ext="jar" />
      <artifact name="hadoop-common" type="test" ext="jar" maven:classifier="tests" />
    </dependency>
    <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="${/org.apache.hadoop/hadoop-hdfs}" conf="test.DfsMiniCluster">
      <artifact name="hadoop-hdfs" type="test" ext="jar" maven:classifier="tests" />
    </dependency>
    <dependency org="org.mortbay.jetty" name="jetty" rev="${/org.mortbay.jetty/jetty}" conf="test.DfsMiniCluster" />
    <dependency org="org.mortbay.jetty" name="jetty-util" rev="${/org.mortbay.jetty/jetty-util}" conf="test.DfsMiniCluster" />
    <dependency org="com.sun.jersey" name="jersey-core" rev="${/com.sun.jersey/jersey-core}" conf="test.DfsMiniCluster" />
    <dependency org="org.apache.htrace" name="htrace-core" rev="${/org.apache.htrace/htrace-core}" conf="test.DfsMiniCluster"/>

    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />

  </dependencies>
</ivy-module>

@@ -1,73 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.io.IOException;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;

/**
 * A vehicle to load a list of Solr documents into some kind of destination,
 * such as a SolrServer or MapReduce RecordWriter.
 */
public interface DocumentLoader {

  /** Begins a transaction */
  public void beginTransaction() throws IOException, SolrServerException;

  /** Loads the given document into the destination */
  public void load(SolrInputDocument doc) throws IOException, SolrServerException;

  /**
   * Sends any outstanding documents to the destination and waits for a positive
   * or negative ack (i.e. exception). Depending on the outcome the caller
   * should then commit or rollback the current Flume transaction
   * correspondingly.
   *
   * @throws IOException
   *           If there is a low-level I/O error.
   */
  public void commitTransaction() throws IOException, SolrServerException;

  /**
   * Performs a rollback of all non-committed documents pending.
   * <p>
   * Note that this is not a true rollback as in databases. Content you have
   * previously added may have already been committed due to autoCommit, buffer
   * full, other client performing a commit etc. So this is only a best-effort
   * rollback.
   *
   * @throws IOException
   *           If there is a low-level I/O error.
   */
  public UpdateResponse rollbackTransaction() throws IOException, SolrServerException;

  /** Releases allocated resources */
  public void shutdown() throws IOException, SolrServerException;

  /**
   * Issues a ping request to check if the server is alive
   *
   * @throws IOException
   *           If there is a low-level I/O error.
   */
  public SolrPingResponse ping() throws IOException, SolrServerException;

}
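
Note: a sketch of the call sequence this interface implies, with begin/load and a commit-or-rollback decision; the helper method and its arguments are illustrative, not part of the removed code:

    static void index(DocumentLoader loader, Iterable<SolrInputDocument> docs)
        throws IOException, SolrServerException {
      loader.beginTransaction();
      try {
        for (SolrInputDocument doc : docs) {
          loader.load(doc); // may buffer; implementation-defined
        }
        loader.commitTransaction(); // waits for a positive or negative ack
      } catch (IOException | SolrServerException e) {
        loader.rollbackTransaction(); // best-effort only, per the javadoc above
        throw e;
      }
    }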

@@ -1,140 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;


class FileUtils {

  //-----------------------------------------------------------------------
  /**
   * Deletes a directory recursively.
   *
   * @param directory directory to delete
   * @throws IOException in case deletion is unsuccessful
   */
  public static void deleteDirectory(File directory) throws IOException {
    if (!directory.exists()) {
      return;
    }

    if (!isSymlink(directory)) {
      cleanDirectory(directory);
    }

    Files.delete(directory.toPath());
  }

  /**
   * Determines whether the specified file is a Symbolic Link rather than an actual file.
   * <p>
   * Will not return true if there is a Symbolic Link anywhere in the path,
   * only if the specific file is.
   *
   * @param file the file to check
   * @return true if the file is a Symbolic Link
   * @throws IOException if an IO error occurs while checking the file
   * @since Commons IO 2.0
   */
  public static boolean isSymlink(File file) throws IOException {
    if (file == null) {
      throw new NullPointerException("File must not be null");
    }
    // if (FilenameUtils.isSystemWindows()) {
    if (File.separatorChar == '\\') {
      return false;
    }
    File fileInCanonicalDir = null;
    if (file.getParent() == null) {
      fileInCanonicalDir = file;
    } else {
      File canonicalDir = file.getParentFile().getCanonicalFile();
      fileInCanonicalDir = new File(canonicalDir, file.getName());
    }

    if (fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile())) {
      return false;
    } else {
      return true;
    }
  }

  /**
   * Cleans a directory without deleting it.
   *
   * @param directory directory to clean
   * @throws IOException in case cleaning is unsuccessful
   */
  public static void cleanDirectory(File directory) throws IOException {
    if (!directory.exists()) {
      String message = directory + " does not exist";
      throw new IllegalArgumentException(message);
    }

    if (!directory.isDirectory()) {
      String message = directory + " is not a directory";
      throw new IllegalArgumentException(message);
    }

    File[] files = directory.listFiles();
    if (files == null) { // null if security restricted
      throw new IOException("Failed to list contents of " + directory);
    }

    IOException exception = null;
    for (File file : files) {
      try {
        forceDelete(file);
      } catch (IOException ioe) {
        exception = ioe;
      }
    }

    if (null != exception) {
      throw exception;
    }
  }

  //-----------------------------------------------------------------------
  /**
   * Deletes a file. If file is a directory, delete it and all sub-directories.
   * <p>
   * The difference between File.delete() and this method are:
   * <ul>
   * <li>A directory to be deleted does not have to be empty.</li>
   * <li>You get exceptions when a file or directory cannot be deleted.
   * (java.io.File methods returns a boolean)</li>
   * </ul>
   *
   * @param file file or directory to delete, must not be <code>null</code>
   * @throws NullPointerException if the directory is <code>null</code>
   * @throws FileNotFoundException if the file was not found
   * @throws IOException in case deletion is unsuccessful
   */
  public static void forceDelete(File file) throws IOException {
    if (file.isDirectory()) {
      deleteDirectory(file);
    } else {
      Files.delete(file.toPath());
    }
  }

}
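
Note: typical use of the helper above; the path is hypothetical:

    File dir = new File("/tmp/morphlines-scratch"); // illustrative path
    FileUtils.deleteDirectory(dir); // recursive; no-op if absent; does not follow symlinks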

@@ -1,143 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.security.SecureRandom;
import java.util.Arrays;
import java.util.Collection;
import java.util.Random;

import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications;

import com.typesafe.config.Config;

/**
 * A command that assigns a record unique key that is the concatenation of the given
 * <code>baseIdField</code> record field, followed by a running count of the record number within
 * the current session. The count is reset to zero whenever a "startSession" notification is
 * received.
 * <p>
 * For example, assume a CSV file containing multiple records but no unique ids, and the
 * <code>baseIdField</code> field is the filesystem path of the file. Now this command can be used
 * to assign the following record values to Solr's unique key field:
 * <code>$path#0, $path#1, ... $path#N</code>.
 * <p>
 * The name of the unique key field is fetched from Solr's schema.xml file, as directed by the
 * <code>solrLocator</code> configuration parameter.
 */
public final class GenerateSolrSequenceKeyBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Arrays.asList(
        "generateSolrSequenceKey",
        "sanitizeUniqueSolrKey" // old name (retained for backwards compatibility)
    );
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new GenerateSolrSequenceKey(this, config, parent, child, context);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class GenerateSolrSequenceKey extends AbstractCommand {

    private final boolean preserveExisting;
    private final String baseIdFieldName;
    private final String uniqueKeyName;
    private long recordCounter = 0;

    private final String idPrefix; // for load testing only; enables adding same document many times with a different unique key
    private final Random randomIdPrefix; // for load testing only; enables adding same document many times with a different unique key

    public GenerateSolrSequenceKey(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.baseIdFieldName = getConfigs().getString(config, "baseIdField", Fields.BASE_ID);
      this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);

      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      IndexSchema schema = locator.getIndexSchema();
      SchemaField uniqueKey = schema.getUniqueKeyField();
      uniqueKeyName = uniqueKey == null ? null : uniqueKey.getName();

      String tmpIdPrefix = getConfigs().getString(config, "idPrefix", null); // for load testing only
      Random tmpRandomIdPrefix = null;
      if ("random".equals(tmpIdPrefix)) { // for load testing only
        tmpRandomIdPrefix = new Random(new SecureRandom().nextLong());
        tmpIdPrefix = null;
      }
      idPrefix = tmpIdPrefix;
      randomIdPrefix = tmpRandomIdPrefix;
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record doc) {
      long num = recordCounter++;
      // LOG.debug("record #{} id before sanitizing doc: {}", num, doc);
      if (uniqueKeyName == null || (preserveExisting && doc.getFields().containsKey(uniqueKeyName))) {
        ; // we must preserve the existing id
      } else {
        Object baseId = doc.getFirstValue(baseIdFieldName);
        if (baseId == null) {
          throw new MorphlineRuntimeException("Record field " + baseIdFieldName
              + " must not be null as it is needed as a basis for a unique key for solr doc: " + doc);
        }
        doc.replaceValues(uniqueKeyName, baseId.toString() + "#" + num);
      }

      // for load testing only; enables adding same document many times with a different unique key
      if (idPrefix != null) {
        String id = doc.getFirstValue(uniqueKeyName).toString();
        id = idPrefix + id;
        doc.replaceValues(uniqueKeyName, id);
      } else if (randomIdPrefix != null) {
        String id = doc.getFirstValue(uniqueKeyName).toString();
        id = String.valueOf(Math.abs(randomIdPrefix.nextInt())) + "#" + id;
        doc.replaceValues(uniqueKeyName, id);
      }

      LOG.debug("record #{} unique key sanitized to this: {}", num, doc);

      return super.doProcess(doc);
    }

    @Override
    protected void doNotify(Record notification) {
      if (Notifications.containsLifecycleEvent(notification, Notifications.LifecycleEvent.START_SESSION)) {
        recordCounter = 0; // reset
      }
      super.doNotify(notification);
    }

  }
}
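
Note: a worked example of the key scheme described in the javadoc, mirroring the baseId + "#" + num concatenation in doProcess() above; the file path is hypothetical:

    String baseId = "/data/cars.csv"; // assumed baseIdField value (the file's path)
    for (long num = 0; num < 3; num++) {
      System.out.println(baseId + "#" + num); // /data/cars.csv#0, /data/cars.csv#1, /data/cars.csv#2
    }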

@@ -1,153 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;

import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Metrics;
import org.kitesdk.morphline.base.Notifications;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Timer;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

/**
 * A command that loads a record into a SolrServer or MapReduce SolrOutputFormat.
 */
public final class LoadSolrBuilder implements CommandBuilder {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
  private static final AtomicBoolean WARNED_ABOUT_INDEX_TIME_BOOSTS = new AtomicBoolean();

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("loadSolr");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new LoadSolr(this, config, parent, child, context);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class LoadSolr extends AbstractCommand {

    private final DocumentLoader loader;
    private final Timer elapsedTime;

    public LoadSolr(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      this.loader = locator.getLoader();
      Config boostsConfig = getConfigs().getConfig(config, "boosts", ConfigFactory.empty());
      if (!new Configs().getEntrySet(boostsConfig).isEmpty()) {
        String message = "Ignoring field boosts, as index-time boosts are not supported anymore";
        if (WARNED_ABOUT_INDEX_TIME_BOOSTS.compareAndSet(false, true)) {
          log.warn(message);
        } else {
          log.debug(message);
        }
      }
      validateArguments();
      this.elapsedTime = getTimer(Metrics.ELAPSED_TIME);
    }

    @Override
    protected void doNotify(Record notification) {
      for (Object event : Notifications.getLifecycleEvents(notification)) {
        if (event == Notifications.LifecycleEvent.BEGIN_TRANSACTION) {
          try {
            loader.beginTransaction();
          } catch (SolrServerException | IOException e) {
            throw new MorphlineRuntimeException(e);
          }
        } else if (event == Notifications.LifecycleEvent.COMMIT_TRANSACTION) {
          try {
            loader.commitTransaction();
          } catch (SolrServerException | IOException e) {
            throw new MorphlineRuntimeException(e);
          }
        } else if (event == Notifications.LifecycleEvent.ROLLBACK_TRANSACTION) {
          try {
            loader.rollbackTransaction();
          } catch (SolrServerException | IOException e) {
            throw new MorphlineRuntimeException(e);
          }
        } else if (event == Notifications.LifecycleEvent.SHUTDOWN) {
          try {
            loader.shutdown();
          } catch (SolrServerException | IOException e) {
            throw new MorphlineRuntimeException(e);
          }
        }
      }
      super.doNotify(notification);
    }

    @Override
    protected boolean doProcess(Record record) {
      Timer.Context timerContext = elapsedTime.time();
      SolrInputDocument doc = convert(record);
      try {
        loader.load(doc);
      } catch (IOException | SolrServerException e) {
        throw new MorphlineRuntimeException(e);
      } finally {
        timerContext.stop();
      }

      // pass record to next command in chain:
      return super.doProcess(record);
    }

    private SolrInputDocument convert(Record record) {
      Map<String, Collection<Object>> map = record.getFields().asMap();
      SolrInputDocument doc = new SolrInputDocument(new HashMap<>(2 * map.size()));
      for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) {
        String key = entry.getKey();
        doc.setField(key, entry.getValue());
      }
      return doc;
    }

  }
}
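
Note: what convert(Record) above produces for a small multi-valued record; the loop below inlines the same field mapping for illustration, and the field names are made up:

    Record record = new Record(); // org.kitesdk.morphline.api.Record
    record.put("id", "doc1");
    record.put("tag", "a");
    record.put("tag", "b"); // a second value for the same field
    SolrInputDocument doc = new SolrInputDocument();
    for (Map.Entry<String, Collection<Object>> entry : record.getFields().asMap().entrySet()) {
      doc.setField(entry.getKey(), entry.getValue()); // id -> [doc1], tag -> [a, b]
    }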

@@ -1,70 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.lang.invoke.MethodHandles;

import org.apache.http.client.HttpClient;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * ConcurrentUpdateSolrClient that propagates exceptions up to the submitter of
 * requests on blockUntilFinished().
 */
final class SafeConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient {

  private Throwable currentException = null;
  private final Object myLock = new Object();

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public SafeConcurrentUpdateSolrClient(String solrServerUrl, int queueSize, int threadCount) {
    this(solrServerUrl, null, queueSize, threadCount);
  }

  public SafeConcurrentUpdateSolrClient(String solrServerUrl, HttpClient client, int queueSize, int threadCount) {
    super(solrServerUrl, client, queueSize, threadCount, null, false);
  }

  @Override
  public void handleError(Throwable ex) {
    assert ex != null;
    synchronized (myLock) {
      currentException = ex;
    }
    LOGGER.error("handleError", ex);
  }

  @Override
  public void blockUntilFinished() {
    super.blockUntilFinished();
    synchronized (myLock) {
      if (currentException != null) {
        throw new RuntimeException(currentException);
      }
    }
  }

  public void clearException() {
    synchronized (myLock) {
      currentException = null;
    }
  }

}
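
Note: the contract in use: a Throwable recorded by handleError() resurfaces as a RuntimeException the next time blockUntilFinished() is called. The URL and sizing below are placeholders:

    SafeConcurrentUpdateSolrClient client =
        new SafeConcurrentUpdateSolrClient("http://localhost:8983/solr/collection1", 100, 2); // illustrative
    client.add(doc); // queued asynchronously; a failure is captured by handleError()
    client.blockUntilFinished(); // rethrows the captured failure, if any
    client.clearException(); // reset before reusing the client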

@@ -1,101 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.solr.schema.IndexSchema;

import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import com.typesafe.config.Config;

/**
 * Command that sanitizes record fields that are unknown to Solr schema.xml by either deleting them
 * (renameToPrefix is absent or a zero length string), or by moving them to a field prefixed with
 * the given renameToPrefix (e.g. renameToPrefix = "ignored_" to use typical dynamic Solr fields).
 * <p>
 * Recall that Solr throws an exception on any attempt to load a document that contains a field that
 * isn't specified in schema.xml.
 */
public final class SanitizeUnknownSolrFieldsBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("sanitizeUnknownSolrFields");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new SanitizeUnknownSolrFields(this, config, parent, child, context);
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class SanitizeUnknownSolrFields extends AbstractCommand {

    private final IndexSchema schema;
    private final String renameToPrefix;

    public SanitizeUnknownSolrFields(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);

      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      this.schema = Objects.requireNonNull(locator.getIndexSchema());
      if (LOG.isTraceEnabled()) {
        LOG.trace("Solr schema: \n" +
            schema.getFields().entrySet().stream().sorted(Map.Entry.comparingByKey())
                .map(Map.Entry::getValue).map(Object::toString).collect(Collectors.joining("\n"))
        );
      }

      String str = getConfigs().getString(config, "renameToPrefix", "").trim();
      this.renameToPrefix = str.length() > 0 ? str : null;
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record record) {
      Collection<Map.Entry<String, Collection<Object>>> entries =
          new ArrayList<>(record.getFields().asMap().entrySet());
      for (Map.Entry<String, Collection<Object>> entry : entries) {
        String key = entry.getKey();
        if (schema.getFieldOrNull(key) == null) {
          LOG.debug("Sanitizing unknown Solr field: {}", key);
          Collection<Object> values = entry.getValue();
          if (renameToPrefix != null) {
            record.getFields().putAll(renameToPrefix + key, values);
          }
          values.clear(); // implicitly removes key from record
        }
      }
      return super.doProcess(record);
    }

  }
}
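
Note: the effect of the command above, assuming renameToPrefix = "ignored_" and a schema that does not define the first field (values are illustrative; compare the ignored_* expectations in SolrCellMorphlineTest earlier in this diff):

    Record record = new Record();
    record.put("exif_isospeedratings", "400"); // not in schema.xml (assumed)
    record.put("id", "doc1"); // assumed to be a schema field
    // After doProcess() with renameToPrefix = "ignored_" the record holds:
    //   id -> [doc1], ignored_exif_isospeedratings -> [400]
    // With renameToPrefix absent or empty, the unknown field is dropped instead.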

@@ -1,124 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;

/**
 * A vehicle to load a list of Solr documents into a local or remote {@link org.apache.solr.client.solrj.SolrClient}.
 */
public class SolrClientDocumentLoader implements DocumentLoader {

  private final SolrClient client; // proxy to local or remote solr server
  private long numLoadedDocs = 0; // number of documents loaded in the current transaction
  private final int batchSize;
  private final List<SolrInputDocument> batch = new ArrayList<>();

  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public SolrClientDocumentLoader(SolrClient client, int batchSize) {
    if (client == null) {
      throw new IllegalArgumentException("solr client must not be null");
    }
    this.client = client;
    if (batchSize <= 0) {
      throw new IllegalArgumentException("batchSize must be a positive number: " + batchSize);
    }
    this.batchSize = batchSize;
  }

  @Override
  public void beginTransaction() {
    LOGGER.trace("beginTransaction");
    batch.clear();
    numLoadedDocs = 0;
    if (client instanceof SafeConcurrentUpdateSolrClient) {
      ((SafeConcurrentUpdateSolrClient) client).clearException();
    }
  }

  @Override
  public void load(SolrInputDocument doc) throws IOException, SolrServerException {
    LOGGER.trace("load doc: {}", doc);
    batch.add(doc);
    if (batch.size() >= batchSize) {
      loadBatch();
    }
  }

  @Override
  public void commitTransaction() throws SolrServerException, IOException {
    LOGGER.trace("commitTransaction");
    if (batch.size() > 0) {
      loadBatch();
    }
    if (numLoadedDocs > 0) {
      if (client instanceof ConcurrentUpdateSolrClient) {
        ((ConcurrentUpdateSolrClient) client).blockUntilFinished();
      }
    }
  }

  private void loadBatch() throws SolrServerException, IOException {
    numLoadedDocs += batch.size();
    try {
      client.add(batch);
    } finally {
      batch.clear();
    }
  }

  @Override
  public UpdateResponse rollbackTransaction() throws SolrServerException, IOException {
    LOGGER.trace("rollback");
    if (!(client instanceof CloudSolrClient)) {
      return client.rollback();
    } else {
      return new UpdateResponse();
    }
  }

  @Override
  public void shutdown() throws IOException {
    LOGGER.trace("shutdown");
    client.close();
  }

  @Override
  public SolrPingResponse ping() throws SolrServerException, IOException {
    LOGGER.trace("ping");
    return client.ping();
  }

  public SolrClient getSolrClient() {
    return client;
  }

}
|
|
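A minimal usage sketch for this loader, assuming a SolrJ HttpSolrClient on the classpath; the URL, batch size, and field values are hypothetical placeholders:

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;

public class SolrClientDocumentLoaderExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical endpoint; any SolrClient implementation works here.
    SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
    SolrClientDocumentLoader loader = new SolrClientDocumentLoader(client, 100);
    loader.beginTransaction();
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("id", "doc-1");
    loader.load(doc); // buffered; flushed every 100 docs or at commitTransaction()
    loader.commitTransaction(); // flushes the remaining batch; does not issue a Solr commit
    loader.shutdown(); // closes the underlying client
  }
}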
solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java
@@ -1,254 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.Objects;

import com.google.common.io.Files;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;
import com.typesafe.config.ConfigUtil;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient.Builder;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.SystemIdResolver;
import org.apache.zookeeper.KeeperException;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.base.Configs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Set of configuration parameters that identify the location and schema of a Solr server or
 * SolrCloud. Based on this information this class can return the schema and a corresponding
 * {@link DocumentLoader}.
 */
public class SolrLocator {

  private Config config;
  private MorphlineContext context;
  private String collectionName;
  private String zkHost;
  private String solrUrl;
  private String solrHomeDir;
  private int batchSize = 1000;

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  protected SolrLocator(MorphlineContext context) {
    this.context = Objects.requireNonNull(context);
  }

  public SolrLocator(Config config, MorphlineContext context) {
    this(context);
    this.config = config;
    Configs configs = new Configs();
    collectionName = configs.getString(config, "collection", null);
    zkHost = configs.getString(config, "zkHost", null);
    solrHomeDir = configs.getString(config, "solrHomeDir", null);
    solrUrl = configs.getString(config, "solrUrl", null);
    batchSize = configs.getInt(config, "batchSize", batchSize);
    LOG.trace("Constructed solrLocator: {}", this);
    configs.validateArguments(config);
  }

  public DocumentLoader getLoader() {
    if (context instanceof SolrMorphlineContext) {
      DocumentLoader loader = ((SolrMorphlineContext) context).getDocumentLoader();
      if (loader != null) {
        return loader;
      }
    }

    if (zkHost != null && zkHost.length() > 0) {
      if (collectionName == null || collectionName.length() == 0) {
        throw new MorphlineCompilationException("Parameter 'zkHost' requires that you also pass parameter 'collection'", config);
      }
      CloudSolrClient cloudSolrClient = new Builder()
          .withZkHost(zkHost)
          .build();
      cloudSolrClient.setDefaultCollection(collectionName);
      cloudSolrClient.connect();
      return new SolrClientDocumentLoader(cloudSolrClient, batchSize);
    } else {
      if (solrUrl == null || solrUrl.length() == 0) {
        throw new MorphlineCompilationException("Missing parameter 'solrUrl'", config);
      }
      int solrServerNumThreads = 2;
      int solrServerQueueLength = solrServerNumThreads;
      SolrClient server = new SafeConcurrentUpdateSolrClient(solrUrl, solrServerQueueLength, solrServerNumThreads);
      // SolrServer server = new HttpSolrServer(solrServerUrl);
      // SolrServer server = new ConcurrentUpdateSolrServer(solrServerUrl, solrServerQueueLength, solrServerNumThreads);
      // server.setParser(new XMLResponseParser()); // binary parser is used by default
      return new SolrClientDocumentLoader(server, batchSize);
    }
  }

  public IndexSchema getIndexSchema() {
    if (context instanceof SolrMorphlineContext) {
      IndexSchema schema = ((SolrMorphlineContext) context).getIndexSchema();
      if (schema != null) {
        validateSchema(schema);
        return schema;
      }
    }

    File downloadedSolrHomeDir = null;
    try {
      // If solrHomeDir isn't defined and zkHost and collectionName are defined
      // then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir
      String mySolrHomeDir = solrHomeDir;
      if (solrHomeDir == null || solrHomeDir.length() == 0) {
        if (zkHost == null || zkHost.length() == 0) {
          // TODO: implement download from solrUrl if specified
          throw new MorphlineCompilationException(
              "Downloading a Solr schema requires either parameter 'solrHomeDir' or parameters 'zkHost' and 'collection'",
              config);
        }
        if (collectionName == null || collectionName.length() == 0) {
          throw new MorphlineCompilationException(
              "Parameter 'zkHost' requires that you also pass parameter 'collection'", config);
        }
        ZooKeeperDownloader zki = new ZooKeeperDownloader();
        SolrZkClient zkClient = zki.getZkClient(zkHost);
        try {
          String configName = zki.readConfigName(zkClient, collectionName);
          downloadedSolrHomeDir = Files.createTempDir();
          downloadedSolrHomeDir = zki.downloadConfigDir(zkClient, configName, downloadedSolrHomeDir);
          mySolrHomeDir = downloadedSolrHomeDir.getAbsolutePath();
        } catch (KeeperException | InterruptedException | IOException e) {
          throw new MorphlineCompilationException("Cannot download schema.xml from ZooKeeper", config, e);
        } finally {
          zkClient.close();
        }
      }

      LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir);
      try {
        SolrResourceLoader loader = new SolrResourceLoader(Paths.get(mySolrHomeDir));
        SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null);
        InputSource is = new InputSource(loader.openSchema("schema.xml"));
        is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));

        IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is);
        validateSchema(schema);
        return schema;
      } catch (ParserConfigurationException | IOException | SAXException e) {
        throw new MorphlineRuntimeException(e);
      }
    } finally {
      if (downloadedSolrHomeDir != null) {
        try {
          FileUtils.deleteDirectory(downloadedSolrHomeDir);
        } catch (IOException e) {
          LOG.warn("Cannot delete tmp directory", e);
        }
      }
    }
  }

  private void validateSchema(IndexSchema schema) {
    if (schema.getUniqueKeyField() == null) {
      throw new MorphlineCompilationException("Solr schema.xml is missing unique key field", config);
    }
    if (!schema.getUniqueKeyField().isRequired()) {
      throw new MorphlineCompilationException("Solr schema.xml must contain a required unique key field", config);
    }
  }

  @Override
  public String toString() {
    return toConfig(null).root().render(ConfigRenderOptions.concise());
  }

  public Config toConfig(String key) {
    String json = "";
    if (key != null) {
      json = toJson(key) + " : ";
    }
    json +=
        "{" +
        " collection : " + toJson(collectionName) + ", " +
        " zkHost : " + toJson(zkHost) + ", " +
        " solrUrl : " + toJson(solrUrl) + ", " +
        " solrHomeDir : " + toJson(solrHomeDir) + ", " +
        " batchSize : " + toJson(batchSize) + " " +
        "}";
    return ConfigFactory.parseString(json);
  }

  private String toJson(Object key) {
    String str = key == null ? "" : key.toString();
    str = ConfigUtil.quoteString(str);
    return str;
  }

  public String getCollectionName() {
    return this.collectionName;
  }

  public void setCollectionName(String collectionName) {
    this.collectionName = collectionName;
  }

  public String getZkHost() {
    return this.zkHost;
  }

  public void setZkHost(String zkHost) {
    this.zkHost = zkHost;
  }

  public String getSolrHomeDir() {
    return this.solrHomeDir;
  }

  public void setSolrHomeDir(String solrHomeDir) {
    this.solrHomeDir = solrHomeDir;
  }

  public String getServerUrl() {
    return this.solrUrl;
  }

  public void setServerUrl(String solrUrl) {
    this.solrUrl = solrUrl;
  }

  public int getBatchSize() {
    return this.batchSize;
  }

  public void setBatchSize(int batchSize) {
    this.batchSize = batchSize;
  }

}
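A minimal sketch of driving this class programmatically, assuming a reachable SolrCloud; the zkHost, collection, and batchSize values are hypothetical placeholders:

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.kitesdk.morphline.api.MorphlineContext;

public class SolrLocatorExample {
  public static void main(String[] args) {
    // Mirrors the HOCON shape this class parses from a morphline's solrLocator block.
    Config config = ConfigFactory.parseString(
        "{ collection : collection1, zkHost : \"127.0.0.1:9983\", batchSize : 500 }");
    MorphlineContext context = new MorphlineContext.Builder().build();
    SolrLocator locator = new SolrLocator(config, context);
    DocumentLoader loader = locator.getLoader(); // CloudSolrClient-backed, batches of 500
  }
}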
solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrMorphlineContext.java
@@ -1,80 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import org.apache.solr.schema.IndexSchema;

import org.kitesdk.morphline.api.MorphlineContext;

/**
 * A context that is specific to Solr.
 */
public class SolrMorphlineContext extends MorphlineContext {

  private DocumentLoader loader;
  private IndexSchema schema;

  /** For public access use {@link Builder#build()} instead */
  protected SolrMorphlineContext() {}

  public DocumentLoader getDocumentLoader() {
    return loader;
  }

  public IndexSchema getIndexSchema() {
    return schema;
  }


  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Helper to construct a {@link SolrMorphlineContext} instance.
   */
  public static class Builder extends MorphlineContext.Builder {

    private DocumentLoader loader;
    private IndexSchema schema;

    public Builder() {}

    public Builder setDocumentLoader(DocumentLoader loader) {
      this.loader = loader;
      return this;
    }

    public Builder setIndexSchema(IndexSchema schema) {
      this.schema = schema;
      return this;
    }

    @Override
    public SolrMorphlineContext build() {
      ((SolrMorphlineContext) context).loader = loader;
      ((SolrMorphlineContext) context).schema = schema;
      return (SolrMorphlineContext) super.build();
    }

    @Override
    protected SolrMorphlineContext create() {
      return new SolrMorphlineContext();
    }

  }

}
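A minimal sketch of the builder in use, e.g. to inject a stub loader and a pre-parsed schema for tests; the newTestContext helper and its parameters are hypothetical:

import org.apache.solr.schema.IndexSchema;

public class SolrMorphlineContextExample {
  public static SolrMorphlineContext newTestContext(DocumentLoader loader, IndexSchema schema) {
    // SolrLocator consults the context first, so commands compiled with this
    // context use the injected loader/schema instead of contacting a live Solr.
    return new SolrMorphlineContext.Builder()
        .setDocumentLoader(loader)
        .setIndexSchema(schema)
        .build();
  }
}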
Some files were not shown because too many files have changed in this diff.