SOLR-9221: Remove Solr contribs: map-reduce, morphlines-core and morphlines-cell

Steve Rowe 2017-03-24 12:31:16 -04:00
parent 6665aed952
commit 53e5f34f66
480 changed files with 15 additions and 59625 deletions

.gitignore
@@ -45,7 +45,6 @@ parent.iml
/solr/example/example-DIH/solr/mail/lib/*.jar
solr/contrib/dataimporthandler/test-lib/
solr/contrib/morphlines-core/test-lib/
solr/core/test-lib/

@@ -46,9 +46,6 @@
<buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/extraction/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/langid/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/map-reduce/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/uima/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/velocity/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/solrj/build.xml" />

@@ -1,10 +0,0 @@
<component name="libraryTable">
<library name="Solr morphlines cell library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" recursive="false" />
</library>
</component>

@@ -1,10 +0,0 @@
<component name="libraryTable">
<library name="Solr morphlines core library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" recursive="false" />
</library>
</component>

@@ -1,10 +0,0 @@
<component name="libraryTable">
<library name="Solr morphlines core test library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" recursive="false" />
</library>
</component>

@@ -56,9 +56,6 @@
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/extraction/extraction.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/langid/langid.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/ltr/ltr.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/map-reduce/map-reduce.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/morphlines-cell/morphlines-cell.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/morphlines-core/morphlines-core.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/uima/uima.iml" />
<module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/velocity/velocity.iml" />
</modules>

@@ -316,30 +316,6 @@
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr map-reduce contrib" type="JUnit" factoryName="JUnit">
<module name="map-reduce" />
<option name="TEST_OBJECT" value="pattern" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/map-reduce" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr morphlines-cell contrib" type="JUnit" factoryName="JUnit">
<module name="morphlines-cell" />
<option name="TEST_OBJECT" value="pattern" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-cell" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr morphlines-core contrib" type="JUnit" factoryName="JUnit">
<module name="morphlines-core" />
<option name="TEST_OBJECT" value="pattern" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-core" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<configuration default="false" name="Solr uima contrib" type="JUnit" factoryName="JUnit">
<module name="uima" />
<option name="TEST_OBJECT" value="pattern" />
@@ -357,7 +333,7 @@
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
<list size="44">
<list size="41">
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
@@ -395,13 +371,10 @@
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr map-reduce contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr morphlines-cell contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr morphlines-core contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="41" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="42" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="43" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
<item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
</list>
</component>
</project>

@@ -1,43 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" name="Solr morphlines cell library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="extraction" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="morphlines-core" />
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module-library">
<library>
<CLASSES>
<root url="file://$MODULE_DIR$/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
</library>
</orderEntry>
</component>
</module>

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" name="Solr morphlines cell library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" scope="TEST" module-name="lucene-core" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="extraction" />
<orderEntry type="module" module-name="morphlines-core" />
</component>
</module>

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr example library" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="analysis-common" />
</component>
</module>

@@ -1,90 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-map-reduce</artifactId>
<packaging>jar</packaging>
<name>Apache Solr map-reduce index construction</name>
<description>Apache Solr - map-reduce index construction</description>
<properties>
<module-directory>solr/contrib/map-reduce</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:git:${vc-anonymous-base-url}</connection>
<developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
<url>${vc-browse-base-url};f=${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
@solr-map-reduce.internal.dependencies@
@solr-map-reduce.external.dependencies@
@solr-map-reduce.internal.test.dependencies@
@solr-map-reduce.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<!-- TODO: This is a hack, because the shared test-files folder seems not to be
included by the dependency, maybe because the dependency test-jar is not unpacked? -->
<directory>${module-path}/../morphlines-core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
</build>
</project>

@@ -1,90 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-cell</artifactId>
<packaging>jar</packaging>
<name>Apache Solr Cell Morphlines</name>
<description>Apache Solr - Cell Morphlines</description>
<properties>
<module-directory>solr/contrib/morphlines-cell</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:git:${vc-anonymous-base-url}</connection>
<developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
<url>${vc-browse-base-url};f=${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
@solr-morphlines-cell.internal.dependencies@
@solr-morphlines-cell.external.dependencies@
@solr-morphlines-cell.internal.test.dependencies@
@solr-morphlines-cell.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<!-- TODO: This is a hack, because the shared test-files folder seems not to be
included by the dependency, maybe because the dependency test-jar is not unpacked? -->
<directory>${module-path}/../morphlines-core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
</build>
</project>

@@ -1,91 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-core</artifactId>
<packaging>jar</packaging>
<name>Apache Solr Morphlines Core</name>
<description>Apache Solr - Morphlines Core</description>
<properties>
<module-directory>solr/contrib/morphlines-core</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:git:${vc-anonymous-base-url}</connection>
<developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
<url>${vc-browse-base-url};f=${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
@solr-morphlines-core.internal.dependencies@
@solr-morphlines-core.external.dependencies@
@solr-morphlines-core.internal.test.dependencies@
@solr-morphlines-core.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

@@ -39,9 +39,6 @@
<module>extraction</module>
<module>langid</module>
<module>ltr</module>
<module>morphlines-cell</module>
<module>morphlines-core</module>
<module>map-reduce</module>
<module>uima</module>
<module>velocity</module>
</modules>

@@ -10,6 +10,5 @@
# trigger a conflict) when the ant check-lib-versions target is run.
/com.google.guava/guava = 16.0.1
/com.google.inject/guice=4.0-beta5
/javax.servlet/servlet-api = 2.5, 3.0-alpha-1
/org.ow2.asm/asm = 5.0_BETA

@@ -3,7 +3,6 @@
# when the lexical sort check is performed by the ant check-lib-versions target.
/antlr/antlr = 2.7.7
/aopalliance/aopalliance = 1.0
/com.adobe.xmp/xmpcore = 5.1.2
com.carrotsearch.randomizedtesting.version = 2.5.0
@@ -26,10 +25,6 @@ com.fasterxml.jackson.core.version = 2.5.4
/com.google.guava/guava = 14.0.1
com.google.inject.guice.version = 3.0
/com.google.inject.extensions/guice-servlet = ${com.google.inject.guice.version}
/com.google.inject/guice = ${com.google.inject.guice.version}
/com.google.protobuf/protobuf-java = 3.1.0
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.1.18
@@ -37,24 +32,14 @@ com.google.inject.guice.version = 3.0
/com.ibm.icu/icu4j = 56.1
/com.pff/java-libpst = 0.8.1
com.rometools.version = 1.6.1
/com.rometools/rome = ${com.rometools.version}
com.sun.jersey.version = 1.9
/com.sun.jersey.contribs/jersey-guice = ${com.sun.jersey.version}
/com.sun.jersey/jersey-bundle = ${com.sun.jersey.version}
/com.sun.jersey/jersey-core = ${com.sun.jersey.version}
/com.sun.jersey/jersey-json = ${com.sun.jersey.version}
/com.sun.jersey/jersey-server = ${com.sun.jersey.version}
/com.sun.mail/gimap = 1.5.1
/com.sun.mail/javax.mail = 1.5.1
/com.sun.xml.bind/jaxb-impl = 2.2.3-1
/com.tdunning/t-digest = 3.1
/com.thoughtworks.paranamer/paranamer = 2.3
/com.typesafe/config = 1.0.2
/commons-beanutils/commons-beanutils = 1.8.3
/commons-cli/commons-cli = 1.2
/commons-codec/commons-codec = 1.10
@@ -74,7 +59,6 @@ io.dropwizard.metrics.version = 3.1.2
/io.dropwizard.metrics/metrics-core = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-ganglia = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-graphite = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-healthchecks = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-jetty9 = ${io.dropwizard.metrics.version}
/io.dropwizard.metrics/metrics-jvm = ${io.dropwizard.metrics.version}
@@ -82,7 +66,6 @@ io.netty.netty-all.version = 4.0.36.Final
/io.netty/netty-all = ${io.netty.netty-all.version}
/javax.activation/activation = 1.1.1
/javax.inject/javax.inject= 1
/javax.servlet/javax.servlet-api = 3.1.0
/javax.servlet/servlet-api = 2.4
/jdom/jdom = 1.0
@@ -95,14 +78,11 @@ io.netty.netty-all.version = 4.0.36.Final
/net.bytebuddy/byte-buddy = 1.6.2
/net.hydromatic/eigenbase-properties = 1.1.5
/net.sf.ehcache/ehcache-core = 2.4.4
/net.sf.saxon/Saxon-HE = 9.6.0-2
/net.sourceforge.argparse4j/argparse4j = 0.4.3
/net.sourceforge.jmatio/jmatio = 1.0
/net.sourceforge.nekohtml/nekohtml = 1.9.17
/org.antlr/antlr4-runtime = 4.5.1-1
/org.apache.ant/ant = 1.8.2
/org.apache.avro/avro = 1.7.5
org.apache.calcite.avatica.version = 1.9.0
/org.apache.calcite.avatica/avatica-core = ${org.apache.calcite.avatica.version}
@@ -160,23 +140,7 @@ org.apache.hadoop.version = 2.7.2
/org.apache.hadoop/hadoop-auth = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-hdfs = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-app = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-core = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-hs = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-jobclient = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-mapreduce-client-shuffle = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-minikdc = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-api = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-client = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-applicationhistoryservice = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-common = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-nodemanager = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-resourcemanager = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-tests = ${org.apache.hadoop.version}
/org.apache.hadoop/hadoop-yarn-server-web-proxy = ${org.apache.hadoop.version}
/org.apache.htrace/htrace-core = 3.2.0-incubating
@@ -193,7 +157,6 @@ org.apache.james.apache.mime4j.version = 0.7.2
/org.apache.james/apache-mime4j-dom = ${org.apache.james.apache.mime4j.version}
/org.apache.mina/mina-core = 2.0.0-M5
/org.apache.mrunit/mrunit = 1.0.0
org.apache.pdfbox.version = 2.0.1
/org.apache.pdfbox/fontbox = ${org.apache.pdfbox.version}
@@ -228,7 +191,6 @@ org.apache.uima.version = 2.3.1
org.bouncycastle.version = 1.45
/org.bouncycastle/bcmail-jdk15 = ${org.bouncycastle.version}
/org.bouncycastle/bcpkix-jdk15on = 1.47
/org.bouncycastle/bcprov-jdk15 = ${org.bouncycastle.version}
/org.carrot2.attributes/attributes-binder = 1.3.1
@@ -245,7 +207,6 @@ org.carrot2.morfologik.version = 2.1.1
org.codehaus.jackson.version = 1.9.13
/org.codehaus.jackson/jackson-core-asl = ${org.codehaus.jackson.version}
/org.codehaus.jackson/jackson-jaxrs = ${org.codehaus.jackson.version}
/org.codehaus.jackson/jackson-mapper-asl = ${org.codehaus.jackson.version}
org.codehaus.janino.version = 2.7.6
@@ -271,29 +232,10 @@ org.eclipse.jetty.version = 9.3.14.v20161028
/org.eclipse.jetty/jetty-webapp = ${org.eclipse.jetty.version}
/org.eclipse.jetty/jetty-xml = ${org.eclipse.jetty.version}
/org.fusesource.leveldbjni/leveldbjni = 1.8
org.gagravarr.vorbis.java.version = 0.8
/org.gagravarr/vorbis-java-core = ${org.gagravarr.vorbis.java.version}
/org.gagravarr/vorbis-java-tika = ${org.gagravarr.vorbis.java.version}
org.iq80.leveldb.version = 0.7
/org.iq80.leveldb/leveldb = ${org.iq80.leveldb.version}
/org.iq80.leveldb/leveldb-api = ${org.iq80.leveldb.version}
org.jboss.netty.netty.version = 3.2.4.Final
/org.jboss.netty/netty = ${org.jboss.netty.netty.version}
org.kitesdk.kite-morphlines.version = 1.1.0
/org.kitesdk/kite-morphlines-avro = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-core = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-hadoop-sequencefile = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-json = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-saxon = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-tika-core = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-tika-decompress = ${org.kitesdk.kite-morphlines.version}
/org.kitesdk/kite-morphlines-twitter = ${org.kitesdk.kite-morphlines.version}
/org.locationtech.spatial4j/spatial4j = 0.6
/org.mockito/mockito-core = 2.6.2
@@ -322,7 +264,6 @@ org.slf4j.version = 1.7.7
/org.slf4j/slf4j-log4j12 = ${org.slf4j.version}
/org.tukaani/xz = 1.5
/org.xerial.snappy/snappy-java = 1.0.5
/rome/rome = 1.0
/xerces/xercesImpl = 2.9.1

@@ -104,6 +104,11 @@ Apache UIMA 2.3.1
Apache ZooKeeper 3.4.6
Jetty 9.3.14.v20161028
Upgrade Notes
----------------------
* Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed.
Detailed Change List
----------------------
@@ -128,6 +133,11 @@ Bug Fixes
* SOLR-10281: ADMIN_PATHS is duplicated in two places and inconsistent. This can cause automatic
retries to /admin/metrics handler by the CloudSolrClient. (shalin)
Other Changes
----------------------
* SOLR-9221: Remove Solr contribs: map-reduce, morphlines-core and morphlines-cell. (Steve Rowe)
================== 6.5.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

@@ -1,20 +0,0 @@
Apache Solr MapReduce
*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.
The Solr MapReduce contrib provides a MapReduce job that allows you to build
Solr indexes and optionally merge them into a live Solr cluster.
Example:
# Build an index with map-reduce and deploy it to SolrCloud
source $solr_distrib/example/scripts/map-reduce/set-map-reduce-classpath.sh
$hadoop_distrib/bin/hadoop --config $hadoop_conf_dir jar \
$solr_distrib/dist/solr-map-reduce-*.jar -D 'mapred.child.java.opts=-Xmx500m' \
-libjars "$HADOOP_LIBJAR" --morphline-file readAvroContainer.conf \
--zk-host 127.0.0.1:9983 --output-dir hdfs://127.0.0.1:8020/outdir \
--collection $collection --log4j log4j.properties --go-live \
--verbose "hdfs://127.0.0.1:8020/indir"

@@ -1,157 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="solr-map-reduce" default="default">
<description>
Solr map-reduce index construction.
</description>
<!-- <property name="name" value="MapReduceIndexerTool" /> -->
<import file="../contrib-build.xml"/>
<solr-contrib-uptodate name="extraction"
property="solr-extraction.uptodate"
classpath.property="solr-cell.jar"/>
<target name="compile-solr-extraction" unless="solr-extraction.uptodate">
<ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<solr-contrib-uptodate name="morphlines-core"
property="solr-morphlines-core.uptodate"/>
<target name="compile-morphlines-core" unless="solr-morphlines-core.uptodate">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<solr-contrib-uptodate name="morphlines-cell"
property="solr-morphlines-cell.uptodate"/>
<target name="compile-morphlines-cell" unless="solr-morphlines-cell.uptodate">
<ant dir="${common-solr.dir}/contrib/morphlines-cell" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-extraction-libs">
<ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-morphlines-core-libs">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-morphlines-cell-libs">
<ant dir="${common-solr.dir}/contrib/morphlines-cell" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<path id="classpath.additions">
<pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-core/lib" excludes="${common.classpath.excludes}"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-cell/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-cell/lib" excludes="${common.classpath.excludes}"/>
</path>
<path id="classpath">
<path refid="solr.base.classpath"/>
<path refid="classpath.additions"/>
</path>
<path id="test.classpath">
<path refid="solr.test.base.classpath"/>
<path refid="classpath.additions"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
<pathelement location="${common-solr.dir}/contrib/morphlines-core/src/test-files"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
</path>
<path id="javadoc.classpath">
<path refid="junit-path"/>
<path refid="classpath"/>
<pathelement location="${ant.home}/lib/ant.jar"/>
<fileset dir=".">
<exclude name="build/**/*.jar"/>
<include name="**/lib/*.jar"/>
</fileset>
</path>
<!-- TODO: make this nicer like lucene? -->
<target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,javadocs-morphlines-core,javadocs-morphlines-cell,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
<sequential>
<mkdir dir="${javadoc.dir}/${name}"/>
<solr-invoke-javadoc>
<solrsources>
<packageset dir="${src.dir}"/>
</solrsources>
<links>
<link href="../solr-solrj"/>
<link href="../solr-morphlines-core"/>
<link href="../solr-cell"/>
</links>
</solr-invoke-javadoc>
<solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
</sequential>
</target>
<target name="javadocs-extraction">
<ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="javadocs-morphlines-core">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="javadocs-morphlines-cell">
<ant dir="${common-solr.dir}/contrib/morphlines-cell" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="compile-core" depends="resolve-extraction-libs, resolve-morphlines-core-libs, resolve-morphlines-cell-libs, compile-solr-extraction, compile-morphlines-core, compile-morphlines-cell, solr-contrib-build.compile-core"/>
<property name="main.class" value="org.apache.solr.hadoop.MapReduceIndexerTool" />
<target name="jar-core" depends="compile-core">
<solr-jarify>
<solr-jarify-additional-manifest-attributes>
<attribute name="Main-Class" value="${main.class}"/>
</solr-jarify-additional-manifest-attributes>
</solr-jarify>
</target>
<target name="dist" depends="common-solr.dist"/>
</project>

@@ -1,37 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<ivy-module version="2.0">
<info organisation="org.apache.solr" module="map-reduce" />
<configurations defaultconfmapping="compile->master;test->master">
<conf name="compile" transitive="false" />
<conf name="test" transitive="false" />
</configurations>
<dependencies>
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="${/org.apache.hadoop/hadoop-mapreduce-client-core}" conf="compile" />
<dependency org="net.sourceforge.argparse4j" name="argparse4j" rev="${/net.sourceforge.argparse4j/argparse4j}" conf="compile" />
<dependency org="org.kitesdk" name="kite-morphlines-saxon" rev="${/org.kitesdk/kite-morphlines-saxon}" conf="compile" />
<dependency org="net.sf.saxon" name="Saxon-HE" rev="${/net.sf.saxon/Saxon-HE}" conf="compile" />
<dependency org="org.kitesdk" name="kite-morphlines-hadoop-sequencefile" rev="${/org.kitesdk/kite-morphlines-hadoop-sequencefile}" conf="compile" />
<dependency org="org.jboss.netty" name="netty" rev="${/org.jboss.netty/netty}" conf="test" />
<dependency org="org.bouncycastle" name="bcpkix-jdk15on" rev="${/org.bouncycastle/bcpkix-jdk15on}" conf="test"/>
<dependency org="com.rometools" name="rome" rev="${/com.rometools/rome}" conf="test"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
</dependencies>
</ivy-module>

@@ -1,39 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
license agreements. See the NOTICE file distributed with this work for additional
information regarding copyright ownership. The ASF licenses this file to
You under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
by applicable law or agreed to in writing, software distributed under the
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License. -->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id>job</id>
<formats>
<format>jar</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<unpack>false</unpack>
<scope>runtime</scope>
<outputDirectory>lib</outputDirectory>
<excludes>
<exclude>${groupId}:${artifactId}</exclude>
</excludes>
</dependencySet>
<dependencySet>
<unpack>true</unpack>
<includes>
<include>${groupId}:${artifactId}</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>

@@ -1,75 +0,0 @@
//The MIT License
//
// Copyright (c) 2003 Ron Alford, Mike Grove, Bijan Parsia, Evren Sirin
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package org.apache.solr.hadoop;
import java.util.Comparator;
/**
* This is a comparator to perform a mix of alphabetical+numeric comparison. For
* example, if there is a list {"test10", "test2", "test150", "test25", "test1"}
* then what we generally expect from the ordering is the result {"test1",
* "test2", "test10", "test25", "test150"}. However, standard lexicographic
* ordering does not do that and "test10" comes before "test2". This class is
* provided to overcome that problem. This functionality is useful to sort the
* benchmark files (like the ones in DL-benchmark-suite) from smallest to the
* largest. Comparisons are done on the String values returned by toString() so
* care should be taken when this comparator is used to sort arbitrary Java
* objects.
*
*/
final class AlphaNumericComparator implements Comparator {
public AlphaNumericComparator() {
}
public int compare(Object o1, Object o2) {
String s1 = o1.toString();
String s2 = o2.toString();
int n1 = s1.length(), n2 = s2.length();
int i1 = 0, i2 = 0;
while (i1 < n1 && i2 < n2) {
int p1 = i1;
int p2 = i2;
char c1 = s1.charAt(i1++);
char c2 = s2.charAt(i2++);
if(c1 != c2) {
if (Character.isDigit(c1) && Character.isDigit(c2)) {
int value1 = 0, value2 = 0;
while (i1 < n1 && Character.isDigit(c1 = s1.charAt(i1))) {
i1++;
}
value1 = Integer.parseInt(s1.substring(p1, i1));
while (i2 < n2 && Character.isDigit(c2 = s2.charAt(i2))) {
i2++;
}
value2 = Integer.parseInt(s2.substring(p2, i2));
if (value1 != value2) {
return value1 - value2;
}
}
return c1 - c2;
}
}
return n1 - n2;
}
}
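
For reference, a minimal driver (an editorial sketch, not part of this commit) that exercises the ordering described in the Javadoc above; the demo class name is invented, and it sits in org.apache.solr.hadoop because the removed comparator is package-private:

package org.apache.solr.hadoop;

import java.util.Arrays;
import java.util.List;

public class AlphaNumericComparatorDemo {
  @SuppressWarnings("unchecked") // the removed class implements the raw Comparator interface
  public static void main(String[] args) {
    List<String> names = Arrays.asList("test10", "test2", "test150", "test25", "test1");
    // Digit runs are compared numerically, so this prints:
    // [test1, test2, test10, test25, test150]
    names.sort(new AlphaNumericComparator());
    System.out.println(names);
  }
}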

@@ -1,243 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.ExecutorUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Enables adding batches of documents to an EmbeddedSolrServer.
*/
class BatchWriter {
private final EmbeddedSolrServer solr;
private volatile Exception batchWriteException = null;
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public Exception getBatchWriteException() {
return batchWriteException;
}
public void setBatchWriteException(Exception batchWriteException) {
this.batchWriteException = batchWriteException;
}
/** The number of writing threads. */
final int writerThreads;
/** Queue Size */
final int queueSize;
private final ThreadPoolExecutor batchPool;
private TaskID taskId = null;
/**
* The number of in-progress batches; must be zero before the close can
* actually start closing
*/
AtomicInteger executingBatches = new AtomicInteger(0);
/**
* Create the batch writer object, set the thread to daemon mode, and start
* it.
*
*/
final class Batch implements Runnable {
private List<SolrInputDocument> documents;
private UpdateResponse result;
public Batch(Collection<SolrInputDocument> batch) {
documents = new ArrayList<>(batch);
}
public void run() {
try {
executingBatches.getAndIncrement();
result = runUpdate(documents);
} finally {
executingBatches.getAndDecrement();
}
}
protected List<SolrInputDocument> getDocuments() {
return documents;
}
protected void setDocuments(List<SolrInputDocument> documents) {
this.documents = documents;
}
protected UpdateResponse getResult() {
return result;
}
protected void setResult(UpdateResponse result) {
this.result = result;
}
protected void reset(List<SolrInputDocument> documents) {
if (this.documents == null) {
this.documents = new ArrayList<>(documents);
} else {
this.documents.clear();
this.documents.addAll(documents);
}
result = null;
}
protected void reset(SolrInputDocument document) {
if (this.documents == null) {
this.documents = new ArrayList<>();
} else {
this.documents.clear();
}
this.documents.add(document);
result = null;
}
}
protected UpdateResponse runUpdate(List<SolrInputDocument> batchToWrite) {
try {
UpdateResponse result = solr.add(batchToWrite);
SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.BATCHES_WRITTEN.toString(), 1);
SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString(), batchToWrite.size());
if (LOG.isDebugEnabled()) {
SolrRecordWriter.incrementCounter(taskId, SolrCounters.class.getName(), SolrCounters.BATCH_WRITE_TIME.toString(), result.getElapsedTime());
}
return result;
} catch (Throwable e) {
if (e instanceof Exception) {
setBatchWriteException((Exception) e);
} else {
setBatchWriteException(new Exception(e));
}
SolrRecordWriter.incrementCounter(taskId, getClass().getName() + ".errors", e.getClass().getName(), 1);
LOG.error("Unable to process batch", e);
return null;
}
}
public BatchWriter(EmbeddedSolrServer solr, int batchSize, TaskID tid,
int writerThreads, int queueSize) {
this.solr = solr;
this.writerThreads = writerThreads;
this.queueSize = queueSize;
taskId = tid;
// we need to obtain the settings before the constructor
if (writerThreads != 0) {
batchPool = new ExecutorUtil.MDCAwareThreadPoolExecutor(writerThreads, writerThreads, 5,
TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(queueSize),
new ThreadPoolExecutor.CallerRunsPolicy());
} else { // single threaded case
batchPool = null;
}
}
public void queueBatch(Collection<SolrInputDocument> batch)
throws IOException, SolrServerException {
throwIf();
Batch b = new Batch(batch);
if (batchPool != null) {
batchPool.execute(b);
} else { // single threaded case
b.run();
throwIf();
}
}
public synchronized void close(TaskAttemptContext context)
throws InterruptedException, SolrServerException, IOException {
if (batchPool != null) {
context.setStatus("Waiting for batches to complete");
batchPool.shutdown();
while (!batchPool.isTerminated()) {
LOG.info(String.format(Locale.ENGLISH,
"Waiting for %d items and %d threads to finish executing", batchPool
.getQueue().size(), batchPool.getActiveCount()));
batchPool.awaitTermination(5, TimeUnit.SECONDS);
}
}
context.setStatus("Committing Solr Phase 1");
solr.commit(true, false);
context.setStatus("Optimizing Solr");
int maxSegments = context.getConfiguration().getInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, 1);
LOG.info("Optimizing Solr: forcing merge down to {} segments", maxSegments);
long start = System.nanoTime();
solr.optimize(true, false, maxSegments);
context.getCounter(SolrCounters.class.getName(), SolrCounters.PHYSICAL_REDUCER_MERGE_TIME.toString()).increment(System.nanoTime() - start);
float secs = (System.nanoTime() - start) / 1.0e9f;
LOG.info("Optimizing Solr: done forcing merge down to {} segments in {} secs", maxSegments, secs);
context.setStatus("Committing Solr Phase 2");
solr.commit(true, false);
context.setStatus("Shutting down Solr");
solr.close();
}
/**
* Throw a legal exception if a previous batch write had an exception. The
* previous state is cleared. Uses {@link #batchWriteException} for the state
* from the last exception.
*
* This will lose individual exceptions if the exceptions happen rapidly.
*
* @throws IOException On low level IO error
* @throws SolrServerException On Solr Exception
*/
private void throwIf() throws IOException, SolrServerException {
final Exception last = batchWriteException;
batchWriteException = null;
if (last == null) {
return;
}
if (last instanceof SolrServerException) {
throw (SolrServerException) last;
}
if (last instanceof IOException) {
throw (IOException) last;
}
throw new IOException("Batch Write Failure", last);
}
}
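
A rough usage sketch (editorial, not part of this commit) for the BatchWriter class above. It assumes an EmbeddedSolrServer can be opened from a local Solr home directory named solr-home with a core named core1; the demo class is hypothetical and must live in org.apache.solr.hadoop because BatchWriter is package-private:

package org.apache.solr.hadoop;

import java.nio.file.Paths;
import java.util.Collections;

import org.apache.hadoop.mapreduce.TaskID;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class BatchWriterDemo {
  public static void main(String[] args) throws Exception {
    // Assumed local core; the path and core name are placeholders.
    EmbeddedSolrServer solr = new EmbeddedSolrServer(Paths.get("solr-home"), "core1");

    // Two writer threads and a bounded queue of 100 batches; note that the
    // batchSize argument is not stored by the constructor shown above.
    BatchWriter writer = new BatchWriter(solr, 100, new TaskID(), 2, 100);

    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("id", "1");
    writer.queueBatch(Collections.singletonList(doc));

    // close(TaskAttemptContext) waits for pending batches, commits, force-merges
    // and shuts the embedded server down; it requires a Hadoop task attempt context.
  }
}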

@@ -1,58 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.classification.InterfaceAudience;
/**
* An InputStream that wraps a DataInput.
* @see DataOutputOutputStream
*/
@InterfaceAudience.Private
public class DataInputInputStream extends InputStream {
private DataInput in;
/**
* Construct an InputStream from the given DataInput. If 'in'
* is already an InputStream, simply returns it. Otherwise, wraps
* it in an InputStream.
* @param in the DataInput to wrap
* @return an InputStream instance that reads from 'in'
*/
public static InputStream constructInputStream(DataInput in) {
if (in instanceof InputStream) {
return (InputStream)in;
} else {
return new DataInputInputStream(in);
}
}
public DataInputInputStream(DataInput in) {
this.in = in;
}
@Override
public int read() throws IOException {
return in.readUnsignedByte();
}
}
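
A small illustration (editorial, not part of this commit) of the factory behaviour described above: a DataInputStream comes back unchanged because it already is an InputStream, while a plain DataInput such as RandomAccessFile gets wrapped. The demo class name and file path are placeholders:

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.RandomAccessFile;

import org.apache.solr.hadoop.DataInputInputStream;

public class DataInputInputStreamDemo {
  public static void main(String[] args) throws Exception {
    // DataInputStream implements both DataInput and InputStream,
    // so constructInputStream() hands back the very same object.
    DataInputStream din = new DataInputStream(new ByteArrayInputStream(new byte[] {42}));
    InputStream unwrapped = DataInputInputStream.constructInputStream(din);
    System.out.println(unwrapped == din);  // true
    System.out.println(unwrapped.read());  // 42

    // RandomAccessFile is a DataInput but not an InputStream, so it is wrapped
    // and each read() maps to DataInput.readUnsignedByte().
    try (RandomAccessFile raf = new RandomAccessFile("example.bin", "r")) { // placeholder path
      InputStream wrapped = DataInputInputStream.constructInputStream(raf);
      System.out.println(wrapped instanceof DataInputInputStream);  // true
    }
  }
}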

@@ -1,66 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.classification.InterfaceAudience;
/**
* OutputStream implementation that wraps a DataOutput.
*/
@InterfaceAudience.Private
public class DataOutputOutputStream extends OutputStream {
private final DataOutput out;
/**
* Construct an OutputStream from the given DataOutput. If 'out'
* is already an OutputStream, simply returns it. Otherwise, wraps
* it in an OutputStream.
* @param out the DataOutput to wrap
* @return an OutputStream instance that outputs to 'out'
*/
public static OutputStream constructOutputStream(DataOutput out) {
if (out instanceof OutputStream) {
return (OutputStream)out;
} else {
return new DataOutputOutputStream(out);
}
}
private DataOutputOutputStream(DataOutput out) {
this.out = out;
}
@Override
public void write(int b) throws IOException {
out.writeByte(b);
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
out.write(b, off, len);
}
@Override
public void write(byte[] b) throws IOException {
out.write(b);
}
}

@@ -1,57 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.morphlines.solr.DocumentLoader;
/**
* Prints documents to stdout instead of loading them into Solr for quicker turnaround during early
* trial & debug sessions.
*/
final class DryRunDocumentLoader implements DocumentLoader {
@Override
public void beginTransaction() {
}
@Override
public void load(SolrInputDocument doc) {
System.out.println("dryrun: " + doc);
}
@Override
public void commitTransaction() {
}
@Override
public UpdateResponse rollbackTransaction() {
return new UpdateResponse();
}
@Override
public void shutdown() {
}
@Override
public SolrPingResponse ping() {
return new SolrPingResponse();
}
}

@@ -1,182 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.FileStatus;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.hadoop.MapReduceIndexerTool.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The optional (parallel) GoLive phase merges the output shards of the previous
* phase into a set of live customer-facing Solr servers, typically a SolrCloud.
*/
class GoLive {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// TODO: handle clusters with replicas
public boolean goLive(Options options, FileStatus[] outDirs) {
LOG.info("Live merging of output shards into Solr cluster...");
boolean success = false;
long start = System.nanoTime();
int concurrentMerges = options.goLiveThreads;
ThreadPoolExecutor executor = new ExecutorUtil.MDCAwareThreadPoolExecutor(concurrentMerges,
concurrentMerges, 1, TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>());
try {
CompletionService<Request> completionService = new ExecutorCompletionService<>(executor);
Set<Future<Request>> pending = new HashSet<>();
int cnt = -1;
for (final FileStatus dir : outDirs) {
LOG.debug("processing: " + dir.getPath());
cnt++;
List<String> urls = options.shardUrls.get(cnt);
for (String url : urls) {
String baseUrl = url;
if (baseUrl.endsWith("/")) {
baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
}
int lastPathIndex = baseUrl.lastIndexOf("/");
if (lastPathIndex == -1) {
LOG.error("Found unexpected shardurl, live merge failed: " + baseUrl);
return false;
}
final String name = baseUrl.substring(lastPathIndex + 1);
baseUrl = baseUrl.substring(0, lastPathIndex);
final String mergeUrl = baseUrl;
Callable<Request> task = () -> {
Request req = new Request();
LOG.info("Live merge " + dir.getPath() + " into " + mergeUrl);
try (final HttpSolrClient client = new HttpSolrClient.Builder(mergeUrl).build()) {
CoreAdminRequest.MergeIndexes mergeRequest = new CoreAdminRequest.MergeIndexes();
mergeRequest.setCoreName(name);
mergeRequest.setIndexDirs(Arrays.asList(dir.getPath().toString() + "/data/index"));
mergeRequest.process(client);
req.success = true;
} catch (SolrServerException | IOException e) {
req.e = e;
}
return req;
};
pending.add(completionService.submit(task));
}
}
while (pending != null && pending.size() > 0) {
try {
Future<Request> future = completionService.take();
if (future == null) break;
pending.remove(future);
try {
Request req = future.get();
if (!req.success) {
// failed
LOG.error("A live merge command failed", req.e);
return false;
}
} catch (ExecutionException e) {
LOG.error("Error sending live merge command", e);
return false;
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
LOG.error("Live merge process interrupted", e);
return false;
}
}
cnt = -1;
try {
LOG.info("Committing live merge...");
if (options.zkHost != null) {
try (CloudSolrClient server = new CloudSolrClient.Builder().withZkHost(options.zkHost).build()) {
server.setDefaultCollection(options.collection);
server.commit();
}
} else {
for (List<String> urls : options.shardUrls) {
for (String url : urls) {
// TODO: we should do these concurrently
try (HttpSolrClient server = new HttpSolrClient.Builder(url).build()) {
server.commit();
}
}
}
}
LOG.info("Done committing live merge");
} catch (Exception e) {
LOG.error("Error sending commits to live Solr cluster", e);
return false;
}
success = true;
return true;
} finally {
ExecutorUtil.shutdownAndAwaitTermination(executor);
float secs = (System.nanoTime() - start) / (float) TimeUnit.SECONDS.toNanos(1);
LOG.info("Live merging of index shards into Solr cluster took " + secs + " secs");
if (success) {
LOG.info("Live merging completed successfully");
} else {
LOG.info("Live merging failed");
}
}
// if an output dir does not exist, we should fail and do no merge?
}
private static final class Request {
Exception e;
boolean success = false;
}
}
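A minimal sketch (hypothetical helper, not part of GoLive) of how goLive() splits one entry of options.shardUrls into the base URL used for the CoreAdmin merge request and the target core name; the URL below is made up.
public class ShardUrlExample {
  static String[] splitShardUrl(String url) {
    String baseUrl = url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
    int lastPathIndex = baseUrl.lastIndexOf("/");
    if (lastPathIndex == -1) {
      throw new IllegalArgumentException("Unexpected shard URL: " + baseUrl);
    }
    // { mergeUrl, core name }
    return new String[] { baseUrl.substring(0, lastPathIndex), baseUrl.substring(lastPathIndex + 1) };
  }
  public static void main(String[] args) {
    String[] parts = splitShardUrl("http://solr1:8983/solr/collection1_shard1_replica1/");
    System.out.println(parts[0]); // http://solr1:8983/solr
    System.out.println(parts[1]); // collection1_shard1_replica1
  }
}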

View File

@ -1,41 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
/**
* Solr field names for metadata of an HDFS file.
*/
public interface HdfsFileFieldNames {
public static final String FILE_UPLOAD_URL = "file_upload_url";
public static final String FILE_DOWNLOAD_URL = "file_download_url";
public static final String FILE_SCHEME = "file_scheme";
public static final String FILE_HOST = "file_host";
public static final String FILE_PORT = "file_port";
public static final String FILE_PATH = "file_path";
public static final String FILE_NAME = "file_name";
public static final String FILE_LENGTH = "file_length";
public static final String FILE_LAST_MODIFIED = "file_last_modified";
public static final String FILE_OWNER = "file_owner";
public static final String FILE_GROUP = "file_group";
public static final String FILE_PERMISSIONS_USER = "file_permissions_user";
public static final String FILE_PERMISSIONS_GROUP = "file_permissions_group";
public static final String FILE_PERMISSIONS_OTHER = "file_permissions_other";
public static final String FILE_PERMISSIONS_STICKYBIT = "file_permissions_stickybit";
}
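A minimal sketch (assumed helper, not part of this contrib) of how these constants might map HDFS file metadata onto a SolrInputDocument; the schema is assumed to define fields with these names.
import org.apache.hadoop.fs.FileStatus;
import org.apache.solr.common.SolrInputDocument;
public class HdfsFileFieldNamesExample {
  static SolrInputDocument toDocument(FileStatus stats, String uploadUrl, String downloadUrl) {
    SolrInputDocument doc = new SolrInputDocument();
    doc.setField(HdfsFileFieldNames.FILE_UPLOAD_URL, uploadUrl);
    doc.setField(HdfsFileFieldNames.FILE_DOWNLOAD_URL, downloadUrl);
    doc.setField(HdfsFileFieldNames.FILE_PATH, stats.getPath().toUri().getPath());
    doc.setField(HdfsFileFieldNames.FILE_NAME, stats.getPath().getName());
    doc.setField(HdfsFileFieldNames.FILE_LENGTH, stats.getLen());
    doc.setField(HdfsFileFieldNames.FILE_LAST_MODIFIED, stats.getModificationTime());
    doc.setField(HdfsFileFieldNames.FILE_OWNER, stats.getOwner());
    doc.setField(HdfsFileFieldNames.FILE_GROUP, stats.getGroup());
    return doc;
  }
}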

View File

@ -1,159 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.lang.invoke.MethodHandles;
import java.util.Locale;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class runs a background thread that checks once every 60 seconds whether
* a progress report is needed and issues one if so.
*
* A simple counter {@link #threadsNeedingHeartBeat} tracks the number of
* threads requesting a heart beat.
*
* The expected usage pattern is
*
* <pre>
* try {
* heartBeater.needHeartBeat();
* do something that may take a while
* } finally {
* heartBeater.cancelHeartBeat();
* }
* </pre>
*
*
*/
public class HeartBeater extends Thread {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* Count of threads asking for a heart beat; at 0 no heart beat is issued. This could
* be an atomic long, but then mismatches between need/cancel calls could result in
* negative counts.
*/
private volatile int threadsNeedingHeartBeat = 0;
private Progressable progress;
/**
* The amount of time to wait between checks for the need to issue a heart
* beat. In milliseconds.
*/
private final long waitTimeMs = TimeUnit.MILLISECONDS.convert(60, TimeUnit.SECONDS);
private final CountDownLatch isClosing = new CountDownLatch(1);
/**
* Create the heart beat thread, mark it as a daemon, and start it. While the
* count in {@link #threadsNeedingHeartBeat} is positive, a heart beat is
* issued on the progress object every 60 seconds.
*/
public HeartBeater(Progressable progress) {
setDaemon(true);
this.progress = progress;
LOG.info("Heart beat reporting class is " + progress.getClass().getName());
start();
}
public Progressable getProgress() {
return progress;
}
public void setProgress(Progressable progress) {
this.progress = progress;
}
@Override
public void run() {
LOG.info("HeartBeat thread running");
while (true) {
try {
synchronized (this) {
if (threadsNeedingHeartBeat > 0) {
progress.progress();
if (LOG.isInfoEnabled()) {
LOG.info(String.format(Locale.ENGLISH, "Issuing heart beat for %d threads",
threadsNeedingHeartBeat));
}
} else {
if (LOG.isInfoEnabled()) {
LOG.info(String.format(Locale.ENGLISH, "heartbeat skipped count %d",
threadsNeedingHeartBeat));
}
}
}
if (isClosing.await(waitTimeMs, TimeUnit.MILLISECONDS)) {
return;
}
} catch (Throwable e) {
LOG.error("HeartBeat throwable", e);
}
}
}
/**
* Inform the background thread that heartbeats are to be issued, and issue a
* heart beat right away.
*/
public synchronized void needHeartBeat() {
threadsNeedingHeartBeat++;
// Issue a progress report right away,
// just in case the cancel comes before the background thread issues a
// report.
// If enough cases like this happen the 600 second timeout can occur
progress.progress();
if (threadsNeedingHeartBeat == 1) {
// this.notify(); // wake up the heartbeater
}
}
/**
* Inform the background thread that this heartbeat request is no longer needed.
* This must be called at some point after each {@link #needHeartBeat()}
* request.
*/
public synchronized void cancelHeartBeat() {
if (threadsNeedingHeartBeat > 0) {
threadsNeedingHeartBeat--;
} else {
Exception e = new Exception("Dummy");
e.fillInStackTrace();
LOG.warn("extra call to cancelHeartBeat", e);
}
}
public void setStatus(String status) {
if (progress instanceof TaskInputOutputContext) {
((TaskInputOutputContext<?,?,?,?>) progress).setStatus(status);
}
}
/** Releases any resources */
public void close() {
isClosing.countDown();
}
}
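A concrete version of the usage pattern from the class javadoc; the Progressable here is a stand-in lambda rather than a real MapReduce task context.
import org.apache.hadoop.util.Progressable;
public class HeartBeaterExample {
  public static void main(String[] args) throws Exception {
    Progressable progress = () -> System.out.println("progress reported");
    HeartBeater heartBeater = new HeartBeater(progress);
    try {
      heartBeater.needHeartBeat();
      Thread.sleep(2000); // stand-in for "do something that may take a while"
    } finally {
      heartBeater.cancelHeartBeat();
      heartBeater.close(); // stop the background thread once all work is done
    }
  }
}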

View File

@ -1,67 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Random;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* MR Mapper that randomizes a list of URLs.
*
* Mapper input is (offset, URL) pairs. Each such pair indicates a file to
* index.
*
* Mapper output is (randomPosition, URL) pairs. The reducer receives these
* pairs sorted by randomPosition.
*/
public class LineRandomizerMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
private Random random;
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
random = createRandom(context);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
LOGGER.debug("map key: {}, value: {}", key, value);
context.write(new LongWritable(random.nextLong()), value);
}
private Random createRandom(Context context) {
long taskId = 0;
if (context.getTaskAttemptID() != null) { // MRUnit returns null
LOGGER.debug("context.getTaskAttemptID().getId(): {}", context.getTaskAttemptID().getId());
LOGGER.debug("context.getTaskAttemptID().getTaskID().getId(): {}", context.getTaskAttemptID().getTaskID().getId());
taskId = context.getTaskAttemptID().getTaskID().getId(); // taskId = 0, 1, ..., N
}
// create a good random seed, yet ensure deterministic PRNG sequence for easy reproducibility
return new Random(421439783L * (taskId + 1));
}
}
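A small sketch of the seeding idea in createRandom(): the seed depends only on the task id, so re-running the same task reproduces the same sequence of random keys (the task id below is made up).
import java.util.Random;
public class LineRandomizerSeedExample {
  public static void main(String[] args) {
    long taskId = 3; // hypothetical task id
    Random first = new Random(421439783L * (taskId + 1));
    Random second = new Random(421439783L * (taskId + 1));
    // Same task id => same seed => same random positions, hence a reproducible shuffle.
    System.out.println(first.nextLong() == second.nextLong()); // true
  }
}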

View File

@ -1,48 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* MR Reducer that randomizes a list of URLs.
*
* Reducer input is (randomPosition, URL) pairs. Each such pair indicates a file
* to index.
*
* Reducer output is a list of URLs, each URL in a random position.
*/
public class LineRandomizerReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
LOGGER.debug("reduce key: {}, value: {}", key, value);
context.write(value, NullWritable.get());
}
}
}

View File

@ -1,233 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import net.sourceforge.argparse4j.inf.Argument;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.ArgumentType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
/**
* ArgumentType implementation for the HDFS Path type, using a fluent style API.
*/
public class PathArgumentType implements ArgumentType<Path> {
private final Configuration conf;
private FileSystem fs;
private boolean acceptSystemIn = false;
private boolean verifyExists = false;
private boolean verifyNotExists = false;
private boolean verifyIsFile = false;
private boolean verifyIsDirectory = false;
private boolean verifyCanRead = false;
private boolean verifyCanWrite = false;
private boolean verifyCanWriteParent = false;
private boolean verifyCanExecute = false;
private boolean verifyIsAbsolute = false;
private boolean verifyHasScheme = false;
private String verifyScheme = null;
public PathArgumentType(Configuration conf) {
this.conf = conf;
}
public PathArgumentType acceptSystemIn() {
acceptSystemIn = true;
return this;
}
public PathArgumentType verifyExists() {
verifyExists = true;
return this;
}
public PathArgumentType verifyNotExists() {
verifyNotExists = true;
return this;
}
public PathArgumentType verifyIsFile() {
verifyIsFile = true;
return this;
}
public PathArgumentType verifyIsDirectory() {
verifyIsDirectory = true;
return this;
}
public PathArgumentType verifyCanRead() {
verifyCanRead = true;
return this;
}
public PathArgumentType verifyCanWrite() {
verifyCanWrite = true;
return this;
}
public PathArgumentType verifyCanWriteParent() {
verifyCanWriteParent = true;
return this;
}
public PathArgumentType verifyCanExecute() {
verifyCanExecute = true;
return this;
}
public PathArgumentType verifyIsAbsolute() {
verifyIsAbsolute = true;
return this;
}
public PathArgumentType verifyHasScheme() {
verifyHasScheme = true;
return this;
}
public PathArgumentType verifyScheme(String scheme) {
verifyScheme = scheme;
return this;
}
@Override
public Path convert(ArgumentParser parser, Argument arg, String value) throws ArgumentParserException {
Path file = new Path(value);
try {
fs = file.getFileSystem(conf);
if (verifyHasScheme && !isSystemIn(file)) {
verifyHasScheme(parser, file);
}
if (verifyScheme != null && !isSystemIn(file)) {
verifyScheme(parser, file);
}
if (verifyIsAbsolute && !isSystemIn(file)) {
verifyIsAbsolute(parser, file);
}
if (verifyExists && !isSystemIn(file)) {
verifyExists(parser, file);
}
if (verifyNotExists && !isSystemIn(file)) {
verifyNotExists(parser, file);
}
if (verifyIsFile && !isSystemIn(file)) {
verifyIsFile(parser, file);
}
if (verifyIsDirectory && !isSystemIn(file)) {
verifyIsDirectory(parser, file);
}
if (verifyCanRead && !isSystemIn(file)) {
verifyCanRead(parser, file);
}
if (verifyCanWrite && !isSystemIn(file)) {
verifyCanWrite(parser, file);
}
if (verifyCanWriteParent && !isSystemIn(file)) {
verifyCanWriteParent(parser, file);
}
if (verifyCanExecute && !isSystemIn(file)) {
verifyCanExecute(parser, file);
}
} catch (IOException e) {
throw new ArgumentParserException(e, parser);
}
return file;
}
private void verifyExists(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
if (!fs.exists(file)) {
throw new ArgumentParserException("File not found: " + file, parser);
}
}
private void verifyNotExists(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
if (fs.exists(file)) {
throw new ArgumentParserException("File found: " + file, parser);
}
}
private void verifyIsFile(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
if (!fs.isFile(file)) {
throw new ArgumentParserException("Not a file: " + file, parser);
}
}
private void verifyIsDirectory(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
if (!fs.isDirectory(file)) {
throw new ArgumentParserException("Not a directory: " + file, parser);
}
}
private void verifyCanRead(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
verifyExists(parser, file);
if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.READ)) {
throw new ArgumentParserException("Insufficient permissions to read file: " + file, parser);
}
}
private void verifyCanWrite(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
verifyExists(parser, file);
if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.WRITE)) {
throw new ArgumentParserException("Insufficient permissions to write file: " + file, parser);
}
}
private void verifyCanWriteParent(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
Path parent = file.getParent();
if (parent == null || !fs.exists(parent) || !fs.getFileStatus(parent).getPermission().getUserAction().implies(FsAction.WRITE)) {
throw new ArgumentParserException("Cannot write parent of file: " + file, parser);
}
}
private void verifyCanExecute(ArgumentParser parser, Path file) throws ArgumentParserException, IOException {
verifyExists(parser, file);
if (!fs.getFileStatus(file).getPermission().getUserAction().implies(FsAction.EXECUTE)) {
throw new ArgumentParserException("Insufficient permissions to execute file: " + file, parser);
}
}
private void verifyIsAbsolute(ArgumentParser parser, Path file) throws ArgumentParserException {
if (!file.isAbsolute()) {
throw new ArgumentParserException("Not an absolute file: " + file, parser);
}
}
private void verifyHasScheme(ArgumentParser parser, Path file) throws ArgumentParserException {
if (file.toUri().getScheme() == null) {
throw new ArgumentParserException("URI scheme is missing in path: " + file, parser);
}
}
private void verifyScheme(ArgumentParser parser, Path file) throws ArgumentParserException {
if (!verifyScheme.equals(file.toUri().getScheme())) {
throw new ArgumentParserException("Scheme of path: " + file + " must be: " + verifyScheme, parser);
}
}
private boolean isSystemIn(Path file) {
return acceptSystemIn && file.toString().equals("-");
}
}
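A minimal sketch of the fluent style this type is meant for, using argparse4j; the argument name and the particular checks are made up and not taken from MapReduceIndexerTool.
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
public class PathArgumentTypeExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ArgumentParser parser = ArgumentParsers.newArgumentParser("example");
    parser.addArgument("--output-dir")
        .type(new PathArgumentType(conf).verifyScheme("hdfs").verifyIsAbsolute())
        .required(true)
        .help("absolute hdfs:// directory to write results to");
    // argparse4j stores "--output-dir" under the dest name "output_dir"
    Path outputDir = parser.parseArgs(args).get("output_dir");
    System.out.println("Validated output dir: " + outputDir);
  }
}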

View File

@ -1,130 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
/**
* Extracts various components of an HDFS Path
*/
public final class PathParts {
private final String uploadURL;
private final Configuration conf;
private final FileSystem fs;
private final Path normalizedPath;
private FileStatus stats;
public PathParts(String uploadURL, Configuration conf) throws IOException {
if (uploadURL == null) {
throw new IllegalArgumentException("Path must not be null: " + uploadURL);
}
this.uploadURL = uploadURL;
if (conf == null) {
throw new IllegalArgumentException("Configuration must not be null: " + uploadURL);
}
this.conf = conf;
URI uri = stringToUri(uploadURL);
this.fs = FileSystem.get(uri, conf);
if (fs == null) {
throw new IllegalArgumentException("File system must not be null: " + uploadURL);
}
this.normalizedPath = fs.makeQualified(new Path(uri));
if (!normalizedPath.isAbsolute()) {
throw new IllegalArgumentException("Path must be absolute: " + uploadURL);
}
if (getScheme() == null) {
throw new IllegalArgumentException("Scheme must not be null: " + uploadURL);
}
if (getHost() == null) {
throw new IllegalArgumentException("Host must not be null: " + uploadURL);
}
if (getPort() < 0) {
throw new IllegalArgumentException("Port must not be negative: " + uploadURL);
}
}
public String getUploadURL() {
return uploadURL;
}
public Path getUploadPath() {
return new Path(getUploadURL());
}
public String getURIPath() {
return normalizedPath.toUri().getPath();
}
public String getName() {
return normalizedPath.getName();
}
public String getScheme() {
return normalizedPath.toUri().getScheme();
}
public String getHost() {
return normalizedPath.toUri().getHost();
}
public int getPort() {
int port = normalizedPath.toUri().getPort();
if (port == -1) {
port = fs.getWorkingDirectory().toUri().getPort();
if (port == -1) {
port = NameNode.DEFAULT_PORT;
}
}
return port;
}
public String getId() {
return getScheme() + "://" + getHost() + ":" + getPort() + getURIPath();
}
public String getDownloadURL() {
return getId();
}
public Configuration getConfiguration() {
return conf;
}
public FileSystem getFileSystem() {
return fs;
}
public FileStatus getFileStatus() throws IOException {
if (stats == null) {
stats = getFileSystem().getFileStatus(getUploadPath());
}
return stats;
}
private URI stringToUri(String pathString) {
//return new Path(pathString).toUri().normalize();
return URI.create(pathString).normalize();
}
}
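A minimal sketch of what PathParts extracts from a fully qualified HDFS URL; host, port, and path are made up, and the HDFS client libraries are assumed to be on the classpath.
import org.apache.hadoop.conf.Configuration;
public class PathPartsExample {
  public static void main(String[] args) throws Exception {
    PathParts parts = new PathParts("hdfs://namenode.example.com:8020/user/foo/part-00000.avro",
        new Configuration());
    System.out.println(parts.getScheme());  // hdfs
    System.out.println(parts.getHost());    // namenode.example.com
    System.out.println(parts.getPort());    // 8020
    System.out.println(parts.getURIPath()); // /user/foo/part-00000.avro
    System.out.println(parts.getName());    // part-00000.avro
    System.out.println(parts.getId());      // hdfs://namenode.example.com:8020/user/foo/part-00000.avro
  }
}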

View File

@ -1,143 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.Hash;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* MapReduce partitioner that partitions the Mapper output such that each
* SolrInputDocument gets sent to the SolrCloud shard that it would have been
* sent to if the document were ingested via the standard SolrCloud Near Real
* Time (NRT) API.
*
* In other words, this class implements the same partitioning semantics as the
* standard SolrCloud NRT API. This makes it possible to mix batch updates from MapReduce
* ingestion with updates from standard NRT ingestion on the same SolrCloud
* cluster, using identical unique document keys.
*/
public class SolrCloudPartitioner extends Partitioner<Text, SolrInputDocumentWritable> implements Configurable {
private Configuration conf;
private DocCollection docCollection;
private Map<String, Integer> shardNumbers;
private int shards = 0;
private final SolrParams emptySolrParams = new MapSolrParams(Collections.EMPTY_MAP);
public static final String SHARDS = SolrCloudPartitioner.class.getName() + ".shards";
public static final String ZKHOST = SolrCloudPartitioner.class.getName() + ".zkHost";
public static final String COLLECTION = SolrCloudPartitioner.class.getName() + ".collection";
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public SolrCloudPartitioner() {}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.shards = conf.getInt(SHARDS, -1);
if (shards <= 0) {
throw new IllegalArgumentException("Illegal shards: " + shards);
}
String zkHost = conf.get(ZKHOST);
if (zkHost == null) {
throw new IllegalArgumentException("zkHost must not be null");
}
String collection = conf.get(COLLECTION);
if (collection == null) {
throw new IllegalArgumentException("collection must not be null");
}
LOG.info("Using SolrCloud zkHost: {}, collection: {}", zkHost, collection);
docCollection = new ZooKeeperInspector().extractDocCollection(zkHost, collection);
if (docCollection == null) {
throw new IllegalArgumentException("docCollection must not be null");
}
if (docCollection.getSlicesMap().size() != shards) {
throw new IllegalArgumentException("Incompatible shards: + " + shards + " for docCollection: " + docCollection);
}
List<Slice> slices = new ZooKeeperInspector().getSortedSlices(docCollection.getSlices());
if (slices.size() != shards) {
throw new IllegalStateException("Incompatible sorted shards: + " + shards + " for docCollection: " + docCollection);
}
shardNumbers = new HashMap<>(10 * slices.size()); // sparse for performance
for (int i = 0; i < slices.size(); i++) {
shardNumbers.put(slices.get(i).getName(), i);
}
LOG.debug("Using SolrCloud docCollection: {}", docCollection);
DocRouter docRouter = docCollection.getRouter();
if (docRouter == null) {
throw new IllegalArgumentException("docRouter must not be null");
}
LOG.info("Using SolrCloud docRouterClass: {}", docRouter.getClass());
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public int getPartition(Text key, SolrInputDocumentWritable value, int numPartitions) {
DocRouter docRouter = docCollection.getRouter();
SolrInputDocument doc = value.getSolrInputDocument();
String keyStr = key.toString();
// TODO: scalability: replace linear search in HashBasedRouter.hashToSlice() with binary search on sorted hash ranges
Slice slice = docRouter.getTargetSlice(keyStr, doc, null, emptySolrParams, docCollection);
// LOG.info("slice: {}", slice);
if (slice == null) {
throw new IllegalStateException("No matching slice found! The slice seems unavailable. docRouterClass: "
+ docRouter.getClass().getName());
}
int rootShard = shardNumbers.get(slice.getName());
if (rootShard < 0 || rootShard >= shards) {
throw new IllegalStateException("Illegal shard number " + rootShard + " for slice: " + slice + ", docCollection: "
+ docCollection);
}
// map doc to micro shard aka leaf shard, akin to HashBasedRouter.sliceHash()
// taking into account mtree merge algorithm
assert numPartitions % shards == 0; // Also note that numPartitions is equal to the number of reducers
int hashCode = Hash.murmurhash3_x86_32(keyStr, 0, keyStr.length(), 0);
int offset = (hashCode & Integer.MAX_VALUE) % (numPartitions / shards);
int microShard = (rootShard * (numPartitions / shards)) + offset;
// LOG.info("Subpartitions rootShard: {}, offset: {}", rootShard, offset);
// LOG.info("Partitioned to p: {} for numPartitions: {}, shards: {}, key: {}, value: {}", microShard, numPartitions, shards, key, value);
assert microShard >= 0 && microShard < numPartitions;
return microShard;
}
}
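A worked sketch of the micro shard arithmetic in getPartition(): with 4 shards and 8 reducers each shard owns two consecutive partitions, and the document's hash picks one of them (all numbers are made up).
public class MicroShardExample {
  public static void main(String[] args) {
    int shards = 4;
    int numPartitions = 8;     // number of reducers; must be a multiple of shards
    int rootShard = 2;         // shard chosen by the DocRouter for this document
    int hashCode = 123456789;  // stand-in for Hash.murmurhash3_x86_32(key, ...)
    int perShard = numPartitions / shards;                  // 2 partitions per shard
    int offset = (hashCode & Integer.MAX_VALUE) % perShard; // 1
    int microShard = rootShard * perShard + offset;         // 5
    System.out.println(microShard); // partitions 4..5 belong to root shard 2
  }
}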

View File

@ -1,53 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
public enum SolrCounters {
DOCUMENTS_WRITTEN (getClassName(SolrReducer.class)
+ ": Number of documents processed"),
BATCHES_WRITTEN (getClassName(SolrReducer.class)
+ ": Number of document batches processed"),
BATCH_WRITE_TIME (getClassName(SolrReducer.class)
+ ": Time spent by reducers writing batches [ms]"),
PHYSICAL_REDUCER_MERGE_TIME (getClassName(SolrReducer.class)
+ ": Time spent by reducers on physical merges [ms]"),
LOGICAL_TREE_MERGE_TIME (getClassName(TreeMergeMapper.class)
+ ": Time spent on logical tree merges [ms]"),
PHYSICAL_TREE_MERGE_TIME (getClassName(TreeMergeMapper.class)
+ ": Time spent on physical tree merges [ms]");
private final String label;
private SolrCounters(String label) {
this.label = label;
}
public String toString() {
return label;
}
private static String getClassName(Class clazz) {
return Utils.getShortClassName(clazz);
}
}

View File

@ -1,66 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.FastOutputStream;
import org.apache.solr.common.util.JavaBinCodec;
public class SolrInputDocumentWritable implements Writable {
private SolrInputDocument sid;
public SolrInputDocumentWritable() {
}
public SolrInputDocumentWritable(SolrInputDocument sid) {
this.sid = sid;
}
public SolrInputDocument getSolrInputDocument() {
return sid;
}
@Override
public String toString() {
return sid.toString();
}
@Override
public void write(DataOutput out) throws IOException {
JavaBinCodec codec = new JavaBinCodec();
FastOutputStream daos = FastOutputStream.wrap(DataOutputOutputStream.constructOutputStream(out));
codec.init(daos);
try {
codec.writeVal(sid);
} finally {
daos.flushBuffer();
}
}
@Override
public void readFields(DataInput in) throws IOException {
JavaBinCodec codec = new JavaBinCodec();
UnbufferedDataInputInputStream dis = new UnbufferedDataInputInputStream(in);
sid = (SolrInputDocument)codec.readVal(dis);
}
}
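A minimal round-trip sketch: serialize with write() and read back with readFields(), as the MapReduce framework does between map and reduce (field names are made up).
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.solr.common.SolrInputDocument;
public class SolrInputDocumentWritableExample {
  public static void main(String[] args) throws Exception {
    SolrInputDocument doc = new SolrInputDocument();
    doc.setField("id", "doc-1");
    doc.setField("text", "hello");
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    new SolrInputDocumentWritable(doc).write(new DataOutputStream(bytes));
    SolrInputDocumentWritable copy = new SolrInputDocumentWritable();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(copy.getSolrInputDocument().getFieldValue("id")); // doc-1
  }
}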

View File

@ -1,39 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SolrMapper<KEYIN, VALUEIN> extends Mapper<KEYIN, VALUEIN, Text, SolrInputDocumentWritable> {
private Path solrHomeDir;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Utils.getLogConfigFile(context.getConfiguration());
super.setup(context);
solrHomeDir = SolrRecordWriter.findSolrConfig(context.getConfiguration());
}
protected Path getSolrHomeDir() {
return solrHomeDir;
}
}

View File

@ -1,280 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class SolrOutputFormat<K, V> extends FileOutputFormat<K, V> {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* The parameter used to pass the Solr config zip file information. This will
* be the HDFS path to the configuration zip file.
*/
public static final String SETUP_OK = "solr.output.format.setup";
/** The key used to pass the zip file name through the configuration. */
public static final String ZIP_NAME = "solr.zip.name";
/**
* The base name of the zip file containing the configuration information.
* This file is passed via the distributed cache using a unique name, obtained
* via {@link #getZipName(Configuration jobConf)}.
*/
public static final String ZIP_FILE_BASE_NAME = "solr.zip";
/**
* The key used to pass the boolean configuration parameter that selects
* regular or zip file output.
*/
public static final String OUTPUT_ZIP_FILE = "solr.output.zip.format";
static int defaultSolrWriterThreadCount = 0;
public static final String SOLR_WRITER_THREAD_COUNT = "solr.record.writer.num.threads";
static int defaultSolrWriterQueueSize = 1;
public static final String SOLR_WRITER_QUEUE_SIZE = "solr.record.writer.max.queues.size";
static int defaultSolrBatchSize = 20;
public static final String SOLR_RECORD_WRITER_BATCH_SIZE = "solr.record.writer.batch.size";
public static final String SOLR_RECORD_WRITER_MAX_SEGMENTS = "solr.record.writer.maxSegments";
public static String getSetupOk() {
return SETUP_OK;
}
/** Set the number of threads used for index writing */
public static void setSolrWriterThreadCount(int count, Configuration conf) {
conf.setInt(SOLR_WRITER_THREAD_COUNT, count);
}
/** Get the number of threads used for index writing */
public static int getSolrWriterThreadCount(Configuration conf) {
return conf.getInt(SOLR_WRITER_THREAD_COUNT, defaultSolrWriterThreadCount);
}
/**
* Set the maximum size of the queue for documents to be written to the
* index.
*/
public static void setSolrWriterQueueSize(int count, Configuration conf) {
conf.setInt(SOLR_WRITER_QUEUE_SIZE, count);
}
/** Return the maximum size for the number of documents pending index writing. */
public static int getSolrWriterQueueSize(Configuration conf) {
return conf.getInt(SOLR_WRITER_QUEUE_SIZE, defaultSolrWriterQueueSize);
}
/**
* Return the file name portion of the configuration zip file, from the
* configuration.
*/
public static String getZipName(Configuration conf) {
return conf.get(ZIP_NAME, ZIP_FILE_BASE_NAME);
}
/**
* Configure the job to output zip files of the output index, or full
* directory trees. Zip files are about 1/5th the size of the raw index, and
* much faster to write, but take more CPU to create.
*
* @param output true if should output zip files
* @param conf to use
*/
public static void setOutputZipFormat(boolean output, Configuration conf) {
conf.setBoolean(OUTPUT_ZIP_FILE, output);
}
/**
* Return true if the output should be a zip file of the index, rather than
* the raw index
*
* @param conf to use
* @return true if output zip files is on
*/
public static boolean isOutputZipFormat(Configuration conf) {
return conf.getBoolean(OUTPUT_ZIP_FILE, false);
}
public static String getOutputName(JobContext job) {
return FileOutputFormat.getOutputName(job);
}
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
super.checkOutputSpecs(job);
if (job.getConfiguration().get(SETUP_OK) == null) {
throw new IOException("Solr home cache not set up!");
}
}
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
Utils.getLogConfigFile(context.getConfiguration());
Path workDir = getDefaultWorkFile(context, "");
int batchSize = getBatchSize(context.getConfiguration());
return new SolrRecordWriter<>(context, workDir, batchSize);
}
public static void setupSolrHomeCache(File solrHomeDir, Job job) throws IOException{
File solrHomeZip = createSolrHomeZip(solrHomeDir);
addSolrConfToDistributedCache(job, solrHomeZip);
}
public static File createSolrHomeZip(File solrHomeDir) throws IOException {
return createSolrHomeZip(solrHomeDir, false);
}
private static File createSolrHomeZip(File solrHomeDir, boolean safeToModify) throws IOException {
if (solrHomeDir == null || !(solrHomeDir.exists() && solrHomeDir.isDirectory())) {
throw new IOException("Invalid solr home: " + solrHomeDir);
}
File solrHomeZip = File.createTempFile("solr", ".zip");
createZip(solrHomeDir, solrHomeZip);
return solrHomeZip;
}
public static void addSolrConfToDistributedCache(Job job, File solrHomeZip)
throws IOException {
// Make a reasonably unique name for the zip file in the distributed cache
// to avoid collisions if multiple jobs are running.
String hdfsZipName = UUID.randomUUID().toString() + '.'
+ ZIP_FILE_BASE_NAME;
Configuration jobConf = job.getConfiguration();
jobConf.set(ZIP_NAME, hdfsZipName);
Path zipPath = new Path("/tmp", getZipName(jobConf));
FileSystem fs = FileSystem.get(jobConf);
fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
final URI baseZipUrl = fs.getUri().resolve(
zipPath.toString() + '#' + getZipName(jobConf));
DistributedCache.addCacheArchive(baseZipUrl, jobConf);
LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
LOG.debug("Set zipPath: {}", zipPath);
// Actually send the path for the configuration zip file
jobConf.set(SETUP_OK, zipPath.toString());
}
private static void createZip(File dir, File out) throws IOException {
HashSet<File> files = new HashSet<>();
// take only conf/ and lib/
for (String allowedDirectory : SolrRecordWriter
.getAllowedConfigDirectories()) {
File configDir = new File(dir, allowedDirectory);
boolean configDirExists;
// If the directory does not exist and is required, bail out
if (!(configDirExists = configDir.exists())
&& SolrRecordWriter.isRequiredConfigDirectory(allowedDirectory)) {
throw new IOException(String.format(Locale.ENGLISH,
"required configuration directory %s is not present in %s",
allowedDirectory, dir));
}
if (!configDirExists) {
continue;
}
listFiles(configDir, files); // Store the files in the existing, allowed
// directory configDir, in the list of files
// to store in the zip file
}
Files.deleteIfExists(out.toPath());
int subst = dir.toString().length();
ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out));
byte[] buf = new byte[1024];
for (File f : files) {
ZipEntry ze = new ZipEntry(f.toString().substring(subst));
zos.putNextEntry(ze);
InputStream is = new FileInputStream(f);
int cnt;
while ((cnt = is.read(buf)) >= 0) {
zos.write(buf, 0, cnt);
}
is.close();
zos.flush();
zos.closeEntry();
}
ZipEntry ze = new ZipEntry("solr.xml");
zos.putNextEntry(ze);
zos.write("<solr></solr>".getBytes("UTF-8"));
zos.flush();
zos.closeEntry();
zos.close();
}
private static void listFiles(File dir, Set<File> files) throws IOException {
File[] list = dir.listFiles();
if (list == null && dir.isFile()) {
files.add(dir);
return;
}
for (File f : list) {
if (f.isFile()) {
files.add(f);
} else {
listFiles(f, files);
}
}
}
public static int getBatchSize(Configuration jobConf) {
return jobConf.getInt(SolrOutputFormat.SOLR_RECORD_WRITER_BATCH_SIZE,
defaultSolrBatchSize);
}
public static void setBatchSize(int count, Configuration jobConf) {
jobConf.setInt(SOLR_RECORD_WRITER_BATCH_SIZE, count);
}
}
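A minimal sketch (job name and paths are made up) of wiring this output format into a job: tune the writer knobs on the Configuration, then ship the local Solr home via the distributed cache.
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SolrOutputFormatExample {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "solr-index-build");
    job.setOutputFormatClass(SolrOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path("hdfs:///tmp/outdir"));
    Configuration conf = job.getConfiguration();
    SolrOutputFormat.setSolrWriterThreadCount(2, conf);
    SolrOutputFormat.setSolrWriterQueueSize(100, conf);
    SolrOutputFormat.setBatchSize(500, conf);
    SolrOutputFormat.setOutputZipFormat(false, conf);
    // Zips conf/ (and lib/, if present) from the local Solr home, copies it to HDFS and
    // registers it in the distributed cache; this also sets the SETUP_OK marker that
    // checkOutputSpecs() expects.
    SolrOutputFormat.setupSolrHomeCache(new File("/local/path/to/solr/home"), job);
  }
}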

View File

@ -1,479 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.HdfsDirectoryFactory;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
class SolrRecordWriter<K, V> extends RecordWriter<K, V> {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public final static List<String> allowedConfigDirectories = new ArrayList<>(
Arrays.asList(new String[] { "conf", "lib", "solr.xml", "core1" }));
public final static Set<String> requiredConfigDirectories = new HashSet<>();
static {
requiredConfigDirectories.add("conf");
}
/**
* Return the list of directories names that may be included in the
* configuration data passed to the tasks.
*
* @return an UnmodifiableList of directory names
*/
public static List<String> getAllowedConfigDirectories() {
return Collections.unmodifiableList(allowedConfigDirectories);
}
/**
* Check whether the passed-in directory is required to be present in the
* configuration data set.
*
* @param directory The directory to check
* @return true if the directory is required.
*/
public static boolean isRequiredConfigDirectory(final String directory) {
return requiredConfigDirectories.contains(directory);
}
/** The path that the final index will be written to */
/** The location in a local temporary directory that the index is built in. */
// /**
// * If true, create a zip file of the completed index in the final storage
// * location A .zip will be appended to the final output name if it is not
// * already present.
// */
// private boolean outputZipFile = false;
private final HeartBeater heartBeater;
private final BatchWriter batchWriter;
private final List<SolrInputDocument> batch;
private final int batchSize;
private long numDocsWritten = 0;
private long nextLogTime = System.nanoTime();
private static HashMap<TaskID, Reducer<?,?,?,?>.Context> contextMap = new HashMap<>();
public SolrRecordWriter(TaskAttemptContext context, Path outputShardDir, int batchSize) {
this.batchSize = batchSize;
this.batch = new ArrayList<>(batchSize);
Configuration conf = context.getConfiguration();
// setLogLevel("org.apache.solr.core", "WARN");
// setLogLevel("org.apache.solr.update", "WARN");
heartBeater = new HeartBeater(context);
try {
heartBeater.needHeartBeat();
Path solrHomeDir = SolrRecordWriter.findSolrConfig(conf);
FileSystem fs = outputShardDir.getFileSystem(conf);
EmbeddedSolrServer solr = createEmbeddedSolrServer(solrHomeDir, fs, outputShardDir);
batchWriter = new BatchWriter(solr, batchSize,
context.getTaskAttemptID().getTaskID(),
SolrOutputFormat.getSolrWriterThreadCount(conf),
SolrOutputFormat.getSolrWriterQueueSize(conf));
} catch (Exception e) {
throw new IllegalStateException(String.format(Locale.ENGLISH,
"Failed to initialize record writer for %s, %s", context.getJobName(), conf
.get("mapred.task.id")), e);
} finally {
heartBeater.cancelHeartBeat();
}
}
public static EmbeddedSolrServer createEmbeddedSolrServer(Path solrHomeDir, FileSystem fs, Path outputShardDir)
throws IOException {
LOG.info("Creating embedded Solr server with solrHomeDir: " + solrHomeDir + ", fs: " + fs + ", outputShardDir: " + outputShardDir);
Path solrDataDir = new Path(outputShardDir, "data");
String dataDirStr = solrDataDir.toUri().toString();
SolrResourceLoader loader = new SolrResourceLoader(Paths.get(solrHomeDir.toString()), null, null);
LOG.info(String
.format(Locale.ENGLISH,
"Constructed instance information solr.home %s (%s), instance dir %s, conf dir %s, writing index to solr.data.dir %s, with permdir %s",
solrHomeDir, solrHomeDir.toUri(), loader.getInstancePath(),
loader.getConfigDir(), dataDirStr, outputShardDir));
// TODO: This is fragile and should be well documented
System.setProperty("solr.directoryFactory", HdfsDirectoryFactory.class.getName());
System.setProperty("solr.lock.type", DirectoryFactory.LOCK_TYPE_HDFS);
System.setProperty("solr.hdfs.nrtcachingdirectory", "false");
System.setProperty("solr.hdfs.blockcache.enabled", "false");
System.setProperty("solr.autoCommit.maxTime", "600000");
System.setProperty("solr.autoSoftCommit.maxTime", "-1");
CoreContainer container = new CoreContainer(loader);
container.load();
SolrCore core = container.create("", ImmutableMap.of(CoreDescriptor.CORE_DATADIR, dataDirStr));
if (!(core.getDirectoryFactory() instanceof HdfsDirectoryFactory)) {
throw new UnsupportedOperationException(
"Invalid configuration. Currently, the only DirectoryFactory supported is "
+ HdfsDirectoryFactory.class.getSimpleName());
}
EmbeddedSolrServer solr = new EmbeddedSolrServer(container, "");
return solr;
}
public static void incrementCounter(TaskID taskId, String groupName, String counterName, long incr) {
Reducer<?,?,?,?>.Context context = contextMap.get(taskId);
if (context != null) {
context.getCounter(groupName, counterName).increment(incr);
}
}
public static void incrementCounter(TaskID taskId, Enum<?> counterName, long incr) {
Reducer<?,?,?,?>.Context context = contextMap.get(taskId);
if (context != null) {
context.getCounter(counterName).increment(incr);
}
}
public static void addReducerContext(Reducer<?,?,?,?>.Context context) {
TaskID taskID = context.getTaskAttemptID().getTaskID();
contextMap.put(taskID, context);
}
public static Path findSolrConfig(Configuration conf) throws IOException {
// FIXME when mrunit supports the new cache apis
//URI[] localArchives = context.getCacheArchives();
Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
for (Path unpackedDir : localArchives) {
if (unpackedDir.getName().equals(SolrOutputFormat.getZipName(conf))) {
LOG.info("Using this unpacked directory as solr home: {}", unpackedDir);
return unpackedDir;
}
}
throw new IOException(String.format(Locale.ENGLISH,
"No local cache archives, where is %s:%s", SolrOutputFormat
.getSetupOk(), SolrOutputFormat.getZipName(conf)));
}
/**
* Write a record. This method accumulates records into a batch, and when
* {@link #batchSize} items are present flushes it to the indexer. The writes
* can take a substantial amount of time, depending on {@link #batchSize}. If
* there is heavy disk contention the writes may take more than the 600 second
* default timeout.
*/
@Override
public void write(K key, V value) throws IOException {
heartBeater.needHeartBeat();
try {
try {
SolrInputDocumentWritable sidw = (SolrInputDocumentWritable) value;
batch.add(sidw.getSolrInputDocument());
if (batch.size() >= batchSize) {
batchWriter.queueBatch(batch);
numDocsWritten += batch.size();
if (System.nanoTime() >= nextLogTime) {
LOG.info("docsWritten: {}", numDocsWritten);
nextLogTime += TimeUnit.NANOSECONDS.convert(10, TimeUnit.SECONDS);
}
batch.clear();
}
} catch (SolrServerException e) {
throw new IOException(e);
}
} finally {
heartBeater.cancelHeartBeat();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
if (context != null) {
heartBeater.setProgress(context);
}
try {
heartBeater.needHeartBeat();
if (batch.size() > 0) {
batchWriter.queueBatch(batch);
numDocsWritten += batch.size();
batch.clear();
}
LOG.info("docsWritten: {}", numDocsWritten);
batchWriter.close(context);
// if (outputZipFile) {
// context.setStatus("Writing Zip");
// packZipFile(); // Written to the perm location
// } else {
// context.setStatus("Copying Index");
// fs.completeLocalOutput(perm, temp); // copy to dfs
// }
} catch (Exception e) {
if (e instanceof IOException) {
throw (IOException) e;
}
throw new IOException(e);
} finally {
heartBeater.cancelHeartBeat();
heartBeater.close();
// File tempFile = new File(temp.toString());
// if (tempFile.exists()) {
// FileUtils.forceDelete(new File(temp.toString()));
// }
}
context.setStatus("Done");
}
// private void packZipFile() throws IOException {
// FSDataOutputStream out = null;
// ZipOutputStream zos = null;
// int zipCount = 0;
// LOG.info("Packing zip file for " + perm);
// try {
// out = fs.create(perm, false);
// zos = new ZipOutputStream(out);
//
// String name = perm.getName().replaceAll(".zip$", "");
// LOG.info("adding index directory" + temp);
// zipCount = zipDirectory(conf, zos, name, temp.toString(), temp);
// /**
// for (String configDir : allowedConfigDirectories) {
// if (!isRequiredConfigDirectory(configDir)) {
// continue;
// }
// final Path confPath = new Path(solrHome, configDir);
// LOG.info("adding configdirectory" + confPath);
//
// zipCount += zipDirectory(conf, zos, name, solrHome.toString(), confPath);
// }
// **/
// } catch (Throwable ohFoo) {
// LOG.error("packZipFile exception", ohFoo);
// if (ohFoo instanceof RuntimeException) {
// throw (RuntimeException) ohFoo;
// }
// if (ohFoo instanceof IOException) {
// throw (IOException) ohFoo;
// }
// throw new IOException(ohFoo);
//
// } finally {
// if (zos != null) {
// if (zipCount == 0) { // If no entries were written, only close out, as
// // the zip will throw an error
// LOG.error("No entries written to zip file " + perm);
// fs.delete(perm, false);
// // out.close();
// } else {
// LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm,
// temp));
// zos.close();
// }
// }
// }
// }
//
// /**
// * Write a file to a zip output stream, removing leading path name components
// * from the actual file name when creating the zip file entry.
// *
// * The entry placed in the zip file is <code>baseName</code>/
// * <code>relativePath</code>, where <code>relativePath</code> is constructed
// * by removing a leading <code>root</code> from the path for
// * <code>itemToZip</code>.
// *
// * If <code>itemToZip</code> is an empty directory, it is ignored. If
// * <code>itemToZip</code> is a directory, the contents of the directory are
// * added recursively.
// *
// * @param zos The zip output stream
// * @param baseName The base name to use for the file name entry in the zip
// * file
// * @param root The path to remove from <code>itemToZip</code> to make a
// * relative path name
// * @param itemToZip The path to the file to be added to the zip file
// * @return the number of entries added
// * @throws IOException
// */
// static public int zipDirectory(final Configuration conf,
// final ZipOutputStream zos, final String baseName, final String root,
// final Path itemToZip) throws IOException {
// LOG
// .info(String
// .format("zipDirectory: %s %s %s", baseName, root, itemToZip));
// LocalFileSystem localFs = FileSystem.getLocal(conf);
// int count = 0;
//
// final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
// if (itemStatus.isDirectory()) {
// final FileStatus[] statai = localFs.listStatus(itemToZip);
//
// // Add a directory entry to the zip file
// final String zipDirName = relativePathForZipEntry(itemToZip.toUri()
// .getPath(), baseName, root);
// final ZipEntry dirZipEntry = new ZipEntry(zipDirName
// + Path.SEPARATOR_CHAR);
// LOG.info(String.format("Adding directory %s to zip", zipDirName));
// zos.putNextEntry(dirZipEntry);
// zos.closeEntry();
// count++;
//
// if (statai == null || statai.length == 0) {
// LOG.info(String.format("Skipping empty directory %s", itemToZip));
// return count;
// }
// for (FileStatus status : statai) {
// count += zipDirectory(conf, zos, baseName, root, status.getPath());
// }
// LOG.info(String.format("Wrote %d entries for directory %s", count,
// itemToZip));
// return count;
// }
//
// final String inZipPath = relativePathForZipEntry(itemToZip.toUri()
// .getPath(), baseName, root);
//
// if (inZipPath.length() == 0) {
// LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)",
// itemToZip, root, baseName));
// return 0;
// }
//
// // Take empty files in case the place holder is needed
// FSDataInputStream in = null;
// try {
// in = localFs.open(itemToZip);
// final ZipEntry ze = new ZipEntry(inZipPath);
// ze.setTime(itemStatus.getModificationTime());
// // Comments confuse looking at the zip file
// // ze.setComment(itemToZip.toString());
// zos.putNextEntry(ze);
//
// IOUtils.copyBytes(in, zos, conf, false);
// zos.closeEntry();
// LOG.info(String.format("Wrote %d entries for file %s", count, itemToZip));
// return 1;
// } finally {
// in.close();
// }
//
// }
//
// static String relativePathForZipEntry(final String rawPath,
// final String baseName, final String root) {
// String relativePath = rawPath.replaceFirst(Pattern.quote(root.toString()),
// "");
// LOG.info(String.format("RawPath %s, baseName %s, root %s, first %s",
// rawPath, baseName, root, relativePath));
//
// if (relativePath.startsWith(Path.SEPARATOR)) {
// relativePath = relativePath.substring(1);
// }
// LOG.info(String.format(
// "RawPath %s, baseName %s, root %s, post leading slash %s", rawPath,
// baseName, root, relativePath));
// if (relativePath.isEmpty()) {
// LOG.warn(String.format(
// "No data after root (%s) removal from raw path %s", root, rawPath));
// return baseName;
// }
// // Construct the path that will be written to the zip file, including
// // removing any leading '/' characters
// String inZipPath = baseName + Path.SEPARATOR_CHAR + relativePath;
//
// LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 1 %s",
// rawPath, baseName, root, inZipPath));
// if (inZipPath.startsWith(Path.SEPARATOR)) {
// inZipPath = inZipPath.substring(1);
// }
// LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 2 %s",
// rawPath, baseName, root, inZipPath));
//
// return inZipPath;
//
// }
//
/*
static boolean setLogLevel(String packageName, String level) {
Log logger = LogFactory.getLog(packageName);
if (logger == null) {
return false;
}
// look for: org.apache.commons.logging.impl.SLF4JLocationAwareLog
LOG.warn("logger class:"+logger.getClass().getName());
if (logger instanceof Log4JLogger) {
process(((Log4JLogger) logger).getLogger(), level);
return true;
}
if (logger instanceof Jdk14Logger) {
process(((Jdk14Logger) logger).getLogger(), level);
return true;
}
return false;
}
public static void process(org.apache.log4j.Logger log, String level) {
if (level != null) {
log.setLevel(org.apache.log4j.Level.toLevel(level));
}
}
public static void process(java.util.logging.Logger log, String level) {
if (level != null) {
log.setLevel(java.util.logging.Level.parse(level));
}
}
*/
}

View File

@@ -1,188 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.RetainMostRecentUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.UpdateConflictResolver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kitesdk.morphline.api.ExceptionHandler;
import org.kitesdk.morphline.base.FaultTolerance;
import com.google.common.base.Preconditions;
/**
* This class loads the mapper's SolrInputDocuments into one EmbeddedSolrServer
* per reducer. Each such reducer and Solr server can be seen as a (micro)
* shard. The Solr servers store their data in HDFS.
*
* More specifically, this class consumes a list of &lt;docId, SolrInputDocument&gt;
* pairs, sorted by docId, and sends them to an embedded Solr server to generate
* a Solr index shard from the documents.
*/
public class SolrReducer extends Reducer<Text, SolrInputDocumentWritable, Text, SolrInputDocumentWritable> {
private UpdateConflictResolver resolver;
private HeartBeater heartBeater;
private ExceptionHandler exceptionHandler;
public static final String UPDATE_CONFLICT_RESOLVER = SolrReducer.class.getName() + ".updateConflictResolver";
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
protected void setup(Context context) throws IOException, InterruptedException {
verifyPartitionAssignment(context);
SolrRecordWriter.addReducerContext(context);
Class<? extends UpdateConflictResolver> resolverClass = context.getConfiguration().getClass(
UPDATE_CONFLICT_RESOLVER, RetainMostRecentUpdateConflictResolver.class, UpdateConflictResolver.class);
this.resolver = ReflectionUtils.newInstance(resolverClass, context.getConfiguration());
/*
* Note that ReflectionUtils.newInstance() above also implicitly calls
* resolver.configure(context.getConfiguration()) if the resolver
* implements org.apache.hadoop.conf.Configurable
*/
this.exceptionHandler = new FaultTolerance(
context.getConfiguration().getBoolean(FaultTolerance.IS_PRODUCTION_MODE, false),
context.getConfiguration().getBoolean(FaultTolerance.IS_IGNORING_RECOVERABLE_EXCEPTIONS, false),
context.getConfiguration().get(FaultTolerance.RECOVERABLE_EXCEPTION_CLASSES, SolrServerException.class.getName()));
this.heartBeater = new HeartBeater(context);
}
protected void reduce(Text key, Iterable<SolrInputDocumentWritable> values, Context context) throws IOException, InterruptedException {
heartBeater.needHeartBeat();
try {
values = resolve(key, values, context);
super.reduce(key, values, context);
} catch (Exception e) {
LOG.error("Unable to process key " + key, e);
context.getCounter(getClass().getName() + ".errors", e.getClass().getName()).increment(1);
exceptionHandler.handleException(e, null);
} finally {
heartBeater.cancelHeartBeat();
}
}
private Iterable<SolrInputDocumentWritable> resolve(
final Text key, final Iterable<SolrInputDocumentWritable> values, final Context context) {
if (resolver instanceof NoChangeUpdateConflictResolver) {
return values; // fast path
}
return new Iterable<SolrInputDocumentWritable>() {
@Override
public Iterator<SolrInputDocumentWritable> iterator() {
return new WrapIterator(resolver.orderUpdates(key, new UnwrapIterator(values.iterator()), context));
}
};
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
heartBeater.close();
super.cleanup(context);
}
/*
 * Verify that if a mapper's partitioner sends an item to partition X it implies that said item
* is sent to the reducer with taskID == X. This invariant is currently required for Solr
* documents to end up in the right Solr shard.
*/
private void verifyPartitionAssignment(Context context) {
if ("true".equals(System.getProperty("verifyPartitionAssignment", "true"))) {
String partitionStr = context.getConfiguration().get("mapred.task.partition");
if (partitionStr == null) {
partitionStr = context.getConfiguration().get("mapreduce.task.partition");
}
int partition = Integer.parseInt(partitionStr);
int taskId = context.getTaskAttemptID().getTaskID().getId();
Preconditions.checkArgument(partition == taskId,
"mapred.task.partition: " + partition + " not equal to reducer taskId: " + taskId);
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class WrapIterator implements Iterator<SolrInputDocumentWritable> {
private Iterator<SolrInputDocument> parent;
private WrapIterator(Iterator<SolrInputDocument> parent) {
this.parent = parent;
}
@Override
public boolean hasNext() {
return parent.hasNext();
}
@Override
public SolrInputDocumentWritable next() {
return new SolrInputDocumentWritable(parent.next());
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class UnwrapIterator implements Iterator<SolrInputDocument> {
private Iterator<SolrInputDocumentWritable> parent;
private UnwrapIterator(Iterator<SolrInputDocumentWritable> parent) {
this.parent = parent;
}
@Override
public boolean hasNext() {
return parent.hasNext();
}
@Override
public SolrInputDocument next() {
return parent.next().getSolrInputDocument();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
}
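// --- Illustrative sketch (added for clarity; not part of the removed sources) ---
// Shows how a driver job could plug a different UpdateConflictResolver into SolrReducer.
// The Job wiring below is an assumption; only the UPDATE_CONFLICT_RESOLVER key, the
// resolver classes and the reflective lookup in setup() come from the code above.
class SolrReducerConflictResolverSketch {
  static void configure(org.apache.hadoop.mapreduce.Job job) {
    // setup() reads this key and instantiates the class via ReflectionUtils.newInstance().
    job.getConfiguration().setClass(
        SolrReducer.UPDATE_CONFLICT_RESOLVER,
        org.apache.solr.hadoop.dedup.RejectingUpdateConflictResolver.class,
        org.apache.solr.hadoop.dedup.UpdateConflictResolver.class);
  }
}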

View File

@@ -1,90 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.helper.ASCIITextWidthCounter;
import net.sourceforge.argparse4j.helper.TextHelper;
import org.apache.hadoop.util.ToolRunner;
/**
* Nicely formats the output of
* {@link ToolRunner#printGenericCommandUsage(PrintStream)} with the same look and feel that argparse4j uses for help text.
*/
class ToolRunnerHelpFormatter {
public static String getGenericCommandUsage() {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
String msg;
try {
ToolRunner.printGenericCommandUsage(new PrintStream(bout, true, "UTF-8"));
msg = new String(bout.toByteArray(), StandardCharsets.UTF_8);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e); // unreachable
}
BufferedReader reader = new BufferedReader(new StringReader(msg));
StringBuilder result = new StringBuilder();
while (true) {
String line;
try {
line = reader.readLine();
} catch (IOException e) {
throw new RuntimeException(e); // unreachable
}
if (line == null) {
return result.toString(); // EOS
}
if (!line.startsWith("-")) {
result.append(line + "\n");
} else {
line = line.trim();
int i = line.indexOf(" ");
if (i < 0) {
i = line.indexOf('\t');
}
if (i < 0) {
result.append(line + "\n");
} else {
String title = line.substring(0, i).trim();
if (title.length() >= 3 && Character.isLetterOrDigit(title.charAt(1)) && Character.isLetterOrDigit(title.charAt(2))) {
title = "-" + title; // prefer "--libjars" long arg style over "-libjars" style but retain "-D foo" short arg style
}
String help = line.substring(i, line.length()).trim();
StringWriter strWriter = new StringWriter();
PrintWriter writer = new PrintWriter(strWriter, true);
TextHelper.printHelp(writer, title, help, new ASCIITextWidthCounter(), ArgumentParsers.getFormatWidth());
result.append(strWriter.toString());
}
}
}
}
}

View File

@@ -1,46 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* For the meat see {@link TreeMergeOutputFormat}.
*/
public class TreeMergeMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String MAX_SEGMENTS_ON_TREE_MERGE = "maxSegmentsOnTreeMerge";
public static final String SOLR_SHARD_NUMBER = "_solrShardNumber";
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
LOGGER.trace("map key: {}, value: {}", key, value);
context.write(value, NullWritable.get());
}
}
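// --- Illustrative sketch (added for clarity; not part of the removed sources) ---
// Shows how a driver could cap the segment count produced by the tree merge. The Job wiring
// is an assumption; only the MAX_SEGMENTS_ON_TREE_MERGE key, which TreeMergeOutputFormat
// reads before calling IndexWriter.forceMerge(), comes from the code in this commit.
class TreeMergeSegmentCapSketch {
  static void configure(org.apache.hadoop.mapreduce.Job job) {
    // Force-merge each merged output shard down to a single segment.
    job.getConfiguration().setInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, 1);
  }
}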

View File

@@ -1,201 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import com.google.common.base.Preconditions;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.misc.IndexMergeTool;
import org.apache.lucene.store.Directory;
import org.apache.solr.store.hdfs.HdfsDirectory;
import org.apache.solr.update.SolrIndexWriter;
import org.apache.solr.util.RTimer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* See {@link IndexMergeTool}.
*/
public class TreeMergeOutputFormat extends FileOutputFormat<Text, NullWritable> {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException {
Utils.getLogConfigFile(context.getConfiguration());
Path workDir = getDefaultWorkFile(context, "");
return new TreeMergeRecordWriter(context, workDir);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class TreeMergeRecordWriter extends RecordWriter<Text,NullWritable> {
private final Path workDir;
private final List<Path> shards = new ArrayList<>();
private final HeartBeater heartBeater;
private final TaskAttemptContext context;
private static final Logger LOG = log;
public TreeMergeRecordWriter(TaskAttemptContext context, Path workDir) {
this.workDir = new Path(workDir, "data/index");
this.heartBeater = new HeartBeater(context);
this.context = context;
}
@Override
public void write(Text key, NullWritable value) {
LOG.info("map key: {}", key);
heartBeater.needHeartBeat();
try {
Path path = new Path(key.toString());
shards.add(path);
} finally {
heartBeater.cancelHeartBeat();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException {
LOG.debug("Task " + context.getTaskAttemptID() + " merging into dstDir: " + workDir + ", srcDirs: " + shards);
writeShardNumberFile(context);
heartBeater.needHeartBeat();
try {
Directory mergedIndex = new HdfsDirectory(workDir, context.getConfiguration());
// TODO: shouldn't we pull the Version from the solrconfig.xml?
IndexWriterConfig writerConfig = new IndexWriterConfig(null)
.setOpenMode(OpenMode.CREATE).setUseCompoundFile(false)
//.setMergePolicy(mergePolicy) // TODO: grab tuned MergePolicy from solrconfig.xml?
//.setMergeScheduler(...) // TODO: grab tuned MergeScheduler from solrconfig.xml?
;
if (LOG.isDebugEnabled()) {
writerConfig.setInfoStream(System.out);
}
// writerConfig.setRAMBufferSizeMB(100); // improve performance
// writerConfig.setMaxThreadStates(1);
// disable compound file to improve performance
// also see http://lucene.472066.n3.nabble.com/Questions-on-compound-file-format-td489105.html
// also see defaults in SolrIndexConfig
MergePolicy mergePolicy = writerConfig.getMergePolicy();
LOG.debug("mergePolicy was: {}", mergePolicy);
if (mergePolicy instanceof TieredMergePolicy) {
((TieredMergePolicy) mergePolicy).setNoCFSRatio(0.0);
// ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnceExplicit(10000);
// ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnce(10000);
// ((TieredMergePolicy) mergePolicy).setSegmentsPerTier(10000);
} else if (mergePolicy instanceof LogMergePolicy) {
((LogMergePolicy) mergePolicy).setNoCFSRatio(0.0);
}
LOG.info("Using mergePolicy: {}", mergePolicy);
IndexWriter writer = new IndexWriter(mergedIndex, writerConfig);
Directory[] indexes = new Directory[shards.size()];
for (int i = 0; i < shards.size(); i++) {
indexes[i] = new HdfsDirectory(shards.get(i), context.getConfiguration());
}
context.setStatus("Logically merging " + shards.size() + " shards into one shard");
LOG.info("Logically merging " + shards.size() + " shards into one shard: " + workDir);
RTimer timer = new RTimer();
writer.addIndexes(indexes);
// TODO: avoid intermediate copying of files into dst directory; rename the files into the dir instead (cp -> rename)
// This can improve performance and turns this phase into a true "logical" merge, completing in constant time.
// See https://issues.apache.org/jira/browse/LUCENE-4746
timer.stop();
if (LOG.isDebugEnabled()) {
context.getCounter(SolrCounters.class.getName(), SolrCounters.LOGICAL_TREE_MERGE_TIME.toString()).increment((long) timer.getTime());
}
LOG.info("Logical merge took {}ms", timer.getTime());
int maxSegments = context.getConfiguration().getInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, Integer.MAX_VALUE);
context.setStatus("Optimizing Solr: forcing mtree merge down to " + maxSegments + " segments");
LOG.info("Optimizing Solr: forcing tree merge down to {} segments", maxSegments);
timer = new RTimer();
if (maxSegments < Integer.MAX_VALUE) {
writer.forceMerge(maxSegments);
// TODO: consider perf enhancement for no-deletes merges: bulk-copy the postings data
// see http://lucene.472066.n3.nabble.com/Experience-with-large-merge-factors-tp1637832p1647046.html
}
timer.stop();
if (LOG.isDebugEnabled()) {
context.getCounter(SolrCounters.class.getName(), SolrCounters.PHYSICAL_TREE_MERGE_TIME.toString()).increment((long) timer.getTime());
}
LOG.info("Optimizing Solr: done forcing tree merge down to {} segments in {}ms", maxSegments, timer.getTime());
// Set Solr's commit data so the created index is usable by SolrCloud. E.g. Currently SolrCloud relies on
// commitTimeMSec in the commit data to do replication.
//TODO no commitUpdateCommand
SolrIndexWriter.setCommitData(writer, -1);
timer = new RTimer();
LOG.info("Optimizing Solr: Closing index writer");
writer.close();
LOG.info("Optimizing Solr: Done closing index writer in {}ms", timer.getTime());
context.setStatus("Done");
} finally {
heartBeater.cancelHeartBeat();
heartBeater.close();
}
}
/*
* For background see MapReduceIndexerTool.renameTreeMergeShardDirs()
*
* Also see MapReduceIndexerTool.run() method where it uses
* NLineInputFormat.setNumLinesPerSplit(job, options.fanout)
*/
private void writeShardNumberFile(TaskAttemptContext context) throws IOException {
Preconditions.checkArgument(shards.size() > 0);
String shard = shards.get(0).getParent().getParent().getName(); // move up from "data/index"
String taskId = shard.substring("part-m-".length(), shard.length()); // e.g. part-m-00001
int taskNum = Integer.parseInt(taskId);
int outputShardNum = taskNum / shards.size();
LOG.debug("Merging into outputShardNum: " + outputShardNum + " from taskId: " + taskId);
Path shardNumberFile = new Path(workDir.getParent().getParent(), TreeMergeMapper.SOLR_SHARD_NUMBER);
OutputStream out = shardNumberFile.getFileSystem(context.getConfiguration()).create(shardNumberFile);
Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);
writer.write(String.valueOf(outputShardNum));
writer.flush();
writer.close();
}
}
}

View File

@@ -1,114 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
public class UnbufferedDataInputInputStream extends org.apache.solr.common.util.DataInputInputStream {
private final DataInputStream in;
public UnbufferedDataInputInputStream(DataInput in) {
this.in = new DataInputStream(DataInputInputStream.constructInputStream(in));
}
@Override
public void readFully(byte[] b) throws IOException {
in.readFully(b);
}
@Override
public void readFully(byte[] b, int off, int len) throws IOException {
in.readFully(b, off, len);
}
@Override
public int skipBytes(int n) throws IOException {
return in.skipBytes(n);
}
@Override
public boolean readBoolean() throws IOException {
return in.readBoolean();
}
@Override
public byte readByte() throws IOException {
return in.readByte();
}
@Override
public int readUnsignedByte() throws IOException {
return in.readUnsignedByte();
}
@Override
public short readShort() throws IOException {
return in.readShort();
}
@Override
public int readUnsignedShort() throws IOException {
return in.readUnsignedShort();
}
@Override
public char readChar() throws IOException {
return in.readChar();
}
@Override
public int readInt() throws IOException {
return in.readInt();
}
@Override
public long readLong() throws IOException {
return in.readLong();
}
@Override
public float readFloat() throws IOException {
return in.readFloat();
}
@Override
public double readDouble() throws IOException {
return in.readDouble();
}
@Override
public String readLine() throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
return reader.readLine();
}
@Override
public String readUTF() throws IOException {
return in.readUTF();
}
@Override
public int read() throws IOException {
return in.read();
}
}

View File

@@ -1,59 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.PropertyConfigurator;
import com.google.common.annotations.Beta;
import org.apache.solr.common.util.SuppressForbidden;
@Beta
public final class Utils {
private static final String LOG_CONFIG_FILE = "hadoop.log4j.configuration";
public static void setLogConfigFile(File file, Configuration conf) {
conf.set(LOG_CONFIG_FILE, file.getName());
}
public static void getLogConfigFile(Configuration conf) {
String log4jPropertiesFile = conf.get(LOG_CONFIG_FILE);
configureLog4jProperties(log4jPropertiesFile);
}
@SuppressForbidden(reason = "method is specific to log4j")
public static void configureLog4jProperties(String log4jPropertiesFile) {
if (log4jPropertiesFile != null) {
PropertyConfigurator.configure(log4jPropertiesFile);
}
}
public static String getShortClassName(Class clazz) {
return getShortClassName(clazz.getName());
}
public static String getShortClassName(String className) {
int i = className.lastIndexOf('.'); // regular class
int j = className.lastIndexOf('$'); // inner class
return className.substring(1 + Math.max(i, j));
}
}
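// --- Illustrative sketch (added for clarity; not part of the removed sources) ---
// Documents the expected behavior of getShortClassName(); the inner-class name below is hypothetical.
class UtilsShortClassNameSketch {
  static void demo() {
    // Regular class: strips everything up to the last '.'  -> "Utils"
    System.out.println(Utils.getShortClassName(Utils.class));
    // Inner class: the '$' separator wins over the last '.' -> "Inner"
    System.out.println(Utils.getShortClassName("com.example.Outer$Inner"));
  }
}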

View File

@@ -1,213 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.Aliases;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkConfigManager;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.StrUtils;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Extracts SolrCloud information from ZooKeeper.
*/
final class ZooKeeperInspector {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public List<List<String>> extractShardUrls(String zkHost, String collection) {
DocCollection docCollection = extractDocCollection(zkHost, collection);
List<Slice> slices = getSortedSlices(docCollection.getSlices());
List<List<String>> solrUrls = new ArrayList<>(slices.size());
for (Slice slice : slices) {
if (slice.getLeader() == null) {
throw new IllegalArgumentException("Cannot find SolrCloud slice leader. " +
"It looks like not all of your shards are registered in ZooKeeper yet");
}
Collection<Replica> replicas = slice.getReplicas();
List<String> urls = new ArrayList<>(replicas.size());
for (Replica replica : replicas) {
ZkCoreNodeProps props = new ZkCoreNodeProps(replica);
urls.add(props.getCoreUrl());
}
solrUrls.add(urls);
}
return solrUrls;
}
public DocCollection extractDocCollection(String zkHost, String collection) {
if (collection == null) {
throw new IllegalArgumentException("collection must not be null");
}
SolrZkClient zkClient = getZkClient(zkHost);
try (ZkStateReader zkStateReader = new ZkStateReader(zkClient)) {
try {
// first check for alias
collection = checkForAlias(zkClient, collection);
zkStateReader.createClusterStateWatchersAndUpdate();
} catch (Exception e) {
throw new IllegalArgumentException("Cannot find expected information for SolrCloud in ZooKeeper: " + zkHost, e);
}
try {
return zkStateReader.getClusterState().getCollection(collection);
} catch (SolrException e) {
throw new IllegalArgumentException("Cannot find collection '" + collection + "' in ZooKeeper: " + zkHost, e);
}
} finally {
zkClient.close();
}
}
public SolrZkClient getZkClient(String zkHost) {
if (zkHost == null) {
throw new IllegalArgumentException("zkHost must not be null");
}
SolrZkClient zkClient;
try {
zkClient = new SolrZkClient(zkHost, 30000);
} catch (Exception e) {
throw new IllegalArgumentException("Cannot connect to ZooKeeper: " + zkHost, e);
}
return zkClient;
}
public List<Slice> getSortedSlices(Collection<Slice> slices) {
List<Slice> sorted = new ArrayList<>(slices);
Collections.sort(sorted, (slice1, slice2) -> {
Comparator c = new AlphaNumericComparator();
return c.compare(slice1.getName(), slice2.getName());
});
LOG.trace("Sorted slices: {}", sorted);
return sorted;
}
/**
 * Returns the config name for the given collection name.
 * Borrowed heavily from Solr's ZkController.
*/
public String readConfigName(SolrZkClient zkClient, String collection)
throws KeeperException, InterruptedException {
if (collection == null) {
throw new IllegalArgumentException("collection must not be null");
}
String configName = null;
// first check for alias
collection = checkForAlias(zkClient, collection);
String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
if (LOG.isInfoEnabled()) {
LOG.info("Load collection config from:" + path);
}
byte[] data = zkClient.getData(path, null, null, true);
if(data != null) {
ZkNodeProps props = ZkNodeProps.load(data);
configName = props.getStr(ZkController.CONFIGNAME_PROP);
}
if (configName != null && !zkClient.exists(ZkConfigManager.CONFIGS_ZKNODE + "/" + configName, true)) {
LOG.error("Specified config does not exist in ZooKeeper:" + configName);
throw new IllegalArgumentException("Specified config does not exist in ZooKeeper:"
+ configName);
}
return configName;
}
private String checkForAlias(SolrZkClient zkClient, String collection)
throws KeeperException, InterruptedException {
byte[] aliasData = zkClient.getData(ZkStateReader.ALIASES, null, null, true);
Aliases aliases = ClusterState.load(aliasData);
String alias = aliases.getCollectionAlias(collection);
if (alias != null) {
List<String> aliasList = StrUtils.splitSmart(alias, ",", true);
if (aliasList.size() > 1) {
throw new IllegalArgumentException("collection cannot be an alias that maps to multiple collections");
}
collection = aliasList.get(0);
}
return collection;
}
/**
* Download and return the config directory from ZK
*/
public File downloadConfigDir(SolrZkClient zkClient, String configName)
throws IOException, InterruptedException, KeeperException {
File dir = Files.createTempDir();
dir.deleteOnExit();
ZkConfigManager configManager = new ZkConfigManager(zkClient);
configManager.downloadConfigDir(configName, dir.toPath());
File confDir = new File(dir, "conf");
if (!confDir.isDirectory()) {
// create a temporary directory with "conf" subdir and mv the config in there. This is
// necessary because of CDH-11188; solrctl does not generate nor accept directories with e.g.
// conf/solrconfig.xml which is necessary for proper solr operation. This should work
// even if solrctl changes.
confDir = new File(Files.createTempDir().getAbsolutePath(), "conf");
confDir.getParentFile().deleteOnExit();
Files.move(dir, confDir);
dir = confDir.getParentFile();
}
FileUtils.writeStringToFile(new File(dir, "solr.xml"), "<solr><solrcloud></solrcloud></solr>", "UTF-8");
verifyConfigDir(confDir);
return dir;
}
private void verifyConfigDir(File confDir) throws IOException {
File solrConfigFile = new File(confDir, "solrconfig.xml");
if (!solrConfigFile.exists()) {
throw new IOException("Detected invalid Solr config dir in ZooKeeper - Reason: File not found: "
+ solrConfigFile.getName());
}
if (!solrConfigFile.isFile()) {
throw new IOException("Detected invalid Solr config dir in ZooKeeper - Reason: Not a file: "
+ solrConfigFile.getName());
}
if (!solrConfigFile.canRead()) {
throw new IOException("Insufficient permissions to read file: " + solrConfigFile);
}
}
}
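// --- Illustrative usage sketch (added for clarity; not part of the removed sources) ---
// The zkHost and collection values are hypothetical; only extractShardUrls() and its
// alphanumeric slice ordering come from the class above.
class ZooKeeperInspectorSketch {
  static void printShardUrls() {
    ZooKeeperInspector inspector = new ZooKeeperInspector();
    // One inner list per slice, each entry being the core URL of a replica.
    for (List<String> replicaUrls : inspector.extractShardUrls("127.0.0.1:2181/solr", "collection1")) {
      System.out.println(replicaUrls);
    }
  }
}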

View File

@@ -1,36 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
/**
* UpdateConflictResolver implementation that returns the solr documents in the
* same order as they are received on input, i.e. without change in order.
*/
public final class NoChangeUpdateConflictResolver implements UpdateConflictResolver {
@Override
public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
return updates;
}
}

View File

@@ -1,48 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.util.Collections;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
/**
* UpdateConflictResolver implementation that rejects multiple documents with
* the same key with an exception.
*/
public final class RejectingUpdateConflictResolver implements UpdateConflictResolver {
@Override
public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
SolrInputDocument firstUpdate = null;
while (updates.hasNext()) {
if (firstUpdate == null) {
firstUpdate = updates.next();
assert firstUpdate != null;
} else {
throw new IllegalArgumentException("Update conflict! Documents with the same unique key are forbidden: "
+ key);
}
}
assert firstUpdate != null;
return Collections.singletonList(firstUpdate).iterator();
}
}

View File

@@ -1,114 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.HdfsFileFieldNames;
import org.apache.solr.hadoop.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* UpdateConflictResolver implementation that ignores all but the most recent
* document version, based on a configurable numeric Solr field, which defaults
* to the file_last_modified timestamp.
*/
public class RetainMostRecentUpdateConflictResolver implements UpdateConflictResolver, Configurable {
private Configuration conf;
private String orderByFieldName = ORDER_BY_FIELD_NAME_DEFAULT;
public static final String ORDER_BY_FIELD_NAME_KEY =
RetainMostRecentUpdateConflictResolver.class.getName() + ".orderByFieldName";
public static final String ORDER_BY_FIELD_NAME_DEFAULT = HdfsFileFieldNames.FILE_LAST_MODIFIED;
public static final String COUNTER_GROUP = Utils.getShortClassName(RetainMostRecentUpdateConflictResolver.class);
public static final String DUPLICATES_COUNTER_NAME = "Number of documents ignored as duplicates";
public static final String OUTDATED_COUNTER_NAME = "Number of documents ignored as outdated";
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.orderByFieldName = conf.get(ORDER_BY_FIELD_NAME_KEY, orderByFieldName);
}
@Override
public Configuration getConf() {
return conf;
}
protected String getOrderByFieldName() {
return orderByFieldName;
}
@Override
public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
return getMaximum(updates, getOrderByFieldName(), new SolrInputDocumentComparator.TimeStampComparator(), ctx);
}
/** Returns the most recent document among the colliding updates */
protected Iterator<SolrInputDocument> getMaximum(Iterator<SolrInputDocument> updates, String fieldName,
Comparator child, Context context) {
SolrInputDocumentComparator comp = new SolrInputDocumentComparator(fieldName, child);
SolrInputDocument max = null;
long numDupes = 0;
long numOutdated = 0;
while (updates.hasNext()) {
SolrInputDocument next = updates.next();
assert next != null;
if (max == null) {
max = next;
} else {
int c = comp.compare(next, max);
if (c == 0) {
LOG.debug("Ignoring document version because it is a duplicate: {}", next);
numDupes++;
} else if (c > 0) {
LOG.debug("Ignoring document version because it is outdated: {}", max);
max = next;
numOutdated++;
} else {
LOG.debug("Ignoring document version because it is outdated: {}", next);
numOutdated++;
}
}
}
assert max != null;
if (numDupes > 0) {
context.getCounter(COUNTER_GROUP, DUPLICATES_COUNTER_NAME).increment(numDupes);
}
if (numOutdated > 0) {
context.getCounter(COUNTER_GROUP, OUTDATED_COUNTER_NAME).increment(numOutdated);
}
return Collections.singletonList(max).iterator();
}
}
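// --- Illustrative sketch (added for clarity; not part of the removed sources) ---
// Shows how a job could override the field used to pick the most recent version. The
// Configuration wiring and the "last_modified" field name are assumptions; only
// ORDER_BY_FIELD_NAME_KEY and its file_last_modified default come from the class above.
class RetainMostRecentResolverConfigSketch {
  static void configure(org.apache.hadoop.conf.Configuration conf) {
    // setConf() reads this key; the named field must carry a numeric/timestamp value per document.
    conf.set(RetainMostRecentUpdateConflictResolver.ORDER_BY_FIELD_NAME_KEY, "last_modified");
  }
}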

View File

@@ -1,84 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.util.Comparator;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
/**
* Default mechanism of determining which of two Solr documents with the same
* key is the more recent version.
*/
public final class SolrInputDocumentComparator implements Comparator<SolrInputDocument> {
private Comparator child;
private String fieldName;
SolrInputDocumentComparator(String fieldName, Comparator child) {
this.child = child;
this.fieldName = fieldName;
}
@Override
public int compare(SolrInputDocument doc1, SolrInputDocument doc2) {
SolrInputField f1 = doc1.getField(fieldName);
SolrInputField f2 = doc2.getField(fieldName);
if (f1 == f2) {
return 0;
} else if (f1 == null) {
return -1;
} else if (f2 == null) {
return 1;
}
Object v1 = f1.getFirstValue();
Object v2 = f2.getFirstValue();
return child.compare(v1, v2);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
public static final class TimeStampComparator implements Comparator {
@Override
public int compare(Object v1, Object v2) {
if (v1 == v2) {
return 0;
} else if (v1 == null) {
return -1;
} else if (v2 == null) {
return 1;
}
long t1 = getLong(v1);
long t2 = getLong(v2);
return (t1 < t2 ? -1 : (t1==t2 ? 0 : 1));
}
private long getLong(Object v) {
if (v instanceof Long) {
return ((Long) v).longValue();
} else {
return Long.parseLong(v.toString());
}
}
}
}

View File

@@ -1,79 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.HdfsFileFieldNames;
/**
* UpdateConflictResolver implementation that orders colliding updates ascending
* from least recent to most recent (partial) update, based on a configurable
* numeric Solr field, which defaults to the file_last_modified timestamp.
*/
public class SortingUpdateConflictResolver implements UpdateConflictResolver, Configurable {
private Configuration conf;
private String orderByFieldName = ORDER_BY_FIELD_NAME_DEFAULT;
public static final String ORDER_BY_FIELD_NAME_KEY =
SortingUpdateConflictResolver.class.getName() + ".orderByFieldName";
public static final String ORDER_BY_FIELD_NAME_DEFAULT = HdfsFileFieldNames.FILE_LAST_MODIFIED;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.orderByFieldName = conf.get(ORDER_BY_FIELD_NAME_KEY, orderByFieldName);
}
@Override
public Configuration getConf() {
return conf;
}
protected String getOrderByFieldName() {
return orderByFieldName;
}
@Override
public Iterator<SolrInputDocument> orderUpdates(Text key, Iterator<SolrInputDocument> updates, Context ctx) {
return sort(updates, getOrderByFieldName(), new SolrInputDocumentComparator.TimeStampComparator());
}
protected Iterator<SolrInputDocument> sort(Iterator<SolrInputDocument> updates, String fieldName, Comparator child) {
// TODO: use an external merge sort in the pathological case where there are a huge amount of collisions
List<SolrInputDocument> sortedUpdates = new ArrayList<>(1);
while (updates.hasNext()) {
sortedUpdates.add(updates.next());
}
if (sortedUpdates.size() > 1) { // conflicts are rare
Collections.sort(sortedUpdates, new SolrInputDocumentComparator(fieldName, child));
}
return sortedUpdates.iterator();
}
}

View File

@@ -1,71 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.dedup;
import java.util.Iterator;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.solr.common.SolrInputDocument;
/**
* Interface that enables deduplication and ordering of a series of document
* updates for the same unique document key.
*
* For example, a MapReduce batch job might index multiple files in the same job
* where some of the files contain old and new versions of the very same
* document, using the same unique document key.
*
* Typically, implementations of this interface forbid collisions by throwing an
* exception, or ignore all but the most recent document version, or, in the
* general case, order colliding updates ascending from least recent to most
* recent (partial) update.
*
* The caller of this interface (i.e. the Hadoop Reducer) will then apply the
* updates to Solr in the order returned by the orderUpdates() method.
*
* Configuration: If an UpdateConflictResolver implementation also implements
* {@link Configurable} then the Hadoop Reducer will call
* {@link Configurable#setConf(org.apache.hadoop.conf.Configuration)} on
* instance construction and pass the standard Hadoop configuration information.
*/
public interface UpdateConflictResolver {
/**
* Given a list of all colliding document updates for the same unique document
* key, this method returns zero or more documents in an application specific
* order.
*
* The caller will then apply the updates for this key to Solr in the order
* returned by the orderUpdate() method.
*
* @param uniqueKey
* the document key common to all collidingUpdates mentioned below
* @param collidingUpdates
* all updates in the MapReduce job that have a key equal to
* {@code uniqueKey} mentioned above. The input order is unspecified.
* @param context
* The <code>Context</code> passed from the {@link Reducer}
* implementations.
* @return the order in which the updates shall be applied to Solr
*/
Iterator<SolrInputDocument> orderUpdates(
Text uniqueKey, Iterator<SolrInputDocument> collidingUpdates, Context context);
}
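// --- Illustrative sketch (added for clarity; not part of the removed sources) ---
// A minimal, hypothetical resolver demonstrating the orderUpdates() contract: it keeps only
// the first colliding update for a key and drops the rest.
final class FirstWinsUpdateConflictResolverSketch implements UpdateConflictResolver {
  @Override
  public Iterator<SolrInputDocument> orderUpdates(Text uniqueKey, Iterator<SolrInputDocument> collidingUpdates, Context ctx) {
    // The reducer applies to Solr exactly what this iterator yields, in this order.
    if (!collidingUpdates.hasNext()) {
      return java.util.Collections.emptyIterator();
    }
    return java.util.Collections.singletonList(collidingUpdates.next()).iterator();
  }
}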

View File

@@ -1,25 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Dedupe-related code.
*/
package org.apache.solr.hadoop.dedup;

View File

@@ -1,47 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.morphline;
import org.apache.solr.hadoop.Utils;
public enum MorphlineCounters {
FILES_READ (getClassName(MorphlineMapper.class) + ": Number of files read"),
FILE_BYTES_READ (getClassName(MorphlineMapper.class) + ": Number of file bytes read"),
DOCS_READ (getClassName(MorphlineMapper.class) + ": Number of documents read"),
PARSER_OUTPUT_BYTES (getClassName(MorphlineMapper.class) + ": Number of document bytes generated by Tika parser"),
ERRORS (getClassName(MorphlineMapper.class) + ": Number of errors");
private final String label;
private MorphlineCounters(String label) {
this.label = label;
}
public String toString() {
return label;
}
private static String getClassName(Class clazz) {
return Utils.getShortClassName(clazz);
}
}

View File

@@ -1,268 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.morphline;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.hadoop.HdfsFileFieldNames;
import org.apache.solr.hadoop.PathParts;
import org.apache.solr.hadoop.Utils;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.morphlines.solr.SolrMorphlineContext;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Compiler;
import org.kitesdk.morphline.base.FaultTolerance;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Metrics;
import org.kitesdk.morphline.base.Notifications;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.annotations.Beta;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
 * Internal helper for {@link MorphlineMapper} and dryRun mode; this API is for *INTERNAL* use only
* and should not be considered public.
*/
@Beta
public final class MorphlineMapRunner {
private MorphlineContext morphlineContext;
private Command morphline;
private IndexSchema schema;
private Map<String, String> commandLineMorphlineHeaders;
private boolean disableFileOpen;
private String morphlineFileAndId;
private final Timer elapsedTime;
public static final String MORPHLINE_FILE_PARAM = "morphlineFile";
public static final String MORPHLINE_ID_PARAM = "morphlineId";
/**
* Morphline variables can be passed from the CLI to the Morphline, e.g.:
* hadoop ... -D morphlineVariable.zkHost=127.0.0.1:2181/solr
*/
public static final String MORPHLINE_VARIABLE_PARAM = "morphlineVariable";
/**
 * Headers, including MIME types, can also be explicitly forced from the CLI onto the Morphline, e.g.:
* hadoop ... -D morphlineField._attachment_mimetype=text/csv
*/
public static final String MORPHLINE_FIELD_PREFIX = "morphlineField.";
/**
* Flag to disable reading of file contents if indexing just file metadata is sufficient.
* This improves performance and confidentiality.
*/
public static final String DISABLE_FILE_OPEN = "morphlineDisableFileOpen";
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
MorphlineContext getMorphlineContext() {
return morphlineContext;
}
IndexSchema getSchema() {
return schema;
}
public MorphlineMapRunner(Configuration configuration, DocumentLoader loader, String solrHomeDir) throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace("CWD is {}", new File(".").getCanonicalPath());
TreeMap map = new TreeMap();
for (Map.Entry<String,String> entry : configuration) {
map.put(entry.getKey(), entry.getValue());
}
LOG.trace("Configuration:\n" +
map.entrySet().stream().map(Object::toString).collect(Collectors.joining("\n")));
}
FaultTolerance faultTolerance = new FaultTolerance(
configuration.getBoolean(FaultTolerance.IS_PRODUCTION_MODE, false),
configuration.getBoolean(FaultTolerance.IS_IGNORING_RECOVERABLE_EXCEPTIONS, false),
configuration.get(FaultTolerance.RECOVERABLE_EXCEPTION_CLASSES, SolrServerException.class.getName())
);
morphlineContext = new SolrMorphlineContext.Builder()
.setDocumentLoader(loader)
.setExceptionHandler(faultTolerance)
.setMetricRegistry(new MetricRegistry())
.build();
class MySolrLocator extends SolrLocator { // trick to access protected ctor
public MySolrLocator(MorphlineContext ctx) {
super(ctx);
}
}
SolrLocator locator = new MySolrLocator(morphlineContext);
locator.setSolrHomeDir(solrHomeDir);
schema = locator.getIndexSchema();
// rebuild context, now with schema
morphlineContext = new SolrMorphlineContext.Builder()
.setIndexSchema(schema)
.setDocumentLoader(loader)
.setExceptionHandler(faultTolerance)
.setMetricRegistry(morphlineContext.getMetricRegistry())
.build();
String morphlineFile = configuration.get(MORPHLINE_FILE_PARAM);
String morphlineId = configuration.get(MORPHLINE_ID_PARAM);
if (morphlineFile == null || morphlineFile.trim().length() == 0) {
throw new MorphlineCompilationException("Missing parameter: " + MORPHLINE_FILE_PARAM, null);
}
Map morphlineVariables = new HashMap();
for (Map.Entry<String, String> entry : configuration) {
String variablePrefix = MORPHLINE_VARIABLE_PARAM + ".";
if (entry.getKey().startsWith(variablePrefix)) {
morphlineVariables.put(entry.getKey().substring(variablePrefix.length()), entry.getValue());
}
}
Config override = ConfigFactory.parseMap(morphlineVariables);
morphline = new Compiler().compile(new File(morphlineFile), morphlineId, morphlineContext, null, override);
morphlineFileAndId = morphlineFile + "@" + morphlineId;
disableFileOpen = configuration.getBoolean(DISABLE_FILE_OPEN, false);
LOG.debug("disableFileOpen: {}", disableFileOpen);
commandLineMorphlineHeaders = new HashMap();
for (Map.Entry<String,String> entry : configuration) {
if (entry.getKey().startsWith(MORPHLINE_FIELD_PREFIX)) {
commandLineMorphlineHeaders.put(entry.getKey().substring(MORPHLINE_FIELD_PREFIX.length()), entry.getValue());
}
}
LOG.debug("Headers, including MIME types, passed by force from the CLI to morphline: {}", commandLineMorphlineHeaders);
String metricName = MetricRegistry.name(Utils.getShortClassName(getClass()), Metrics.ELAPSED_TIME);
this.elapsedTime = morphlineContext.getMetricRegistry().timer(metricName);
Notifications.notifyBeginTransaction(morphline);
}
/**
* Extract content from the path specified in the value. Key is useless.
*/
public void map(String value, Configuration configuration, Context context) throws IOException {
LOG.info("Processing file {}", value);
InputStream in = null;
Record record = null;
Timer.Context timerContext = elapsedTime.time();
try {
PathParts parts = new PathParts(value.toString(), configuration);
record = getRecord(parts);
if (record == null) {
return; // ignore
}
for (Map.Entry<String, String> entry : commandLineMorphlineHeaders.entrySet()) {
record.replaceValues(entry.getKey(), entry.getValue());
}
long fileLength = parts.getFileStatus().getLen();
if (disableFileOpen) {
in = new ByteArrayInputStream(new byte[0]);
} else {
in = new BufferedInputStream(parts.getFileSystem().open(parts.getUploadPath()));
}
record.put(Fields.ATTACHMENT_BODY, in);
Notifications.notifyStartSession(morphline);
if (!morphline.process(record)) {
LOG.warn("Morphline {} failed to process record: {}", morphlineFileAndId, record);
}
if (context != null) {
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILES_READ.toString()).increment(1);
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILE_BYTES_READ.toString()).increment(fileLength);
}
} catch (Exception e) {
LOG.error("Unable to process file " + value, e);
if (context != null) {
context.getCounter(getClass().getName() + ".errors", e.getClass().getName()).increment(1);
}
morphlineContext.getExceptionHandler().handleException(e, record);
} finally {
timerContext.stop();
if (in != null) {
in.close();
}
}
}
protected Record getRecord(PathParts parts) {
FileStatus stats;
try {
stats = parts.getFileStatus();
} catch (IOException e) {
stats = null;
}
if (stats == null) {
LOG.warn("Ignoring file that somehow has become unavailable since the job was submitted: {}",
parts.getUploadURL());
return null;
}
Record headers = new Record();
//headers.put(getSchema().getUniqueKeyField().getName(), parts.getId()); // use HDFS file path as docId if no docId is specified
headers.put(Fields.BASE_ID, parts.getId()); // with sanitizeUniqueKey command, use HDFS file path as docId if no docId is specified
headers.put(Fields.ATTACHMENT_NAME, parts.getName()); // Tika can use the file name in guessing the right MIME type
// enable indexing and storing of file meta data in Solr
headers.put(HdfsFileFieldNames.FILE_UPLOAD_URL, parts.getUploadURL());
headers.put(HdfsFileFieldNames.FILE_DOWNLOAD_URL, parts.getDownloadURL());
headers.put(HdfsFileFieldNames.FILE_SCHEME, parts.getScheme());
headers.put(HdfsFileFieldNames.FILE_HOST, parts.getHost());
headers.put(HdfsFileFieldNames.FILE_PORT, String.valueOf(parts.getPort()));
headers.put(HdfsFileFieldNames.FILE_PATH, parts.getURIPath());
headers.put(HdfsFileFieldNames.FILE_NAME, parts.getName());
headers.put(HdfsFileFieldNames.FILE_LAST_MODIFIED, String.valueOf(stats.getModificationTime())); // FIXME also add in SpoolDirectorySource
headers.put(HdfsFileFieldNames.FILE_LENGTH, String.valueOf(stats.getLen())); // FIXME also add in SpoolDirectorySource
headers.put(HdfsFileFieldNames.FILE_OWNER, stats.getOwner());
headers.put(HdfsFileFieldNames.FILE_GROUP, stats.getGroup());
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_USER, stats.getPermission().getUserAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_GROUP, stats.getPermission().getGroupAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_OTHER, stats.getPermission().getOtherAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_STICKYBIT, String.valueOf(stats.getPermission().getStickyBit()));
    // TODO: consider adding stats.getAccessTime(), stats.getReplication(), stats.isSymlink(), stats.getBlockSize()
return headers;
}
public void cleanup() {
Notifications.notifyCommitTransaction(morphline);
Notifications.notifyShutdown(morphline);
}
}
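To make the configuration keys documented above concrete, here is a minimal dry-run sketch of driving MorphlineMapRunner outside of MapReduce. The paths and the no-op loader are illustrative assumptions; it presumes the removed map-reduce and morphlines contrib jars (plus a valid Solr home) are available, and it assumes DocumentLoader declares exactly the operations overridden by MyDocumentLoader further down in this commit.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
import org.apache.solr.morphlines.solr.DocumentLoader;

public class MorphlineDryRun {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // The same parameters normally passed as hadoop ... -D options:
    conf.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, "/tmp/solrCellDocumentTypes.conf");      // hypothetical path
    conf.set(MorphlineMapRunner.MORPHLINE_VARIABLE_PARAM + ".zkHost", "127.0.0.1:2181/solr");  // morphlineVariable.*
    conf.set(MorphlineMapRunner.MORPHLINE_FIELD_PREFIX + "_attachment_mimetype", "text/csv");  // morphlineField.*

    // No-op loader: records flow through the morphline but are only printed, not indexed.
    DocumentLoader dryRunLoader = new DocumentLoader() {
      @Override public void beginTransaction() {}
      @Override public void load(SolrInputDocument doc) throws IOException, SolrServerException {
        System.out.println("would index: " + doc);
      }
      @Override public void commitTransaction() {}
      @Override public UpdateResponse rollbackTransaction() throws SolrServerException, IOException {
        return new UpdateResponse();
      }
      @Override public void shutdown() {}
      @Override public SolrPingResponse ping() throws SolrServerException, IOException {
        return new SolrPingResponse();
      }
    };

    MorphlineMapRunner runner = new MorphlineMapRunner(conf, dryRunLoader, "/tmp/solr-home"); // hypothetical Solr home
    runner.map("hdfs://localhost:8020/user/foo/bar.txt", conf, null); // Context may be null outside MapReduce
    runner.cleanup();
  }
}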

View File

@@ -1,193 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.morphline;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.hadoop.HeartBeater;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.SolrMapper;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Counting;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
/**
* This class takes the input files, extracts the relevant content, transforms
* it and hands SolrInputDocuments to a set of reducers.
*
* More specifically, it consumes a list of &lt;offset, hdfsFilePath&gt; input pairs.
 * For each such pair it extracts a set of zero or more SolrInputDocuments and
* sends them to a downstream Reducer. The key for the reducer is the unique id
* of the SolrInputDocument specified in Solr schema.xml.
*/
public class MorphlineMapper extends SolrMapper<LongWritable, Text> {
private Context context;
private MorphlineMapRunner runner;
private HeartBeater heartBeater;
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
protected IndexSchema getSchema() {
return runner.getSchema();
}
protected Context getContext() {
return context;
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
this.context = context;
heartBeater = new HeartBeater(context);
this.runner = new MorphlineMapRunner(
context.getConfiguration(), new MyDocumentLoader(), getSolrHomeDir().toString());
}
/**
* Extract content from the path specified in the value. Key is useless.
*/
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
heartBeater.needHeartBeat();
try {
runner.map(value.toString(), context.getConfiguration(), context);
} finally {
heartBeater.cancelHeartBeat();
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
heartBeater.close();
runner.cleanup();
addMetricsToMRCounters(runner.getMorphlineContext().getMetricRegistry(), context);
super.cleanup(context);
}
private void addMetricsToMRCounters(MetricRegistry metricRegistry, Context context) {
for (Map.Entry<String, Counter> entry : metricRegistry.getCounters().entrySet()) {
addCounting(entry.getKey(), entry.getValue(), 1);
}
for (Map.Entry<String, Histogram> entry : metricRegistry.getHistograms().entrySet()) {
addCounting(entry.getKey(), entry.getValue(), 1);
}
for (Map.Entry<String, Meter> entry : metricRegistry.getMeters().entrySet()) {
addCounting(entry.getKey(), entry.getValue(), 1);
}
for (Map.Entry<String, Timer> entry : metricRegistry.getTimers().entrySet()) {
long nanosPerMilliSec = 1000 * 1000;
addCounting(entry.getKey(), entry.getValue(), nanosPerMilliSec);
}
}
private void addCounting(String metricName, Counting value, long scale) {
context.getCounter("morphline", metricName).increment(value.getCount() / scale);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private final class MyDocumentLoader implements DocumentLoader {
@Override
public void beginTransaction() {
}
@Override
public void load(SolrInputDocument doc) throws IOException, SolrServerException {
String uniqueKeyFieldName = getSchema().getUniqueKeyField().getName();
Object id = doc.getFieldValue(uniqueKeyFieldName);
if (id == null) {
throw new IllegalArgumentException("Missing value for (required) unique document key: " + uniqueKeyFieldName
+ " (see Solr schema.xml)");
}
try {
context.write(new Text(id.toString()), new SolrInputDocumentWritable(doc));
} catch (InterruptedException e) {
throw new IOException("Interrupted while writing " + doc, e);
}
if (LOG.isDebugEnabled()) {
long numParserOutputBytes = 0;
for (SolrInputField field : doc.values()) {
numParserOutputBytes += sizeOf(field.getValue());
}
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.PARSER_OUTPUT_BYTES.toString()).increment(numParserOutputBytes);
}
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.DOCS_READ.toString()).increment(1);
}
// just an approximation
private long sizeOf(Object value) {
if (value instanceof CharSequence) {
return ((CharSequence) value).length();
} else if (value instanceof Integer) {
return 4;
} else if (value instanceof Long) {
return 8;
} else if (value instanceof Collection) {
long size = 0;
for (Object val : (Collection) value) {
size += sizeOf(val);
}
return size;
} else {
return String.valueOf(value).length();
}
}
@Override
public void commitTransaction() {
}
@Override
public UpdateResponse rollbackTransaction() throws SolrServerException, IOException {
return new UpdateResponse();
}
@Override
public void shutdown() {
}
@Override
public SolrPingResponse ping() throws SolrServerException, IOException {
return new SolrPingResponse();
}
}
}
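The class comment above describes the map-side data flow: &lt;offset, hdfsFilePath&gt; pairs in, &lt;docId, SolrInputDocumentWritable&gt; pairs out. The sketch below shows only that map-side wiring; the input-list path, the morphline path, and the choice of NLineInputFormat are illustrative assumptions, and in practice MapReduceIndexerTool performs the full job setup (reducer, Solr output format, solr home zip in the distributed cache).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
import org.apache.solr.hadoop.morphline.MorphlineMapper;

public class MorphlineMapSideWiring {
  public static Job buildMapSide(Configuration conf) throws Exception {
    conf.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, "/tmp/solrCellDocumentTypes.conf"); // hypothetical path
    Job job = Job.getInstance(conf);
    job.setMapperClass(MorphlineMapper.class);
    // Each input line is an HDFS file path; the LongWritable offset key is ignored by the mapper.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path("file:///tmp/input-file-list.txt")); // hypothetical input list
    // The mapper emits <unique document id, serialized SolrInputDocument>.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SolrInputDocumentWritable.class);
    // Reducer, Solr output format and solr home distribution are omitted; MapReduceIndexerTool handles them.
    return job;
  }
}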

View File

@@ -1,25 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Morphlines related code.
*/
package org.apache.solr.hadoop.morphline;

View File

@@ -1,25 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* {@link org.apache.solr.hadoop.MapReduceIndexerTool} and related code.
*/
package org.apache.solr.hadoop;

View File

@@ -1,21 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Apache Solr Search Server: Solr MapReduce contrib
</body>
</html>

View File

@@ -1 +0,0 @@
The test files used by this module are located in the morphlines-core module.

View File

@@ -1,46 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.util.Comparator;
import org.junit.Assert;
import org.junit.Test;
public class AlphaNumericComparatorTest extends Assert {
@Test
public void testBasic() {
Comparator c = new AlphaNumericComparator();
assertTrue(c.compare("a", "b") < 0);
assertTrue(c.compare("shard1", "shard1") == 0);
//assertTrue(c.compare("shard01", "shard1") == 0);
assertTrue(c.compare("shard10", "shard10") == 0);
assertTrue(c.compare("shard1", "shard2") < 0);
assertTrue(c.compare("shard9", "shard10") < 0);
assertTrue(c.compare("shard09", "shard10") < 0);
assertTrue(c.compare("shard019", "shard10") > 0);
assertTrue(c.compare("shard10", "shard11") < 0);
assertTrue(c.compare("shard10z", "shard10z") == 0);
assertTrue(c.compare("shard10z", "shard11z") < 0);
assertTrue(c.compare("shard10a", "shard10z") < 0);
assertTrue(c.compare("shard10z", "shard10a") > 0);
assertTrue(c.compare("shard1z", "shard1z") == 0);
assertTrue(c.compare("shard2", "shard1") > 0);
}
}

View File

@@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class IdentityMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
LOGGER.info("map key: {}, value: {}", key, value);
context.write(value, NullWritable.get());
}
}

View File

@@ -1,37 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class IdentityReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
LOGGER.info("reduce key: {}, value: {}", key, values);
context.write(key, NullWritable.get());
}
}

View File

@@ -1,94 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class LineRandomizerMapperReducerTest extends Assert {
private MapReduceDriver<LongWritable, Text, LongWritable, Text, Text, NullWritable> mapReduceDriver;
@Before
public void setUp() {
LineRandomizerMapper mapper = new LineRandomizerMapper();
LineRandomizerReducer reducer = new LineRandomizerReducer();
mapReduceDriver = MapReduceDriver.newMapReduceDriver(mapper, reducer);
}
@Test
public void testMapReduce1Item() throws IOException {
mapReduceDriver.withInput(new LongWritable(0), new Text("hello"));
mapReduceDriver.withOutput(new Text("hello"), NullWritable.get());
mapReduceDriver.runTest();
}
@Test
public void testMapReduce2Items() throws IOException {
mapReduceDriver.withAll(Arrays.asList(
new Pair<>(new LongWritable(0), new Text("hello")),
new Pair<>(new LongWritable(1), new Text("world"))
));
mapReduceDriver.withAllOutput(Arrays.asList(
new Pair<>(new Text("world"), NullWritable.get()),
new Pair<>(new Text("hello"), NullWritable.get())
));
mapReduceDriver.runTest();
}
@Test
public void testMapReduce3Items() throws IOException {
mapReduceDriver.withAll(Arrays.asList(
new Pair<>(new LongWritable(0), new Text("hello")),
new Pair<>(new LongWritable(1), new Text("world")),
new Pair<>(new LongWritable(2), new Text("nadja"))
));
mapReduceDriver.withAllOutput(Arrays.asList(
new Pair<>(new Text("nadja"), NullWritable.get()),
new Pair<>(new Text("world"), NullWritable.get()),
new Pair<>(new Text("hello"), NullWritable.get())
));
mapReduceDriver.runTest();
}
@Test
public void testMapReduce4Items() throws IOException {
mapReduceDriver.withAll(Arrays.asList(
new Pair<>(new LongWritable(0), new Text("hello")),
new Pair<>(new LongWritable(1), new Text("world")),
new Pair<>(new LongWritable(2), new Text("nadja")),
new Pair<>(new LongWritable(3), new Text("basti"))
));
mapReduceDriver.withAllOutput(Arrays.asList(
new Pair<>(new Text("nadja"), NullWritable.get()),
new Pair<>(new Text("world"), NullWritable.get()),
new Pair<>(new Text("basti"), NullWritable.get()),
new Pair<>(new Text("hello"), NullWritable.get())
));
mapReduceDriver.runTest();
}
}

View File

@@ -1,64 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public abstract class MRUnitBase extends SolrTestCaseJ4 {
protected static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
protected static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
protected static File solrHomeZip;
@BeforeClass
public static void setupClass() throws Exception {
assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
new Locale("tr").getLanguage().equals(Locale.getDefault().getLanguage()));
solrHomeZip = SolrOutputFormat.createSolrHomeZip(new File(RESOURCES_DIR + "/solr/mrunit"));
assertNotNull(solrHomeZip);
}
@AfterClass
public static void teardownClass() throws Exception {
if (solrHomeZip != null) Files.delete(solrHomeZip.toPath());
solrHomeZip = null;
}
protected void setupHadoopConfig(Configuration config) throws IOException {
String tempDir = createTempDir().toFile().getAbsolutePath();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf");
config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName());
}
}

View File

@@ -1,468 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.util.Constants;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
import org.apache.solr.hadoop.dedup.RetainMostRecentUpdateConflictResolver;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
public class MapReduceIndexerToolArgumentParserTest extends SolrTestCaseJ4 {
private Configuration conf;
private MapReduceIndexerTool.MyArgumentParser parser;
private MapReduceIndexerTool.Options opts;
private PrintStream oldSystemOut;
private PrintStream oldSystemErr;
private ByteArrayOutputStream bout;
private ByteArrayOutputStream berr;
private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
private static final File MINIMR_INSTANCE_DIR = new File(RESOURCES_DIR + "/solr/minimr");
private static final String MORPHLINE_FILE = RESOURCES_DIR + "/test-morphlines/solrCellDocumentTypes.conf";
private final File solrHomeDirectory = createTempDir().toFile();
@BeforeClass
public static void beforeClass() {
assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);
assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
new Locale("tr").getLanguage().equals(Locale.getDefault().getLanguage()));
}
@Before
public void setUp() throws Exception {
super.setUp();
AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
FileUtils.copyDirectory(MINIMR_INSTANCE_DIR, solrHomeDirectory);
conf = new Configuration();
parser = new MapReduceIndexerTool.MyArgumentParser();
opts = new MapReduceIndexerTool.Options();
oldSystemOut = System.out;
bout = new ByteArrayOutputStream();
System.setOut(new PrintStream(bout, true, "UTF-8"));
oldSystemErr = System.err;
berr = new ByteArrayOutputStream();
System.setErr(new PrintStream(berr, true, "UTF-8"));
}
@After
public void tearDown() throws Exception {
super.tearDown();
System.setOut(oldSystemOut);
System.setErr(oldSystemErr);
}
@Test
public void testArgsParserTypicalUse() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--morphline-id", "morphline_xyz",
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--mappers", "10",
"--reducers", "9",
"--fanout", "8",
"--max-segments", "7",
"--shards", "1",
"--verbose",
"file:///home",
"file:///dev",
};
Integer res = parser.parseArgs(args, conf, opts);
assertNull(res != null ? res.toString() : "", res);
assertEquals(Collections.singletonList(new Path("file:///tmp")), opts.inputLists);
assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
assertEquals(10, opts.mappers);
assertEquals(9, opts.reducers);
assertEquals(8, opts.fanout);
assertEquals(7, opts.maxSegments);
assertEquals(new Integer(1), opts.shards);
assertEquals(null, opts.fairSchedulerPool);
assertTrue(opts.isVerbose);
assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
assertEquals(RetainMostRecentUpdateConflictResolver.class.getName(), opts.updateConflictResolver);
assertEquals(MORPHLINE_FILE, opts.morphlineFile.getPath());
assertEquals("morphline_xyz", opts.morphlineId);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserMultipleSpecsOfSameKind() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--input-list", "file:///",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"file:///home",
"file:///dev",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(Arrays.asList(new Path("file:///tmp"), new Path("file:///")), opts.inputLists);
assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserTypicalUseWithEqualsSign() {
String[] args = new String[] {
"--input-list=file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir=file:/tmp/foo",
"--solr-home-dir=" + MINIMR_INSTANCE_DIR.getPath(),
"--mappers=10",
"--shards", "1",
"--verbose",
"file:///home",
"file:///dev",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(Collections.singletonList(new Path("file:///tmp")), opts.inputLists);
assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
assertEquals(10, opts.mappers);
assertEquals(new Integer(1), opts.shards);
assertEquals(null, opts.fairSchedulerPool);
assertTrue(opts.isVerbose);
assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserMultipleSpecsOfSameKindWithEqualsSign() {
String[] args = new String[] {
"--input-list=file:///tmp",
"--input-list=file:///",
"--morphline-file", MORPHLINE_FILE,
"--output-dir=file:/tmp/foo",
"--solr-home-dir=" + MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"file:///home",
"file:///dev",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(Arrays.asList(new Path("file:///tmp"), new Path("file:///")), opts.inputLists);
assertEquals(Arrays.asList(new Path("file:///home"), new Path("file:///dev")), opts.inputFiles);
assertEquals(new Path("file:/tmp/foo"), opts.outputDir);
assertEquals(new File(MINIMR_INSTANCE_DIR.getPath()), opts.solrHomeDir);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserHelp() throws UnsupportedEncodingException {
String[] args = new String[] { "--help" };
assertEquals(new Integer(0), parser.parseArgs(args, conf, opts));
String helpText = new String(bout.toByteArray(), StandardCharsets.UTF_8);
assertTrue(helpText.contains("MapReduce batch job driver that "));
assertTrue(helpText.contains("bin/hadoop command"));
assertEquals(0, berr.toByteArray().length);
}
@Test
public void testArgsParserOk() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(new Integer(1), opts.shards);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserUpdateConflictResolver() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"--update-conflict-resolver", NoChangeUpdateConflictResolver.class.getName(),
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(NoChangeUpdateConflictResolver.class.getName(), opts.updateConflictResolver);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsParserUnknownArgName() throws Exception {
String[] args = new String[] {
"--xxxxxxxxinputlist", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
};
assertArgumentParserException(args);
}
@Test
public void testArgsParserFileNotFound1() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/fileNotFound/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
};
assertArgumentParserException(args);
}
@Test
public void testArgsParserFileNotFound2() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", "/fileNotFound",
"--shards", "1",
};
assertArgumentParserException(args);
}
@Test
public void testArgsParserIntOutOfRange() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"--mappers", "-20"
};
assertArgumentParserException(args);
}
@Test
public void testArgsParserIllegalFanout() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"--fanout", "1" // must be >= 2
};
assertArgumentParserException(args);
}
@Test
public void testArgsParserSolrHomeMustContainSolrConfigFile() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--shards", "1",
"--solr-home-dir", "/",
};
assertArgumentParserException(args);
}
@Test
public void testArgsShardUrlOk() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shard-url", "http://localhost:8983/solr/collection1",
"--shard-url", "http://localhost:8983/solr/collection2",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEquals(Arrays.asList(
Collections.singletonList("http://localhost:8983/solr/collection1"),
Collections.singletonList("http://localhost:8983/solr/collection2")),
opts.shardUrls);
assertEquals(new Integer(2), opts.shards);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsShardUrlMustHaveAParam() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shard-url",
};
assertArgumentParserException(args);
}
@Test
public void testArgsShardUrlAndShardsSucceeds() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shards", "1",
"--shard-url", "http://localhost:8983/solr/collection1",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsShardUrlNoGoLive() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shard-url", "http://localhost:8983/solr/collection1"
};
assertNull(parser.parseArgs(args, conf, opts));
assertEmptySystemErrAndEmptySystemOut();
assertEquals(new Integer(1), opts.shards);
}
@Test
public void testArgsShardUrlsAndZkhostAreMutuallyExclusive() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shard-url", "http://localhost:8983/solr/collection1",
"--shard-url", "http://localhost:8983/solr/collection1",
"--zk-host", "http://localhost:2185",
"--go-live"
};
assertArgumentParserException(args);
}
@Test
public void testArgsGoLiveAndSolrUrl() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--shard-url", "http://localhost:8983/solr/collection1",
"--shard-url", "http://localhost:8983/solr/collection1",
"--go-live"
};
Integer result = parser.parseArgs(args, conf, opts);
assertNull(result);
assertEmptySystemErrAndEmptySystemOut();
}
@Test
public void testArgsZkHostNoGoLive() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--zk-host", "http://localhost:2185",
};
assertArgumentParserException(args);
}
@Test
public void testArgsGoLiveZkHostNoCollection() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--zk-host", "http://localhost:2185",
"--go-live"
};
assertArgumentParserException(args);
}
@Test
public void testArgsGoLiveNoZkHostOrSolrUrl() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--solr-home-dir", MINIMR_INSTANCE_DIR.getPath(),
"--go-live"
};
assertArgumentParserException(args);
}
@Test
public void testNoSolrHomeDirOrZKHost() throws Exception {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--shards", "1",
};
assertArgumentParserException(args);
}
@Test
public void testZKHostNoSolrHomeDirOk() {
String[] args = new String[] {
"--input-list", "file:///tmp",
"--morphline-file", MORPHLINE_FILE,
"--output-dir", "file:/tmp/foo",
"--zk-host", "http://localhost:2185",
"--collection", "collection1",
};
assertNull(parser.parseArgs(args, conf, opts));
assertEmptySystemErrAndEmptySystemOut();
}
private void assertEmptySystemErrAndEmptySystemOut() {
assertEquals(0, bout.toByteArray().length);
assertEquals(0, berr.toByteArray().length);
}
private void assertArgumentParserException(String[] args) throws UnsupportedEncodingException {
assertEquals("should have returned fail code", new Integer(1), parser.parseArgs(args, conf, opts));
assertEquals("no sys out expected:" + new String(bout.toByteArray(), StandardCharsets.UTF_8), 0, bout.toByteArray().length);
String usageText;
usageText = new String(berr.toByteArray(), StandardCharsets.UTF_8);
assertTrue("should start with usage msg \"usage: hadoop \":" + usageText, usageText.startsWith("usage: hadoop "));
}
}
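For reference, the flag combinations exercised by the parser tests above correspond to a driver invocation like the following hedged sketch. All paths are hypothetical, and in practice the tool was launched through the bin/hadoop command rather than a hand-rolled main.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.solr.hadoop.MapReduceIndexerTool;

public class IndexerToolLauncher {
  public static void main(String[] args) throws Exception {
    String[] toolArgs = {
        "--input-list", "file:///tmp/input-files.txt",         // hypothetical list of input file URIs
        "--morphline-file", "/tmp/solrCellDocumentTypes.conf", // hypothetical morphline config
        "--output-dir", "file:/tmp/outdir",
        "--solr-home-dir", "/tmp/solr-home",                   // hypothetical dir containing conf/solrconfig.xml
        "--shards", "1",
        "--verbose"
    };
    // MapReduceIndexerTool implements the Hadoop Tool interface, so ToolRunner handles generic options.
    int rc = ToolRunner.run(new Configuration(), new MapReduceIndexerTool(), toolArgs);
    System.exit(rc);
  }
}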

View File

@@ -1,415 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.Array;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.authorize.ProxyUsers;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.hack.MiniMRCluster;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.apache.solr.util.BadMrClusterThreadsFilter;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
@ThreadLeakAction({Action.WARN})
@ThreadLeakLingering(linger = 0)
@ThreadLeakZombies(Consequence.CONTINUE)
@ThreadLeakFilters(defaultFilters = true, filters = {
BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
})
@Slow
@Nightly
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {
private static final boolean ENABLE_LOCAL_JOB_RUNNER = false; // for debugging only
private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");
private static String SEARCH_ARCHIVES_JAR;
private static MiniDFSCluster dfsCluster = null;
private static MiniMRCluster mrCluster = null;
private static int numRuns = 0;
private final String inputAvroFile;
private final int count;
private static String tempDir;
private static File solrHomeDirectory;
protected MapReduceIndexerTool createTool() {
return new MapReduceIndexerTool();
}
public MorphlineBasicMiniMRTest() {
int data = random().nextInt(3);
switch (data) {
case 0:
this.inputAvroFile = "sample-statuses-20120906-141433.avro";
this.count = 2;
break;
case 1:
this.inputAvroFile = "sample-statuses-20120521-100919.avro";
this.count = 20;
break;
case 2:
this.inputAvroFile = "sample-statuses-20120906-141433-medium.avro";
this.count = 2104;
break;
default:
throw new RuntimeException("Test setup is broken");
}
}
@BeforeClass
public static void setupClass() throws Exception {
solrHomeDirectory = createTempDir().toFile();
assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
assumeFalse("FIXME: This test does not work with Windows because of native library requirements", Constants.WINDOWS);
AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
FileUtils.copyDirectory(MINIMR_CONF_DIR, solrHomeDirectory);
File dataDir = createTempDir().toFile();
tempDir = dataDir.getAbsolutePath();
new File(tempDir).mkdirs();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());
int taskTrackers = 1;
int dataNodes = 2;
// String proxyUser = System.getProperty("user.name");
// String proxyGroup = "g";
// StringBuilder sb = new StringBuilder();
// sb.append("127.0.0.1,localhost");
// for (InetAddress i : InetAddress.getAllByName(InetAddress.getLocalHost().getHostName())) {
// sb.append(",").append(i.getCanonicalHostName());
// }
new File(dataDir, "nm-local-dirs").mkdirs();
System.setProperty("solr.hdfs.blockcache.enabled", "false");
System.setProperty("test.build.dir", dataDir + File.separator + "hdfs" + File.separator + "test-build-dir");
System.setProperty("test.build.data", dataDir + File.separator + "hdfs" + File.separator + "build");
System.setProperty("test.cache.data", dataDir + File.separator + "hdfs" + File.separator + "cache");
// Initialize AFTER test.build.dir is set, JarFinder uses it.
SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);
JobConf conf = new JobConf();
conf.set("dfs.block.access.token.enable", "false");
conf.set("dfs.permissions", "true");
conf.set("hadoop.security.authentication", "simple");
conf.set(YarnConfiguration.NM_LOCAL_DIRS, dataDir.getPath() + File.separator + "nm-local-dirs");
conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, dataDir + File.separator + "nm-logs");
conf.set("testWorkDir", dataDir.getPath() + File.separator + "testWorkDir");
conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");
dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
FileSystem fileSystem = dfsCluster.getFileSystem();
fileSystem.mkdirs(new Path("/tmp"));
fileSystem.mkdirs(new Path("/user"));
fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
fileSystem.setPermission(new Path("/tmp"), FsPermission.valueOf("-rwxrwxrwx"));
fileSystem.setPermission(new Path("/user"), FsPermission.valueOf("-rwxrwxrwx"));
fileSystem.setPermission(new Path("/hadoop/mapred/system"), FsPermission.valueOf("-rwx------"));
String nnURI = fileSystem.getUri().toString();
int numDirs = 1;
String[] racks = null;
String[] hosts = null;
mrCluster = new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks, hosts, null, conf);
ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
}
@AfterClass
public static void teardownClass() throws Exception {
System.clearProperty("solr.hdfs.blockcache.enabled");
System.clearProperty("test.build.dir");
System.clearProperty("test.build.data");
System.clearProperty("test.cache.data");
if (mrCluster != null) {
mrCluster.shutdown();
mrCluster = null;
}
if (dfsCluster != null) {
dfsCluster.shutdown();
dfsCluster = null;
}
FileSystem.closeAll();
}
@After
public void tearDown() throws Exception {
System.clearProperty("hadoop.log.dir");
System.clearProperty("solr.hdfs.blockcache.enabled");
super.tearDown();
}
private JobConf getJobConf() {
return mrCluster.createJobConf();
}
@Test
public void testPathParts() throws Exception { // see PathParts
FileSystem fs = dfsCluster.getFileSystem();
int dfsClusterPort = fs.getWorkingDirectory().toUri().getPort();
assertTrue(dfsClusterPort > 0);
JobConf jobConf = getJobConf();
Configuration simpleConf = new Configuration();
for (Configuration conf : Arrays.asList(jobConf, simpleConf)) {
for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
for (String up : Arrays.asList("", "../")) {
String down = up.length() == 0 ? "foo/" : "";
String uploadURL = "hdfs://localhost:12345/user/foo/" + up + "bar.txt" + queryAndFragment;
PathParts parts = new PathParts(uploadURL, conf);
assertEquals(uploadURL, parts.getUploadURL());
assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
assertEquals("bar.txt", parts.getName());
assertEquals("hdfs", parts.getScheme());
assertEquals("localhost", parts.getHost());
assertEquals(12345, parts.getPort());
assertEquals("hdfs://localhost:12345/user/" + down + "bar.txt", parts.getId());
assertEquals(parts.getId(), parts.getDownloadURL());
assertFileNotFound(parts);
uploadURL = "hdfs://localhost/user/foo/" + up + "bar.txt" + queryAndFragment;
parts = new PathParts(uploadURL, conf);
assertEquals(uploadURL, parts.getUploadURL());
assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
assertEquals("bar.txt", parts.getName());
assertEquals("hdfs", parts.getScheme());
assertEquals("localhost", parts.getHost());
assertEquals(8020, parts.getPort());
assertEquals("hdfs://localhost:8020/user/" + down + "bar.txt", parts.getId());
assertEquals(parts.getId(), parts.getDownloadURL());
assertFileNotFound(parts);
}
}
}
for (Configuration conf : Arrays.asList(jobConf)) {
for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
for (String up : Arrays.asList("", "../")) {
// verify using absolute path
String down = up.length() == 0 ? "foo/" : "";
String uploadURL = "/user/foo/" + up + "bar.txt" + queryAndFragment;
PathParts parts = new PathParts(uploadURL, conf);
assertEquals(uploadURL, parts.getUploadURL());
assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
assertEquals("bar.txt", parts.getName());
assertEquals("hdfs", parts.getScheme());
assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
assertEquals(dfsClusterPort, parts.getPort());
assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + "/user/" + down + "bar.txt")
|| parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + "/user/" + down + "bar.txt")
);
assertFileNotFound(parts);
// verify relative path is interpreted to be relative to user's home dir and resolved to an absolute path
uploadURL = "xuser/foo/" + up + "bar.txt" + queryAndFragment;
parts = new PathParts(uploadURL, conf);
assertEquals(uploadURL, parts.getUploadURL());
String homeDir = "/user/" + System.getProperty("user.name");
assertEquals(homeDir + "/xuser/" + down + "bar.txt", parts.getURIPath());
assertEquals("bar.txt", parts.getName());
assertEquals("hdfs", parts.getScheme());
assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
assertEquals(dfsClusterPort, parts.getPort());
assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
|| parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
);
assertFileNotFound(parts);
}
}
}
try {
new PathParts("/user/foo/bar.txt", simpleConf);
fail("host/port resolution requires minimr conf, not a simple conf");
} catch (IllegalArgumentException e) {
; // expected
}
}
private void assertFileNotFound(PathParts parts) {
try {
parts.getFileSystem().getFileStatus(parts.getUploadPath());
fail();
} catch (IOException e) {
; // expected
}
}
@Test
public void mrRun() throws Exception {
FileSystem fs = dfsCluster.getFileSystem();
Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
fs.delete(inDir, true);
String DATADIR = "/user/testing/testMapperReducer/data";
Path dataDir = fs.makeQualified(new Path(DATADIR));
fs.delete(dataDir, true);
Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
fs.delete(outDir, true);
assertTrue(fs.mkdirs(inDir));
Path INPATH = new Path(inDir, "input.txt");
OutputStream os = fs.create(INPATH);
Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
wr.write(DATADIR + "/" + inputAvroFile);
wr.close();
assertTrue(fs.mkdirs(dataDir));
fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);
JobConf jobConf = getJobConf();
jobConf.set("jobclient.output.filter", "ALL");
    if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this makes it possible to run in a debugger and set breakpoints
jobConf.set("mapred.job.tracker", "local");
}
jobConf.setMaxMapAttempts(1);
jobConf.setMaxReduceAttempts(1);
jobConf.setJar(SEARCH_ARCHIVES_JAR);
int shards = 2;
int maxReducers = Integer.MAX_VALUE;
if (ENABLE_LOCAL_JOB_RUNNER) {
// local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work.
// see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
maxReducers = 1;
shards = 1;
}
String[] args = new String[] {
"--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
"--morphline-id=morphline1",
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--shards=" + shards,
"--verbose",
numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
numRuns % 3 == 0 ? "--reducers=" + shards : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers))
};
if (numRuns % 3 == 2) {
args = concat(args, new String[] {"--fanout=2"});
}
if (numRuns == 0) {
// force (slow) MapReduce based randomization to get coverage for that as well
args = concat(new String[] {"-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1"}, args);
}
MapReduceIndexerTool tool = createTool();
int res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
Job job = tool.job;
assertTrue(job.isComplete());
assertTrue(job.isSuccessful());
if (numRuns % 3 != 2) {
// Only run this check if mtree merge is disabled.
// With mtree merge enabled the BatchWriter counters aren't available anymore because
// variable "job" now refers to the merge job rather than the indexing job
assertEquals("Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN,
count, job.getCounters().findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString()).getValue());
}
// Check the output is as expected
outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));
System.out.println("outputfiles:" + Arrays.toString(outputFiles));
UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);
// run again with --dryrun mode:
tool = createTool();
args = concat(args, new String[] {"--dry-run"});
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
numRuns++;
}
protected static <T> T[] concat(T[]... arrays) {
if (arrays.length <= 0) {
throw new IllegalArgumentException();
}
    Class<?> clazz = null;
int length = 0;
for (T[] array : arrays) {
clazz = array.getClass();
length += array.length;
}
T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
int pos = 0;
for (T[] array : arrays) {
System.arraycopy(array, 0, result, pos, array.length);
pos += array.length;
}
return result;
}
}
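
The mrRun() test above drives MapReduceIndexerTool the same way the command line does: build an argument array, hand it to ToolRunner together with a Hadoop configuration, and check the exit code. Below is a minimal sketch of that invocation, restricted to the flags exercised above; every path in it is a placeholder rather than a value taken from the test.

// Minimal sketch: invoke MapReduceIndexerTool programmatically, mirroring mrRun() above.
// All paths are placeholders; only the flags themselves come from the test.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.solr.hadoop.MapReduceIndexerTool;

public class IndexerToolSketch {
  public static void main(String[] unused) throws Exception {
    String[] args = {
        "--morphline-file=/placeholder/test-morphlines/solrCellDocumentTypes.conf",
        "--morphline-id=morphline1",
        "--solr-home-dir=/placeholder/solr/minimr",    // local Solr home holding solrconfig.xml and schema.xml
        "--output-dir=hdfs://placeholder:8020/outdir", // shard indexes land under the tool's RESULTS_DIR subdirectory
        "--shards=2",
        "--verbose",
        "hdfs://placeholder:8020/indir"                // input directory, or --input-list=<file of input paths>
    };
    int exitCode = ToolRunner.run(new Configuration(), new MapReduceIndexerTool(), args);
    System.exit(exitCode);                             // 0 on success, as asserted in the test above
  }
}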


@ -1,881 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.lang.reflect.Array;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.security.authorize.ProxyUsers;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.AbstractFullDistribZkTestBase;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.params.CollectionParams.CollectionAction;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.hadoop.hack.MiniMRClientCluster;
import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.apache.solr.util.BadMrClusterThreadsFilter;
import org.apache.solr.util.TimeOut;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
@ThreadLeakAction({Action.WARN})
@ThreadLeakLingering(linger = 0)
@ThreadLeakZombies(Consequence.CONTINUE)
@ThreadLeakFilters(defaultFilters = true, filters = {
BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
})
@SuppressSSL // SSL does not work with this test for currently unknown reasons
@Slow
@Nightly
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
public class MorphlineGoLiveMiniMRTest extends AbstractFullDistribZkTestBase {
private static final int RECORD_COUNT = 2104;
private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
private static final File MINIMR_INSTANCE_DIR = new File(RESOURCES_DIR + "/solr/minimr");
private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");
private static String SEARCH_ARCHIVES_JAR;
private static MiniDFSCluster dfsCluster = null;
private static MiniMRClientCluster mrCluster = null;
private static String tempDir;
private final String inputAvroFile1;
private final String inputAvroFile2;
private final String inputAvroFile3;
private static File solrHomeDirectory;
@Override
public String getSolrHome() {
return solrHomeDirectory.getPath();
}
public MorphlineGoLiveMiniMRTest() {
this.inputAvroFile1 = "sample-statuses-20120521-100919.avro";
this.inputAvroFile2 = "sample-statuses-20120906-141433.avro";
this.inputAvroFile3 = "sample-statuses-20120906-141433-medium.avro";
sliceCount = TEST_NIGHTLY ? 5 : 3;
fixShardCount(TEST_NIGHTLY ? 5 : 3);
}
@BeforeClass
public static void setupClass() throws Exception {
System.setProperty("solr.hdfs.blockcache.global", Boolean.toString(LuceneTestCase.random().nextBoolean()));
System.setProperty("solr.hdfs.blockcache.enabled", Boolean.toString(LuceneTestCase.random().nextBoolean()));
System.setProperty("solr.hdfs.blockcache.blocksperbank", "2048");
solrHomeDirectory = createTempDir().toFile();
assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
assumeFalse("FIXME: This test does not work with Windows because of native library requirements", Constants.WINDOWS);
AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
FileUtils.copyDirectory(MINIMR_INSTANCE_DIR, AbstractZkTestCase.SOLRHOME);
tempDir = createTempDir().toFile().getAbsolutePath();
new File(tempDir).mkdirs();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
System.setProperty("hadoop.log.dir", new File(tempDir, "logs").getAbsolutePath());
int dataNodes = 2;
JobConf conf = new JobConf();
conf.set("dfs.block.access.token.enable", "false");
conf.set("dfs.permissions", "true");
conf.set("hadoop.security.authentication", "simple");
conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");
conf.set(YarnConfiguration.NM_LOCAL_DIRS, tempDir + File.separator + "nm-local-dirs");
conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, tempDir + File.separator + "nm-logs");
new File(tempDir + File.separator + "nm-local-dirs").mkdirs();
System.setProperty("test.build.dir", tempDir + File.separator + "hdfs" + File.separator + "test-build-dir");
System.setProperty("test.build.data", tempDir + File.separator + "hdfs" + File.separator + "build");
System.setProperty("test.cache.data", tempDir + File.separator + "hdfs" + File.separator + "cache");
// Initialize AFTER test.build.dir is set, JarFinder uses it.
SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);
dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
FileSystem fileSystem = dfsCluster.getFileSystem();
fileSystem.mkdirs(new Path("/tmp"));
fileSystem.mkdirs(new Path("/user"));
fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
fileSystem.setPermission(new Path("/tmp"),
FsPermission.valueOf("-rwxrwxrwx"));
fileSystem.setPermission(new Path("/user"),
FsPermission.valueOf("-rwxrwxrwx"));
fileSystem.setPermission(new Path("/hadoop/mapred/system"),
FsPermission.valueOf("-rwx------"));
mrCluster = MiniMRClientClusterFactory.create(MorphlineGoLiveMiniMRTest.class, 1, conf, new File(tempDir, "mrCluster"));
//new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks,
//hosts, null, conf);
ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
}
@Override
public void distribSetUp() throws Exception {
super.distribSetUp();
System.setProperty("host", "127.0.0.1");
System.setProperty("numShards", Integer.toString(sliceCount));
URI uri = dfsCluster.getFileSystem().getUri();
System.setProperty("solr.hdfs.home", uri.toString() + "/" + this.getClass().getName());
uploadConfFiles();
}
@Override
public void distribTearDown() throws Exception {
super.distribTearDown();
System.clearProperty("host");
System.clearProperty("numShards");
System.clearProperty("solr.hdfs.home");
}
@AfterClass
public static void teardownClass() throws Exception {
System.clearProperty("solr.hdfs.blockcache.global");
System.clearProperty("solr.hdfs.blockcache.blocksperbank");
System.clearProperty("solr.hdfs.blockcache.enabled");
System.clearProperty("hadoop.log.dir");
System.clearProperty("test.build.dir");
System.clearProperty("test.build.data");
System.clearProperty("test.cache.data");
if (mrCluster != null) {
mrCluster.stop();
mrCluster = null;
}
if (dfsCluster != null) {
dfsCluster.shutdown();
dfsCluster = null;
}
FileSystem.closeAll();
}
private JobConf getJobConf() throws IOException {
JobConf jobConf = new JobConf(mrCluster.getConfig());
return jobConf;
}
@Test
public void testBuildShardUrls() throws Exception {
// 2x3
Integer numShards = 2;
List<Object> urls = new ArrayList<>();
urls.add("shard1");
urls.add("shard2");
urls.add("shard3");
urls.add("shard4");
urls.add("shard5");
urls.add("shard6");
List<List<String>> shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 2, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(3, u.size());
}
// 1x6
numShards = 1;
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 1, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(6, u.size());
}
// 6x1
numShards = 6;
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 6, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(1, u.size());
}
// 3x2
numShards = 3;
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 3, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(2, u.size());
}
// null shards, 6x1
numShards = null;
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 6, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(1, u.size());
}
// null shards 3x1
numShards = null;
urls = new ArrayList<>();
urls.add("shard1");
urls.add("shard2");
urls.add("shard3");
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 3, shardUrls.size());
for (List<String> u : shardUrls) {
assertEquals(1, u.size());
}
// 2x(2,3) off balance
numShards = 2;
urls = new ArrayList<>();
urls.add("shard1");
urls.add("shard2");
urls.add("shard3");
urls.add("shard4");
urls.add("shard5");
shardUrls = MapReduceIndexerTool.buildShardUrls(urls , numShards);
assertEquals(shardUrls.toString(), 2, shardUrls.size());
Set<Integer> counts = new HashSet<>();
counts.add(shardUrls.get(0).size());
counts.add(shardUrls.get(1).size());
assertTrue(counts.contains(2));
assertTrue(counts.contains(3));
}
private String[] prependInitialArgs(String[] args) {
String[] head = new String[] {
"--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
"--morphline-id=morphline1",
};
return concat(head, args);
}
@Nightly
@Test
public void test() throws Exception {
waitForRecoveriesToFinish(false);
FileSystem fs = dfsCluster.getFileSystem();
Path inDir = fs.makeQualified(new Path(
"/user/testing/testMapperReducer/input"));
fs.delete(inDir, true);
String DATADIR = "/user/testing/testMapperReducer/data";
Path dataDir = fs.makeQualified(new Path(DATADIR));
fs.delete(dataDir, true);
Path outDir = fs.makeQualified(new Path(
"/user/testing/testMapperReducer/output"));
fs.delete(outDir, true);
assertTrue(fs.mkdirs(inDir));
Path INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile1);
JobConf jobConf = getJobConf();
jobConf.set("jobclient.output.filter", "ALL");
    // set mapred.job.tracker=local to run in a debugger and set breakpoints
// jobConf.set("mapred.job.tracker", "local");
jobConf.setMaxMapAttempts(1);
jobConf.setMaxReduceAttempts(1);
jobConf.setJar(SEARCH_ARCHIVES_JAR);
MapReduceIndexerTool tool;
int res;
QueryResponse results;
String[] args = new String[]{};
List<String> argList = new ArrayList<>();
try (HttpSolrClient server = getHttpSolrClient(cloudJettys.get(0).url)) {
args = new String[]{
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--log4j=" + getFile("log4j.properties").getAbsolutePath(),
"--mappers=3",
random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
"--go-live-threads", Integer.toString(random().nextInt(15) + 1),
"--verbose",
"--go-live"
};
args = prependInitialArgs(args);
getShardUrlArgs(argList);
args = concat(args, argList.toArray(new String[0]));
if (true) {
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
results = server.query(new SolrQuery("*:*"));
assertEquals(20, results.getResults().getNumFound());
}
fs.delete(inDir, true);
fs.delete(outDir, true);
fs.delete(dataDir, true);
assertTrue(fs.mkdirs(inDir));
INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile2);
args = new String[]{
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--mappers=3",
"--verbose",
"--go-live",
random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
"--go-live-threads", Integer.toString(random().nextInt(15) + 1)
};
args = prependInitialArgs(args);
getShardUrlArgs(argList);
args = concat(args, argList.toArray(new String[0]));
if (true) {
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
results = server.query(new SolrQuery("*:*"));
assertEquals(22, results.getResults().getNumFound());
}
// try using zookeeper
String collection = "collection1";
if (random().nextBoolean()) {
// sometimes, use an alias
createAlias("updatealias", "collection1");
collection = "updatealias";
}
fs.delete(inDir, true);
fs.delete(outDir, true);
fs.delete(dataDir, true);
INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);
cloudClient.deleteByQuery("*:*");
cloudClient.commit();
assertEquals(0, cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound());
args = new String[]{
"--output-dir=" + outDir.toString(),
"--mappers=3",
"--reducers=6",
"--fanout=2",
"--verbose",
"--go-live",
random().nextBoolean() ? "--input-list=" + INPATH.toString() : dataDir.toString(),
"--zk-host", zkServer.getZkAddress(),
"--collection", collection
};
args = prependInitialArgs(args);
if (true) {
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
SolrDocumentList resultDocs = executeSolrQuery(cloudClient, "*:*");
assertEquals(RECORD_COUNT, resultDocs.getNumFound());
assertEquals(RECORD_COUNT, resultDocs.size());
// perform updates
for (int i = 0; i < RECORD_COUNT; i++) {
SolrDocument doc = resultDocs.get(i);
SolrInputDocument update = new SolrInputDocument();
for (Map.Entry<String, Object> entry : doc.entrySet()) {
update.setField(entry.getKey(), entry.getValue());
}
update.setField("user_screen_name", "Nadja" + i);
update.removeField("_version_");
cloudClient.add(update);
}
cloudClient.commit();
// verify updates
SolrDocumentList resultDocs2 = executeSolrQuery(cloudClient, "*:*");
assertEquals(RECORD_COUNT, resultDocs2.getNumFound());
assertEquals(RECORD_COUNT, resultDocs2.size());
for (int i = 0; i < RECORD_COUNT; i++) {
SolrDocument doc = resultDocs.get(i);
SolrDocument doc2 = resultDocs2.get(i);
assertEquals(doc.getFirstValue("id"), doc2.getFirstValue("id"));
assertEquals("Nadja" + i, doc2.getFirstValue("user_screen_name"));
assertEquals(doc.getFirstValue("text"), doc2.getFirstValue("text"));
// perform delete
cloudClient.deleteById((String) doc.getFirstValue("id"));
}
cloudClient.commit();
// verify deletes
assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
}
cloudClient.deleteByQuery("*:*");
cloudClient.commit();
assertEquals(0, cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound());
}
// try using zookeeper with replication
String replicatedCollection = "replicated_collection";
if (TEST_NIGHTLY) {
createCollection(replicatedCollection, 3, 3, 3);
} else {
createCollection(replicatedCollection, 2, 3, 2);
}
waitForRecoveriesToFinish(false);
cloudClient.setDefaultCollection(replicatedCollection);
fs.delete(inDir, true);
fs.delete(outDir, true);
fs.delete(dataDir, true);
assertTrue(fs.mkdirs(dataDir));
INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);
args = new String[] {
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--mappers=3",
"--reducers=12",
"--fanout=2",
"--verbose",
"--go-live",
"--zk-host", zkServer.getZkAddress(),
"--collection", replicatedCollection, dataDir.toString()
};
args = prependInitialArgs(args);
if (true) {
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
SolrDocumentList resultDocs = executeSolrQuery(cloudClient, "*:*");
assertEquals(RECORD_COUNT, resultDocs.getNumFound());
assertEquals(RECORD_COUNT, resultDocs.size());
checkConsistency(replicatedCollection);
// perform updates
for (int i = 0; i < RECORD_COUNT; i++) {
SolrDocument doc = resultDocs.get(i);
SolrInputDocument update = new SolrInputDocument();
for (Map.Entry<String, Object> entry : doc.entrySet()) {
update.setField(entry.getKey(), entry.getValue());
}
update.setField("user_screen_name", "@Nadja" + i);
update.removeField("_version_");
cloudClient.add(update);
}
cloudClient.commit();
// verify updates
SolrDocumentList resultDocs2 = executeSolrQuery(cloudClient, "*:*");
assertEquals(RECORD_COUNT, resultDocs2.getNumFound());
assertEquals(RECORD_COUNT, resultDocs2.size());
for (int i = 0; i < RECORD_COUNT; i++) {
SolrDocument doc = resultDocs.get(i);
SolrDocument doc2 = resultDocs2.get(i);
assertEquals(doc.getFieldValues("id"), doc2.getFieldValues("id"));
assertEquals(1, doc.getFieldValues("id").size());
assertEquals(Arrays.asList("@Nadja" + i), doc2.getFieldValues("user_screen_name"));
assertEquals(doc.getFieldValues("text"), doc2.getFieldValues("text"));
// perform delete
cloudClient.deleteById((String)doc.getFirstValue("id"));
}
cloudClient.commit();
// verify deletes
assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
}
// try using solr_url with replication
cloudClient.deleteByQuery("*:*");
cloudClient.commit();
assertEquals(0, executeSolrQuery(cloudClient, "*:*").getNumFound());
assertEquals(0, executeSolrQuery(cloudClient, "*:*").size());
fs.delete(inDir, true);
fs.delete(dataDir, true);
assertTrue(fs.mkdirs(dataDir));
INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);
args = new String[] {
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--shards", "2",
"--mappers=3",
"--verbose",
"--go-live",
"--go-live-threads", Integer.toString(random().nextInt(15) + 1), dataDir.toString()
};
args = prependInitialArgs(args);
argList = new ArrayList<>();
getShardUrlArgs(argList, replicatedCollection);
args = concat(args, argList.toArray(new String[0]));
if (true) {
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
checkConsistency(replicatedCollection);
assertEquals(RECORD_COUNT, executeSolrQuery(cloudClient, "*:*").size());
}
// delete collection
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("action", CollectionAction.DELETE.toString());
params.set(CoreAdminParams.DELETE_INSTANCE_DIR, true);
params.set(CoreAdminParams.DELETE_DATA_DIR, true);
params.set(CoreAdminParams.DELETE_INDEX, true);
params.set("name", replicatedCollection);
QueryRequest request = new QueryRequest(params);
request.setPath("/admin/collections");
cloudClient.request(request);
final TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS);
while (cloudClient.getZkStateReader().getClusterState().hasCollection(replicatedCollection)) {
if (timeout.hasTimedOut()) {
throw new AssertionError("Timeout waiting to see removed collection leave clusterstate");
}
Thread.sleep(200);
}
if (TEST_NIGHTLY) {
createCollection(replicatedCollection, 3, 3, 3);
} else {
createCollection(replicatedCollection, 2, 3, 2);
}
waitForRecoveriesToFinish(replicatedCollection, false);
printLayout();
assertEquals(0, executeSolrQuery(cloudClient, "*:*").getNumFound());
args = new String[] {
"--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
"--output-dir=" + outDir.toString(),
"--shards", "2",
"--mappers=3",
"--verbose",
"--go-live",
"--go-live-threads", Integer.toString(random().nextInt(15) + 1), dataDir.toString()
};
args = prependInitialArgs(args);
argList = new ArrayList<>();
getShardUrlArgs(argList, replicatedCollection);
args = concat(args, argList.toArray(new String[0]));
tool = new MapReduceIndexerTool();
res = ToolRunner.run(jobConf, tool, args);
assertEquals(0, res);
assertTrue(tool.job.isComplete());
assertTrue(tool.job.isSuccessful());
checkConsistency(replicatedCollection);
assertEquals(RECORD_COUNT, executeSolrQuery(cloudClient, "*:*").size());
}
private void getShardUrlArgs(List<String> args) {
for (int i = 0; i < getShardCount(); i++) {
args.add("--shard-url");
args.add(cloudJettys.get(i).url);
}
}
private SolrDocumentList executeSolrQuery(SolrClient collection, String queryString) throws SolrServerException, IOException {
SolrQuery query = new SolrQuery(queryString).setRows(2 * RECORD_COUNT).addSort("id", ORDER.asc);
QueryResponse response = collection.query(query);
return response.getResults();
}
private void checkConsistency(String replicatedCollection)
throws Exception {
Collection<Slice> slices = cloudClient.getZkStateReader().getClusterState()
.getSlices(replicatedCollection);
for (Slice slice : slices) {
Collection<Replica> replicas = slice.getReplicas();
long found = -1;
for (Replica replica : replicas) {
try (HttpSolrClient client = getHttpSolrClient(new ZkCoreNodeProps(replica).getCoreUrl())) {
SolrQuery query = new SolrQuery("*:*");
query.set("distrib", false);
QueryResponse replicaResults = client.query(query);
long count = replicaResults.getResults().getNumFound();
if (found != -1) {
assertEquals(slice.getName() + " is inconsistent "
+ new ZkCoreNodeProps(replica).getCoreUrl(), found, count);
}
found = count;
}
}
}
}
private void getShardUrlArgs(List<String> args, String replicatedCollection) {
Collection<Slice> slices = cloudClient.getZkStateReader().getClusterState().getSlices(replicatedCollection);
for (Slice slice : slices) {
Collection<Replica> replicas = slice.getReplicas();
for (Replica replica : replicas) {
args.add("--shard-url");
args.add(new ZkCoreNodeProps(replica).getCoreUrl());
}
}
}
private Path upAvroFile(FileSystem fs, Path inDir, String DATADIR,
Path dataDir, String localFile) throws IOException, UnsupportedEncodingException {
Path INPATH = new Path(inDir, "input.txt");
OutputStream os = fs.create(INPATH);
Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
wr.write(DATADIR + File.separator + localFile);
wr.close();
assertTrue(fs.mkdirs(dataDir));
fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, localFile), dataDir);
return INPATH;
}
@Override
public JettySolrRunner createJetty(File solrHome, String dataDir,
String shardList, String solrConfigOverride, String schemaOverride)
throws Exception {
Properties props = new Properties();
if (solrConfigOverride != null)
props.setProperty("solrconfig", solrConfigOverride);
if (schemaOverride != null)
props.setProperty("schema", schemaOverride);
if (shardList != null)
props.setProperty("shards", shardList);
String collection = System.getProperty("collection");
if (collection == null)
collection = "collection1";
props.setProperty("collection", collection);
JettySolrRunner jetty = new JettySolrRunner(solrHome.getAbsolutePath(), props, buildJettyConfig(context));
jetty.start();
return jetty;
}
private static void putConfig(SolrZkClient zkClient, File solrhome, String name) throws Exception {
putConfig(zkClient, solrhome, name, name);
}
private static void putConfig(SolrZkClient zkClient, File solrhome, String srcName, String destName)
throws Exception {
File file = new File(solrhome, "conf" + File.separator + srcName);
if (!file.exists()) {
// LOG.info("skipping " + file.getAbsolutePath() +
// " because it doesn't exist");
return;
}
String destPath = "/configs/conf1/" + destName;
// LOG.info("put " + file.getAbsolutePath() + " to " + destPath);
zkClient.makePath(destPath, file, false, true);
}
private void uploadConfFiles() throws Exception {
// upload our own config files
SolrZkClient zkClient = new SolrZkClient(zkServer.getZkAddress(), 10000);
putConfig(zkClient, new File(RESOURCES_DIR + "/solr/solrcloud"),
"solrconfig.xml");
putConfig(zkClient, MINIMR_CONF_DIR, "schema.xml");
putConfig(zkClient, MINIMR_CONF_DIR, "elevate.xml");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_en.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ar.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_bg.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ca.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_cz.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_da.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_el.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_es.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_eu.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_de.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fa.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fi.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_fr.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ga.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_gl.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hi.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hu.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_hy.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_id.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_it.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ja.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_lv.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_nl.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_no.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_pt.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ro.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_ru.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_sv.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_th.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stopwords_tr.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_ca.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_fr.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_ga.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/contractions_it.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/stemdict_nl.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "lang/hyphenations_ga.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "stopwords.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "protwords.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "currency.xml");
putConfig(zkClient, MINIMR_CONF_DIR, "open-exchange-rates.json");
putConfig(zkClient, MINIMR_CONF_DIR, "mapping-ISOLatin1Accent.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "old_synonyms.txt");
putConfig(zkClient, MINIMR_CONF_DIR, "synonyms.txt");
zkClient.close();
}
protected static <T> T[] concat(T[]... arrays) {
if (arrays.length <= 0) {
throw new IllegalArgumentException();
}
    Class<?> clazz = null;
int length = 0;
for (T[] array : arrays) {
clazz = array.getClass();
length += array.length;
}
T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
int pos = 0;
for (T[] array : arrays) {
System.arraycopy(array, 0, result, pos, array.length);
pos += array.length;
}
return result;
}
private NamedList<Object> createAlias(String alias, String collections) throws SolrServerException, IOException {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("collections", collections);
params.set("name", alias);
params.set("action", CollectionAction.CREATEALIAS.toString());
QueryRequest request = new QueryRequest(params);
request.setPath("/admin/collections");
return cloudClient.request(request);
}
}
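
testBuildShardUrls() above pins down how MapReduceIndexerTool.buildShardUrls spreads a flat list of shard URLs across a requested number of shards, including the null-count and off-balance cases. Here is a small sketch of the same call outside the test harness, with made-up URLs; it assumes the sketch lives in the org.apache.solr.hadoop package in case the helper is not public.

package org.apache.solr.hadoop; // same package as the tool, in case buildShardUrls is package-private

import java.util.ArrayList;
import java.util.List;

// Sketch: partition five shard URLs into two buckets, the "off balance" case from testBuildShardUrls() above.
public class ShardUrlSketch {
  public static void main(String[] unused) {
    List<Object> urls = new ArrayList<>();
    for (String u : new String[] {"shard1", "shard2", "shard3", "shard4", "shard5"}) {
      urls.add(u);
    }
    List<List<String>> buckets = MapReduceIndexerTool.buildShardUrls(urls, 2);
    System.out.println(buckets); // one bucket holds two URLs, the other three
  }
}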


@ -1,76 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.net.URLEncoder;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.morphline.MorphlineMapper;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.junit.BeforeClass;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
@ThreadLeakFilters(defaultFilters = true, filters = {
BadHdfsThreadsFilter.class // hdfs currently leaks thread(s)
})
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
public class MorphlineMapperTest extends MRUnitBase {
@BeforeClass
public static void beforeClass() {
assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);
}
@Test
public void testMapper() throws Exception {
MorphlineMapper mapper = new MorphlineMapper();
    MapDriver<LongWritable, Text, Text, SolrInputDocumentWritable> mapDriver = MapDriver.newMapDriver(mapper);
Configuration config = mapDriver.getConfiguration();
setupHadoopConfig(config);
mapDriver.withInput(new LongWritable(0L), new Text("hdfs://localhost/" +
URLEncoder.encode(DOCUMENTS_DIR, "UTF-8").replace("+", "%20") +
"/sample-statuses-20120906-141433.avro"));
SolrInputDocument sid = new SolrInputDocument();
sid.addField("id", "uniqueid1");
sid.addField("user_name", "user1");
sid.addField("text", "content of record one");
SolrInputDocumentWritable sidw = new SolrInputDocumentWritable(sid);
mapDriver
.withCacheArchive(solrHomeZip.getAbsolutePath())
.withOutput(new Text("0"), sidw);
//mapDriver.runTest();
List<Pair<Text, SolrInputDocumentWritable>> result = mapDriver.run();
for (Pair<Text, SolrInputDocumentWritable> p: result) {
System.out.println(p.getFirst());
System.out.println(p.getSecond());
}
}
}


@ -1,131 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import com.google.common.collect.Lists;
@Ignore("This test cannot currently work because it uses a local filesystem output path for the indexes and Solr requires hdfs output paths")
public class MorphlineReducerTest extends MRUnitBase {
@BeforeClass
public static void beforeClass2() {
assumeFalse("Does not work on Windows, because it uses UNIX shell commands or POSIX paths", Constants.WINDOWS);
System.setProperty("verifyPartitionAssignment", "false");
}
@AfterClass
public static void afterClass2() {
System.clearProperty("verifyPartitionAssignment");
}
public static class MySolrReducer extends SolrReducer {
Context context;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
this.context = context;
// handle a bug in MRUnit - should be fixed in MRUnit 1.0.0
when(context.getTaskAttemptID()).thenAnswer(new Answer<TaskAttemptID>() {
@Override
public TaskAttemptID answer(final InvocationOnMock invocation) {
// FIXME MRUNIT seems to pass taskid to the reduce task as mapred.TaskID rather than mapreduce.TaskID
return new TaskAttemptID(new TaskID("000000000000", 0, true, 0), 0);
}
});
super.setup(context);
}
}
public static class NullInputFormat<K, V> extends InputFormat<K, V> {
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
InterruptedException {
return Lists.newArrayList();
}
@Override
public RecordReader<K, V> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return null;
}
}
@Test
public void testReducer() throws Exception {
MySolrReducer myReducer = new MySolrReducer();
try {
ReduceDriver<Text,SolrInputDocumentWritable,Text,SolrInputDocumentWritable> reduceDriver = ReduceDriver
.newReduceDriver(myReducer);
Configuration config = reduceDriver.getConfiguration();
setupHadoopConfig(config);
List<SolrInputDocumentWritable> values = new ArrayList<>();
SolrInputDocument sid = new SolrInputDocument();
String id = "myid1";
sid.addField("id", id);
sid.addField("text", "some unique text");
SolrInputDocumentWritable sidw = new SolrInputDocumentWritable(sid);
values.add(sidw);
reduceDriver.withInput(new Text(id), values);
reduceDriver.withCacheArchive(solrHomeZip.getAbsolutePath());
reduceDriver.withOutputFormat(SolrOutputFormat.class,
NullInputFormat.class);
reduceDriver.run();
assertEquals("Expected 1 counter increment", 1,
reduceDriver.getCounters().findCounter(SolrCounters.class.getName(),
SolrCounters.DOCUMENTS_WRITTEN.toString()).getValue());
} finally {
myReducer.cleanup(myReducer.context);
}
}
}
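
Both mrRun() further up and testReducer() above read the same counter to decide whether indexing actually wrote documents: SolrCounters.DOCUMENTS_WRITTEN, published under the SolrCounters group name. A small helper sketch for that lookup follows; it assumes it sits in the org.apache.solr.hadoop package so the counter enum is visible.

package org.apache.solr.hadoop; // same package as SolrCounters, in case the enum is not public

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;

// Sketch: read the documents-written counter from a completed indexing job,
// mirroring the findCounter(...) lookups in mrRun() and testReducer() above.
public final class DocumentsWrittenSketch {
  private DocumentsWrittenSketch() {}

  public static long documentsWritten(Job job) throws IOException {
    return job.getCounters()
        .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString())
        .getValue();
  }
}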


@ -1,57 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import java.io.File;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
public class UtilsForTests {
public static void validateSolrServerDocumentCount(File solrHomeDir, FileSystem fs, Path outDir, int expectedDocs, int expectedShards)
throws IOException, SolrServerException {
long actualDocs = 0;
int actualShards = 0;
for (FileStatus dir : fs.listStatus(outDir)) { // for each shard
if (dir.getPath().getName().startsWith("part") && dir.isDirectory()) {
actualShards++;
try (EmbeddedSolrServer solr
= SolrRecordWriter.createEmbeddedSolrServer(new Path(solrHomeDir.getAbsolutePath()), fs, dir.getPath())) {
SolrQuery query = new SolrQuery();
query.setQuery("*:*");
QueryResponse resp = solr.query(query);
long numDocs = resp.getResults().getNumFound();
actualDocs += numDocs;
}
}
}
assertEquals(expectedShards, actualShards);
assertEquals(expectedDocs, actualDocs);
}
}
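
validateSolrServerDocumentCount opens every part-* directory under the job output as an EmbeddedSolrServer, sums the per-shard document counts, and asserts both the shard count and the total. A sketch of how a caller uses it, mirroring the call in mrRun() further up; the paths and expected numbers are placeholders, and the sketch sits in the org.apache.solr.hadoop package so the RESULTS_DIR constant is reachable.

package org.apache.solr.hadoop; // same package as the tool, so RESULTS_DIR is reachable even if not public

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch: verify a finished MapReduceIndexerTool run produced the expected shards and documents.
// The paths and expected counts are placeholders; the call itself mirrors mrRun() above.
public class ValidateOutputSketch {
  public static void main(String[] unused) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    Path results = new Path("/placeholder/outdir", MapReduceIndexerTool.RESULTS_DIR);
    File solrHomeDir = new File("/placeholder/solr/minimr");
    UtilsForTests.validateSolrServerDocumentCount(solrHomeDir, fs, results, 20 /* docs */, 2 /* shards */);
  }
}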


@ -1,41 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
/*
* A simple interface for a client MR cluster used for testing. This interface
 * provides basic methods which are independent of the underlying mini cluster
 * (whether it is backed by MR1 or MR2).
*/
public interface MiniMRClientCluster {
public void start() throws IOException;
/**
 * Stop and restart the cluster using the same configuration.
*/
public void restart() throws IOException;
public void stop() throws IOException;
public Configuration getConfig() throws IOException;
}


@ -1,88 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.JarFinder;
/**
* A MiniMRCluster factory. In MR2, it provides a wrapper MiniMRClientCluster
 * interface around the MiniMRYarnCluster. In MR1, it provides such a wrapper
 * around MiniMRCluster. This factory should be used in tests to provide
* an easy migration of tests across MR1 and MR2.
*/
public class MiniMRClientClusterFactory {
public static MiniMRClientCluster create(Class<?> caller, int noOfNMs,
Configuration conf, File testWorkDir) throws IOException {
return create(caller, caller.getSimpleName(), noOfNMs, conf, testWorkDir);
}
public static MiniMRClientCluster create(Class<?> caller, String identifier,
int noOfNMs, Configuration conf, File testWorkDir) throws IOException {
if (conf == null) {
conf = new Configuration();
}
FileSystem fs = FileSystem.get(conf);
Path testRootDir = new Path(testWorkDir.getPath(), identifier + "-tmpDir")
.makeQualified(fs);
Path appJar = new Path(testRootDir, "MRAppJar.jar");
// Copy MRAppJar and make it private.
Path appMasterJar = new Path(MiniMRYarnCluster.APPJAR);
fs.copyFromLocalFile(appMasterJar, appJar);
fs.setPermission(appJar, new FsPermission("744"));
Job job = Job.getInstance(conf);
job.addFileToClassPath(appJar);
Path callerJar = new Path(JarFinder.getJar(caller));
Path remoteCallerJar = new Path(testRootDir, callerJar.getName());
fs.copyFromLocalFile(callerJar, remoteCallerJar);
fs.setPermission(remoteCallerJar, new FsPermission("744"));
job.addFileToClassPath(remoteCallerJar);
MiniMRYarnCluster miniMRYarnCluster;
try {
miniMRYarnCluster = new MiniMRYarnCluster(identifier,
noOfNMs, testWorkDir);
} catch (Exception e) {
throw new RuntimeException(e);
}
job.getConfiguration().set("minimrclientcluster.caller.name",
identifier);
job.getConfiguration().setInt("minimrclientcluster.nodemanagers.number",
noOfNMs);
miniMRYarnCluster.init(job.getConfiguration());
miniMRYarnCluster.start();
return new MiniMRYarnClusterAdapter(miniMRYarnCluster, testWorkDir);
}
}
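
MiniMRClientClusterFactory.create ships the MR app-master jar and the caller's own jar into the test filesystem, then boots a MiniMRYarnCluster behind the MiniMRClientCluster facade. Below is a sketch of the lifecycle a test goes through with it, mirroring setupClass()/getJobConf()/teardownClass() in the tests above; the caller class and work directory are placeholders.

// Sketch: stand up the hacked mini MR cluster, derive a JobConf from it, and shut it down.
// Mirrors setupClass()/getJobConf()/teardownClass() above; paths are placeholders.
import java.io.File;
import org.apache.hadoop.mapred.JobConf;
import org.apache.solr.hadoop.hack.MiniMRClientCluster;
import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;

public class MiniClusterSketch {
  public static void main(String[] unused) throws Exception {
    JobConf conf = new JobConf();
    MiniMRClientCluster mrCluster =
        MiniMRClientClusterFactory.create(MiniClusterSketch.class, 1, conf, new File("/tmp/mrCluster"));
    try {
      JobConf jobConf = new JobConf(mrCluster.getConfig()); // job config wired to the mini cluster
      // ... submit jobs with jobConf, e.g. via ToolRunner.run(jobConf, tool, toolArgs) ...
    } finally {
      mrCluster.stop(); // tears down the underlying MiniMRYarnCluster
    }
  }
}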


@ -1,266 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.File;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.JobPriority;
import org.apache.hadoop.mapred.MapTaskCompletionEventsUpdate;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.lucene.util.LuceneTestCase;
/**
 * This class is an MR2 replacement for the older MR1 MiniMRCluster, which was used
* by tests prior to MR2. This replacement class uses the new MiniMRYarnCluster
* in MR2 but provides the same old MR1 interface, so tests can be migrated from
* MR1 to MR2 with minimal changes.
*
* Due to major differences between MR1 and MR2, a number of methods are either
* unimplemented/unsupported or were re-implemented to provide wrappers around
* MR2 functionality.
*
* @deprecated Use {@link org.apache.hadoop.mapred.MiniMRClientClusterFactory}
* instead
*/
@Deprecated
public class MiniMRCluster {
private static final Log LOG = LogFactory.getLog(MiniMRCluster.class);
private MiniMRClientCluster mrClientCluster;
public String getTaskTrackerLocalDir(int taskTracker) {
throw new UnsupportedOperationException();
}
public String[] getTaskTrackerLocalDirs(int taskTracker) {
throw new UnsupportedOperationException();
}
class JobTrackerRunner {
// Mock class
}
class TaskTrackerRunner {
// Mock class
}
public JobTrackerRunner getJobTrackerRunner() {
throw new UnsupportedOperationException();
}
TaskTrackerRunner getTaskTrackerRunner(int id) {
throw new UnsupportedOperationException();
}
public int getNumTaskTrackers() {
throw new UnsupportedOperationException();
}
public void setInlineCleanupThreads() {
throw new UnsupportedOperationException();
}
public void waitUntilIdle() {
throw new UnsupportedOperationException();
}
private void waitTaskTrackers() {
throw new UnsupportedOperationException();
}
public int getJobTrackerPort() {
throw new UnsupportedOperationException();
}
public JobConf createJobConf() {
JobConf jobConf = null;
try {
jobConf = new JobConf(mrClientCluster.getConfig());
} catch (IOException e) {
LOG.error(e);
}
return jobConf;
}
public JobConf createJobConf(JobConf conf) {
JobConf jobConf = null;
try {
jobConf = new JobConf(mrClientCluster.getConfig());
} catch (IOException e) {
LOG.error(e);
}
return jobConf;
}
static JobConf configureJobConf(JobConf conf, String namenode,
int jobTrackerPort, int jobTrackerInfoPort, UserGroupInformation ugi) {
throw new UnsupportedOperationException();
}
public MiniMRCluster(int numTaskTrackers, String namenode, int numDir,
String[] racks, String[] hosts) throws Exception {
this(0, 0, numTaskTrackers, namenode, numDir, racks, hosts);
}
public MiniMRCluster(int numTaskTrackers, String namenode, int numDir,
String[] racks, String[] hosts, JobConf conf) throws Exception {
this(0, 0, numTaskTrackers, namenode, numDir, racks, hosts, null, conf);
}
public MiniMRCluster(int numTaskTrackers, String namenode, int numDir)
throws Exception {
this(0, 0, numTaskTrackers, namenode, numDir);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir) throws Exception {
this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
null);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir, String[] racks)
throws Exception {
this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
racks, null);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir, String[] racks,
String[] hosts) throws Exception {
this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
racks, hosts, null);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir, String[] racks,
String[] hosts, UserGroupInformation ugi) throws Exception {
this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
racks, hosts, ugi, null);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir, String[] racks,
String[] hosts, UserGroupInformation ugi, JobConf conf)
throws Exception {
this(jobTrackerPort, taskTrackerPort, numTaskTrackers, namenode, numDir,
racks, hosts, ugi, conf, 0);
}
public MiniMRCluster(int jobTrackerPort, int taskTrackerPort,
int numTaskTrackers, String namenode, int numDir, String[] racks,
String[] hosts, UserGroupInformation ugi, JobConf conf,
int numTrackerToExclude) throws Exception {
if (conf == null) conf = new JobConf();
FileSystem.setDefaultUri(conf, namenode);
String identifier = this.getClass().getSimpleName() + "_"
+ Integer.toString(LuceneTestCase.random().nextInt(Integer.MAX_VALUE));
mrClientCluster = MiniMRClientClusterFactory.create(this.getClass(),
identifier, numTaskTrackers, conf, new File(conf.get("testWorkDir")));
}
public UserGroupInformation getUgi() {
throw new UnsupportedOperationException();
}
public TaskCompletionEvent[] getTaskCompletionEvents(JobID id, int from,
int max) throws IOException {
throw new UnsupportedOperationException();
}
public void setJobPriority(JobID jobId, JobPriority priority)
throws AccessControlException, IOException {
throw new UnsupportedOperationException();
}
public JobPriority getJobPriority(JobID jobId) {
throw new UnsupportedOperationException();
}
public long getJobFinishTime(JobID jobId) {
throw new UnsupportedOperationException();
}
public void initializeJob(JobID jobId) throws IOException {
throw new UnsupportedOperationException();
}
public MapTaskCompletionEventsUpdate getMapTaskCompletionEventsUpdates(
int index, JobID jobId, int max) throws IOException {
throw new UnsupportedOperationException();
}
public JobConf getJobTrackerConf() {
JobConf jobConf = null;
try {
jobConf = new JobConf(mrClientCluster.getConfig());
} catch (IOException e) {
LOG.error(e);
}
return jobConf;
}
public int getFaultCount(String hostName) {
throw new UnsupportedOperationException();
}
public void startJobTracker() {
// Do nothing
}
public void startJobTracker(boolean wait) {
// Do nothing
}
public void stopJobTracker() {
// Do nothing
}
public void stopTaskTracker(int id) {
// Do nothing
}
public void startTaskTracker(String host, String rack, int idx, int numDir)
throws IOException {
// Do nothing
}
void addTaskTracker(TaskTrackerRunner taskTracker) {
throw new UnsupportedOperationException();
}
int getTaskTrackerID(String trackerName) {
throw new UnsupportedOperationException();
}
public void shutdown() {
try {
mrClientCluster.stop();
} catch (IOException e) {
LOG.error(e);
}
}
}
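
The deprecated MiniMRCluster shim above exists so old MR1-style tests can keep calling the familiar constructor, createJobConf() and shutdown() while a MiniMRClientCluster does the real work. A sketch of that old-style usage follows; the namenode URI and the "testWorkDir" value are placeholders, and note that the shim's constructor reads "testWorkDir" from the supplied JobConf.

// Sketch: MR1-style usage of the deprecated shim; everything delegates to MiniMRClientCluster.
// The namenode URI and testWorkDir value are placeholders.
import org.apache.hadoop.mapred.JobConf;
import org.apache.solr.hadoop.hack.MiniMRCluster;

public class Mr1StyleSketch {
  public static void main(String[] unused) throws Exception {
    JobConf conf = new JobConf();
    conf.set("testWorkDir", "/tmp/mr1-shim");           // read by the shim's constructor above
    MiniMRCluster cluster = new MiniMRCluster(1, "hdfs://localhost:8020", 1, null, null, conf);
    try {
      JobConf jobConf = cluster.createJobConf();        // config pointing at the mini cluster
      // ... run MR1-style jobs against jobConf ...
    } finally {
      cluster.shutdown();                               // stops the wrapped MiniMRClientCluster
    }
  }
}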


@ -1,205 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.LocalContainerLauncher;
import org.apache.hadoop.mapred.ShuffleHandler;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
/**
* Configures and starts the MR-specific components in the YARN cluster.
*
*/
public class MiniMRYarnCluster extends MiniYARNCluster {
public static final String APPJAR = JarFinder.getJar(LocalContainerLauncher.class);
private static final Log LOG = LogFactory.getLog(MiniMRYarnCluster.class);
private JobHistoryServer historyServer;
private JobHistoryServerWrapper historyServerWrapper;
public MiniMRYarnCluster(String testName, File testWorkDir) {
this(testName, 1, testWorkDir);
}
public MiniMRYarnCluster(String testName, int noOfNMs, File testWorkDir) {
super(testName, noOfNMs, 4, 4, testWorkDir);
//TODO: add the history server
historyServerWrapper = new JobHistoryServerWrapper();
addService(historyServerWrapper);
}
@Override
public void serviceInit(Configuration conf) throws Exception {
conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME);
if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
conf.set(MRJobConfig.MR_AM_STAGING_DIR, new File(getTestWorkDir(),
"apps_staging_dir/").getAbsolutePath());
}
// By default, VMEM monitoring disabled, PMEM monitoring enabled.
if (!conf.getBoolean(
MRConfig.MAPREDUCE_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
MRConfig.DEFAULT_MAPREDUCE_MINICLUSTER_CONTROL_RESOURCE_MONITORING)) {
conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
}
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");
try {
Path stagingPath = FileContext.getFileContext(conf).makeQualified(
new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
/*
* Re-configure the staging path on Windows if the file system is localFs.
* We need to use an absolute path that contains the drive letter. The unit
* test could run on a different drive than the AM. We can run into the
* issue that job files are localized to the drive where the test runs on,
* while the AM starts on a different drive and fails to find the job
* metafiles. Using absolute path can avoid this ambiguity.
*/
if (Path.WINDOWS) {
if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
conf.set(MRJobConfig.MR_AM_STAGING_DIR,
new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR))
.getAbsolutePath());
}
}
FileContext fc=FileContext.getFileContext(stagingPath.toUri(), conf);
if (fc.util().exists(stagingPath)) {
LOG.info(stagingPath + " exists! deleting...");
fc.delete(stagingPath, true);
}
LOG.info("mkdir: " + stagingPath);
//mkdir the staging directory so that right permissions are set while running as proxy user
fc.mkdir(stagingPath, null, true);
//mkdir done directory as well
String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
Path doneDirPath = fc.makeQualified(new Path(doneDir));
fc.mkdir(doneDirPath, null, true);
} catch (IOException e) {
throw new YarnRuntimeException("Could not create staging directory. ", e);
}
conf.set(MRConfig.MASTER_ADDRESS, "test"); // the default is "local", in which case the shuffle doesn't happen
//configure the shuffle service in NM
conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
conf.setClass(String.format(Locale.ENGLISH, YarnConfiguration.NM_AUX_SERVICE_FMT,
ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID), ShuffleHandler.class,
Service.class);
// Non-standard shuffle port
conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
DefaultContainerExecutor.class, ContainerExecutor.class);
// TestMRJobs is for testing non-uberized operation only; see TestUberAM
// for corresponding uberized tests.
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
super.serviceInit(conf);
}
private class JobHistoryServerWrapper extends AbstractService {
public JobHistoryServerWrapper() {
super(JobHistoryServerWrapper.class.getName());
}
@Override
public synchronized void serviceStart() throws Exception {
try {
if (!getConfig().getBoolean(
JHAdminConfig.MR_HISTORY_MINICLUSTER_FIXED_PORTS,
JHAdminConfig.DEFAULT_MR_HISTORY_MINICLUSTER_FIXED_PORTS)) {
// pick free random ports.
getConfig().set(JHAdminConfig.MR_HISTORY_ADDRESS,
MiniYARNCluster.getHostname() + ":0");
getConfig().set(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS,
MiniYARNCluster.getHostname() + ":0");
}
historyServer = new JobHistoryServer();
historyServer.init(getConfig());
new Thread() {
public void run() {
historyServer.start();
};
}.start();
while (historyServer.getServiceState() == STATE.INITED) {
LOG.info("Waiting for HistoryServer to start...");
Thread.sleep(1500);
}
//TODO Add a timeout. State.STOPPED check ?
if (historyServer.getServiceState() != STATE.STARTED) {
throw new IOException("HistoryServer failed to start");
}
super.serviceStart();
} catch (Throwable t) {
throw new YarnRuntimeException(t);
}
//need to do this because historyServer.init creates a new Configuration
getConfig().set(JHAdminConfig.MR_HISTORY_ADDRESS,
historyServer.getConfig().get(JHAdminConfig.MR_HISTORY_ADDRESS));
getConfig().set(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS,
historyServer.getConfig().get(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS));
LOG.info("MiniMRYARN ResourceManager address: " +
getConfig().get(YarnConfiguration.RM_ADDRESS));
LOG.info("MiniMRYARN ResourceManager web address: " +
getConfig().get(YarnConfiguration.RM_WEBAPP_ADDRESS));
LOG.info("MiniMRYARN HistoryServer address: " +
getConfig().get(JHAdminConfig.MR_HISTORY_ADDRESS));
LOG.info("MiniMRYARN HistoryServer web address: " +
getConfig().get(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS));
}
@Override
public synchronized void serviceStop() throws Exception {
if (historyServer != null) {
historyServer.stop();
}
super.serviceStop();
}
}
public JobHistoryServer getHistoryServer() {
return this.historyServer;
}
}
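For orientation, here is a minimal sketch of how a test might drive the MiniMRYarnCluster shown above; the class name, work directory, and the main() wrapper are assumptions for illustration, not part of the removed code.
// Illustrative sketch only (assumed usage of the removed test harness).
public class MiniMRYarnClusterDemo {
  public static void main(String[] args) throws Exception {
    java.io.File testWorkDir = new java.io.File("target", "mini-mr-yarn-demo"); // assumed scratch dir
    MiniMRYarnCluster cluster = new MiniMRYarnCluster("MiniMRYarnClusterDemo", 1, testWorkDir);
    cluster.init(new org.apache.hadoop.conf.Configuration()); // serviceInit() prepares staging/done dirs and the shuffle service
    cluster.start();                                          // brings up RM, NM and the JobHistoryServer wrapper
    try {
      String rm = cluster.getConfig().get(org.apache.hadoop.yarn.conf.YarnConfiguration.RM_ADDRESS);
      System.out.println("MiniMRYARN ResourceManager at " + rm);
      // ... configure and submit a MapReduce Job against cluster.getConfig() here ...
    } finally {
      cluster.stop();                                         // stops the history server, node managers and RM
    }
  }
}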

View File

@ -1,78 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.File;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.service.Service.STATE;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
/**
* An adapter for MiniMRYarnCluster providing a MiniMRClientCluster interface.
* This interface could be used by tests across both MR1 and MR2.
*/
public class MiniMRYarnClusterAdapter implements MiniMRClientCluster {
private MiniMRYarnCluster miniMRYarnCluster;
private File testWorkDir;
private static final Log LOG = LogFactory.getLog(MiniMRYarnClusterAdapter.class);
public MiniMRYarnClusterAdapter(MiniMRYarnCluster miniMRYarnCluster, File testWorkDir) {
this.miniMRYarnCluster = miniMRYarnCluster;
this.testWorkDir = testWorkDir;
}
@Override
public Configuration getConfig() {
return miniMRYarnCluster.getConfig();
}
@Override
public void start() {
miniMRYarnCluster.start();
}
@Override
public void stop() {
miniMRYarnCluster.stop();
}
@Override
public void restart() {
if (!miniMRYarnCluster.getServiceState().equals(STATE.STARTED)){
LOG.warn("Cannot restart the mini cluster, start it first");
return;
}
Configuration oldConf = new Configuration(getConfig());
String callerName = oldConf.get("minimrclientcluster.caller.name",
this.getClass().getName());
int noOfNMs = oldConf.getInt("minimrclientcluster.nodemanagers.number", 1);
oldConf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS, true);
oldConf.setBoolean(JHAdminConfig.MR_HISTORY_MINICLUSTER_FIXED_PORTS, true);
stop();
miniMRYarnCluster = new MiniMRYarnCluster(callerName, noOfNMs, testWorkDir);
miniMRYarnCluster.init(oldConf);
miniMRYarnCluster.start();
}
}
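As a rough, assumed usage sketch (not taken from the removed tests), the adapter hides the YARN-based cluster behind the MR1/MR2-neutral MiniMRClientCluster interface; the names below are illustrative.
// Illustrative sketch only; testWorkDir is an assumed scratch directory.
java.io.File testWorkDir = new java.io.File("target", "mini-mr-client-demo");
MiniMRYarnCluster yarnCluster = new MiniMRYarnCluster("MiniMRClientDemo", 1, testWorkDir);
yarnCluster.init(new org.apache.hadoop.conf.Configuration());
MiniMRClientCluster client = new MiniMRYarnClusterAdapter(yarnCluster, testWorkDir);
client.start();
// ... run jobs against client.getConfig() ...
client.restart();   // stops the cluster and rebuilds it with fixed RM and JobHistoryServer ports
client.stop();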

View File

@ -1,409 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop.hack;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Locale;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService;
public class MiniYARNCluster extends CompositeService {
private static final Log LOG = LogFactory.getLog(MiniYARNCluster.class);
// temp fix until metrics system can auto-detect itself running in unit test:
static {
DefaultMetricsSystem.setMiniClusterMode(true);
}
private NodeManager[] nodeManagers;
private ResourceManager resourceManager;
private ResourceManagerWrapper resourceManagerWrapper;
private File testWorkDir;
// Number of nm-local-dirs per nodemanager
private int numLocalDirs;
// Number of nm-log-dirs per nodemanager
private int numLogDirs;
/**
* @param testName name of the test
* @param noOfNodeManagers the number of node managers in the cluster
* @param numLocalDirs the number of nm-local-dirs per nodemanager
* @param numLogDirs the number of nm-log-dirs per nodemanager
*/
public MiniYARNCluster(String testName, int noOfNodeManagers,
int numLocalDirs, int numLogDirs, File testWorkDir) {
super(testName.replace("$", ""));
this.numLocalDirs = numLocalDirs;
this.numLogDirs = numLogDirs;
String testSubDir = testName.replace("$", "");
File targetWorkDir = new File(testWorkDir, testSubDir);
try {
FileContext.getLocalFSFileContext().delete(
new Path(targetWorkDir.getAbsolutePath()), true);
} catch (Exception e) {
LOG.warn("COULD NOT CLEANUP", e);
throw new YarnRuntimeException("could not cleanup test dir: "+ e, e);
}
if (Shell.WINDOWS) {
// The test working directory can exceed the maximum path length supported
// by some Windows APIs and cmd.exe (260 characters). To work around this,
// create a symlink in temporary storage with a much shorter path,
// targeting the full path to the test working directory. Then, use the
// symlink as the test working directory.
String targetPath = targetWorkDir.getAbsolutePath();
File link = new File(System.getProperty("java.io.tmpdir"),
String.valueOf(System.nanoTime()));
String linkPath = link.getAbsolutePath();
try {
FileContext.getLocalFSFileContext().delete(new Path(linkPath), true);
} catch (IOException e) {
throw new YarnRuntimeException("could not cleanup symlink: " + linkPath, e);
}
// Guarantee target exists before creating symlink.
targetWorkDir.mkdirs();
ShellCommandExecutor shexec = new ShellCommandExecutor(
Shell.getSymlinkCommand(targetPath, linkPath));
try {
shexec.execute();
} catch (IOException e) {
throw new YarnRuntimeException(String.format(Locale.ENGLISH,
"failed to create symlink from %s to %s, shell output: %s", linkPath,
targetPath, shexec.getOutput()), e);
}
this.testWorkDir = link;
} else {
this.testWorkDir = targetWorkDir;
}
resourceManagerWrapper = new ResourceManagerWrapper();
addService(resourceManagerWrapper);
nodeManagers = new CustomNodeManager[noOfNodeManagers];
for(int index = 0; index < noOfNodeManagers; index++) {
addService(new NodeManagerWrapper(index));
nodeManagers[index] = new CustomNodeManager();
}
}
@Override
public void serviceInit(Configuration conf) throws Exception {
super.serviceInit(conf instanceof YarnConfiguration ? conf
: new YarnConfiguration(
conf));
}
public File getTestWorkDir() {
return testWorkDir;
}
public ResourceManager getResourceManager() {
return this.resourceManager;
}
public NodeManager getNodeManager(int i) {
return this.nodeManagers[i];
}
public static String getHostname() {
try {
return InetAddress.getLocalHost().getHostName();
}
catch (UnknownHostException ex) {
throw new RuntimeException(ex);
}
}
private class ResourceManagerWrapper extends AbstractService {
public ResourceManagerWrapper() {
super(ResourceManagerWrapper.class.getName());
}
@Override
public synchronized void serviceStart() throws Exception {
try {
getConfig().setBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, true);
if (!getConfig().getBoolean(
YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS,
YarnConfiguration.DEFAULT_YARN_MINICLUSTER_FIXED_PORTS)) {
// pick free random ports.
String hostname = MiniYARNCluster.getHostname();
getConfig().set(YarnConfiguration.RM_ADDRESS,
hostname + ":0");
getConfig().set(YarnConfiguration.RM_ADMIN_ADDRESS,
hostname + ":0");
getConfig().set(YarnConfiguration.RM_SCHEDULER_ADDRESS,
hostname + ":0");
getConfig().set(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
hostname + ":0");
getConfig().set(YarnConfiguration.RM_WEBAPP_ADDRESS,
hostname + ":0");
}
resourceManager = new ResourceManager() {
@Override
protected void doSecureLogin() throws IOException {
// Don't try to login using keytab in the testcase.
};
};
resourceManager.init(getConfig());
new Thread() {
public void run() {
resourceManager.start();
};
}.start();
int waitCount = 0;
while (resourceManager.getServiceState() == STATE.INITED
&& waitCount++ < 60) {
LOG.info("Waiting for RM to start...");
Thread.sleep(1500);
}
if (resourceManager.getServiceState() != STATE.STARTED) {
// RM could have failed.
throw new IOException(
"ResourceManager failed to start. Final state is "
+ resourceManager.getServiceState());
}
super.serviceStart();
} catch (Throwable t) {
throw new YarnRuntimeException(t);
}
LOG.info("MiniYARN ResourceManager address: " +
getConfig().get(YarnConfiguration.RM_ADDRESS));
LOG.info("MiniYARN ResourceManager web address: " +
getConfig().get(YarnConfiguration.RM_WEBAPP_ADDRESS));
}
@Override
public synchronized void serviceStop() throws Exception {
if (resourceManager != null) {
resourceManager.stop();
}
super.serviceStop();
if (Shell.WINDOWS) {
// On Windows, clean up the short temporary symlink that was created to
// work around path length limitation.
String testWorkDirPath = testWorkDir.getAbsolutePath();
try {
FileContext.getLocalFSFileContext().delete(new Path(testWorkDirPath),
true);
} catch (IOException e) {
LOG.warn("could not cleanup symlink: " +
testWorkDir.getAbsolutePath());
}
}
}
}
private class NodeManagerWrapper extends AbstractService {
int index = 0;
public NodeManagerWrapper(int i) {
super(NodeManagerWrapper.class.getName() + "_" + i);
index = i;
}
public synchronized void serviceInit(Configuration conf) throws Exception {
Configuration config = new YarnConfiguration(conf);
super.serviceInit(config);
}
/**
* Create local/log directories
* @param dirType type of directories i.e. local dirs or log dirs
* @param numDirs number of directories
* @return the created directories as a comma delimited String
*/
private String prepareDirs(String dirType, int numDirs) {
File []dirs = new File[numDirs];
String dirsString = "";
for (int i = 0; i < numDirs; i++) {
dirs[i]= new File(testWorkDir, MiniYARNCluster.this.getName()
+ "-" + dirType + "Dir-nm-" + index + "_" + i);
dirs[i].mkdirs();
LOG.info("Created " + dirType + "Dir in " + dirs[i].getAbsolutePath());
String delimiter = (i > 0) ? "," : "";
dirsString = dirsString.concat(delimiter + dirs[i].getAbsolutePath());
}
return dirsString;
}
public synchronized void serviceStart() throws Exception {
try {
// create nm-local-dirs and configure them for the nodemanager
String localDirsString = prepareDirs("local", numLocalDirs);
getConfig().set(YarnConfiguration.NM_LOCAL_DIRS, localDirsString);
// create nm-log-dirs and configure them for the nodemanager
String logDirsString = prepareDirs("log", numLogDirs);
getConfig().set(YarnConfiguration.NM_LOG_DIRS, logDirsString);
File remoteLogDir =
new File(testWorkDir, MiniYARNCluster.this.getName()
+ "-remoteLogDir-nm-" + index);
remoteLogDir.mkdir();
getConfig().set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
remoteLogDir.getAbsolutePath());
// By default AM + 2 containers
getConfig().setInt(YarnConfiguration.NM_PMEM_MB, 4*1024);
getConfig().set(YarnConfiguration.NM_ADDRESS,
MiniYARNCluster.getHostname() + ":0");
getConfig().set(YarnConfiguration.NM_LOCALIZER_ADDRESS,
MiniYARNCluster.getHostname() + ":0");
getConfig().set(YarnConfiguration.NM_WEBAPP_ADDRESS,
MiniYARNCluster.getHostname() + ":0");
// Disable resource checks by default
if (!getConfig().getBoolean(
YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
YarnConfiguration.
DEFAULT_YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING)) {
getConfig().setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
getConfig().setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
}
LOG.info("Starting NM: " + index);
nodeManagers[index].init(getConfig());
new Thread() {
public void run() {
nodeManagers[index].start();
};
}.start();
int waitCount = 0;
while (nodeManagers[index].getServiceState() == STATE.INITED
&& waitCount++ < 60) {
LOG.info("Waiting for NM " + index + " to start...");
Thread.sleep(1000);
}
if (nodeManagers[index].getServiceState() != STATE.STARTED) {
// RM could have failed.
throw new IOException("NodeManager " + index + " failed to start");
}
super.serviceStart();
} catch (Throwable t) {
throw new YarnRuntimeException(t);
}
}
@Override
public synchronized void serviceStop() throws Exception {
if (nodeManagers[index] != null) {
nodeManagers[index].stop();
}
super.serviceStop();
}
}
private class CustomNodeManager extends NodeManager {
@Override
protected void doSecureLogin() throws IOException {
// Don't try to login using keytab in the testcase.
};
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
return new NodeStatusUpdaterImpl(context, dispatcher,
healthChecker, metrics) {
@Override
protected ResourceTracker getRMClient() {
final ResourceTrackerService rt = resourceManager
.getResourceTrackerService();
final RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null);
// For in-process communication without RPC
return new ResourceTracker() {
@Override
public NodeHeartbeatResponse nodeHeartbeat(
NodeHeartbeatRequest request) throws YarnException,
IOException {
NodeHeartbeatResponse response = recordFactory.newRecordInstance(
NodeHeartbeatResponse.class);
try {
response = rt.nodeHeartbeat(request);
} catch (YarnException e) {
LOG.info("Exception in heartbeat from node " +
request.getNodeStatus().getNodeId(), e);
throw e;
}
return response;
}
@Override
public RegisterNodeManagerResponse registerNodeManager(
RegisterNodeManagerRequest request)
throws YarnException, IOException {
RegisterNodeManagerResponse response = recordFactory.
newRecordInstance(RegisterNodeManagerResponse.class);
try {
response = rt.registerNodeManager(request);
} catch (YarnException e) {
LOG.info("Exception in node registration from "
+ request.getNodeId().toString(), e);
throw e;
}
return response;
}
};
};
@Override
protected void stopRMProxy() {
return;
}
};
};
}
}
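A hedged sketch of driving the bare MiniYARNCluster directly; the test name, directory, and sizes are illustrative assumptions.
// Illustrative sketch only: 2 NodeManagers with 4 nm-local-dirs and 4 nm-log-dirs each.
MiniYARNCluster yarn = new MiniYARNCluster("MiniYarnDemo", 2, 4, 4, new java.io.File("target", "yarn-work"));
yarn.init(new org.apache.hadoop.conf.Configuration());  // serviceInit() wraps this into a YarnConfiguration
yarn.start();                                           // ResourceManagerWrapper and NodeManagerWrappers start asynchronously
String rmAddress = yarn.getConfig().get(org.apache.hadoop.yarn.conf.YarnConfiguration.RM_ADDRESS);
yarn.stop();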

View File

@ -1,6 +0,0 @@
Apache Solr Morphlines-Cell
*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.
This contrib provides a variety of Kite Morphlines features for Solr Cell-type functionality.

View File

@ -1,144 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="solr-morphlines-cell" default="default">
<description>
Solr Cell Morphline commands.
</description>
<import file="../contrib-build.xml"/>
<solr-contrib-uptodate name="extraction"
property="solr-extraction.uptodate"
classpath.property="solr-cell.jar"/>
<target name="compile-solr-extraction" unless="solr-extraction.uptodate">
<ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<solr-contrib-uptodate name="morphlines-core"
property="solr-morphlines-core.uptodate"/>
<target name="compile-morphlines-core" unless="solr-morphlines-core.uptodate">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="compile-test" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<solr-contrib-uptodate name="map-reduce"
property="solr-map-reduce.uptodate"
classpath.property="MapReduceIndexerTool.jar"/>
<target name="compile-map-reduce" unless="solr-map-reduce.uptodate">
<ant dir="${common-solr.dir}/contrib/map-reduce" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-extraction-libs">
<ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-morphlines-core-libs">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-map-reduce-libs">
<ant dir="${common-solr.dir}/contrib/map-reduce" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<path id="classpath.additions">
<pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-core/lib" excludes="${common.classpath.excludes}"/>
<!-- <pathelement location="${common-solr.dir}/build/contrib/solr-map-reduce/classes/java"/> -->
<!-- <fileset dir="${common-solr.dir}/contrib/map-reduce/lib" excludes="${common.classpath.excludes}"/> -->
</path>
<path id="classpath">
<path refid="solr.base.classpath"/>
<path refid="classpath.additions"/>
</path>
<path id="test.classpath">
<path refid="solr.test.base.classpath"/>
<path refid="classpath.additions"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
<pathelement location="${common-solr.dir}/contrib/morphlines-core/src/test-files"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
</path>
<path id="javadoc.classpath">
<path refid="junit-path"/>
<path refid="classpath"/>
<pathelement location="${ant.home}/lib/ant.jar"/>
<fileset dir=".">
<exclude name="build/**/*.jar"/>
<include name="**/lib/*.jar"/>
</fileset>
</path>
<!-- TODO: make this nicer like lucene? -->
<target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,javadocs-morphlines-core,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
<sequential>
<mkdir dir="${javadoc.dir}/${name}"/>
<solr-invoke-javadoc>
<solrsources>
<packageset dir="${src.dir}"/>
</solrsources>
<links>
<link href="../solr-solrj"/>
<link href="../solr-core"/>
<link href="../solr-cell"/>
<link href="../solr-morphlines-core"/>
</links>
</solr-invoke-javadoc>
<solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
</sequential>
</target>
<target name="javadocs-extraction">
<ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="javadocs-morphlines-core">
<ant dir="${common-solr.dir}/contrib/morphlines-core" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="compile-core" depends="resolve-extraction-libs, resolve-morphlines-core-libs, resolve-map-reduce-libs, compile-solr-extraction, compile-morphlines-core, solr-contrib-build.compile-core"/>
<target name="dist" depends="common-solr.dist"/>
</project>

View File

@ -1,35 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<ivy-module version="2.0">
<info organisation="org.apache.solr" module="morphlines-cell" />
<configurations defaultconfmapping="compile->master;test->master">
<conf name="compile" transitive="false" />
<conf name="test" transitive="false" />
</configurations>
<dependencies>
<dependency org="org.kitesdk" name="kite-morphlines-tika-core" rev="${/org.kitesdk/kite-morphlines-tika-core}" conf="compile" />
<dependency org="org.kitesdk" name="kite-morphlines-tika-decompress" rev="${/org.kitesdk/kite-morphlines-tika-decompress}" conf="compile" />
<dependency org="org.kitesdk" name="kite-morphlines-json" rev="${/org.kitesdk/kite-morphlines-json}" conf="compile" />
<dependency org="org.kitesdk" name="kite-morphlines-twitter" rev="${/org.kitesdk/kite-morphlines-twitter}" conf="compile" />
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
</dependencies>
</ivy-module>

View File

@ -1,348 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.cell;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.IllformedLocaleException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.stream.Collectors;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.stdio.AbstractParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Command that pipes the first attachment of a record into one of the given Tika parsers, then maps
* the Tika output back to a record using SolrCell.
* <p>
* The Tika parser is chosen from the configurable list of parsers, depending on the MIME type
* specified in the input record. Typically, this requires an upstream DetectMimeTypeBuilder
* in a prior command.
*/
public final class SolrCellBuilder implements CommandBuilder {
@Override
public Collection<String> getNames() {
return Collections.singletonList("solrCell");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new SolrCell(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class SolrCell extends AbstractParser {
private final IndexSchema schema;
private final List<String> dateFormats;
private final String xpathExpr;
private final List<Parser> parsers = new ArrayList<>();
private final SolrContentHandlerFactory solrContentHandlerFactory;
private final Locale locale;
private final SolrParams solrParams;
private final Map<MediaType, Parser> mediaTypeToParserMap;
private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
public static final String ADDITIONAL_SUPPORTED_MIME_TYPES = "additionalSupportedMimeTypes";
public SolrCell(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
LOG.debug("solrLocator: {}", locator);
this.schema = Objects.requireNonNull(locator.getIndexSchema());
if (LOG.isTraceEnabled()) {
LOG.trace("Solr schema: \n" + schema.getFields().entrySet().stream()
.sorted(Map.Entry.comparingByKey()).map(Map.Entry::getValue).map(Object::toString)
.collect(Collectors.joining("\n")));
}
ListMultimap<String, String> cellParams = ArrayListMultimap.create();
String uprefix = getConfigs().getString(config, ExtractingParams.UNKNOWN_FIELD_PREFIX, null);
if (uprefix != null) {
cellParams.put(ExtractingParams.UNKNOWN_FIELD_PREFIX, uprefix);
}
for (String capture : getConfigs().getStringList(config, ExtractingParams.CAPTURE_ELEMENTS, Collections.<String>emptyList())) {
cellParams.put(ExtractingParams.CAPTURE_ELEMENTS, capture);
}
Config fmapConfig = getConfigs().getConfig(config, "fmap", null);
if (fmapConfig != null) {
for (Map.Entry<String, Object> entry : new Configs().getEntrySet(fmapConfig)) {
cellParams.put(ExtractingParams.MAP_PREFIX + entry.getKey(), entry.getValue().toString());
}
}
String captureAttributes = getConfigs().getString(config, ExtractingParams.CAPTURE_ATTRIBUTES, null);
if (captureAttributes != null) {
cellParams.put(ExtractingParams.CAPTURE_ATTRIBUTES, captureAttributes);
}
String lowerNames = getConfigs().getString(config, ExtractingParams.LOWERNAMES, null);
if (lowerNames != null) {
cellParams.put(ExtractingParams.LOWERNAMES, lowerNames);
}
String defaultField = getConfigs().getString(config, ExtractingParams.DEFAULT_FIELD, null);
if (defaultField != null) {
cellParams.put(ExtractingParams.DEFAULT_FIELD, defaultField);
}
xpathExpr = getConfigs().getString(config, ExtractingParams.XPATH_EXPRESSION, null);
if (xpathExpr != null) {
cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
}
this.dateFormats = getConfigs().getStringList(config, "dateFormats", new ArrayList<>(ExtractionDateUtil.DEFAULT_DATE_FORMATS));
String handlerStr = getConfigs().getString(config, "solrContentHandlerFactory", TrimSolrContentHandlerFactory.class.getName());
Class<? extends SolrContentHandlerFactory> factoryClass;
try {
factoryClass = Class.forName(handlerStr).asSubclass(SolrContentHandlerFactory.class);
} catch (ClassNotFoundException cnfe) {
throw new MorphlineCompilationException("Could not find class "
+ handlerStr + " to use for " + "solrContentHandlerFactory", config, cnfe);
}
this.solrContentHandlerFactory = getSolrContentHandlerFactory(factoryClass, dateFormats, config);
this.locale = getLocale(getConfigs().getString(config, "locale", null));
this.mediaTypeToParserMap = new HashMap<>();
//MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); // FIXME getMediaTypeRegistry.normalize()
List<? extends Config> parserConfigs = getConfigs().getConfigList(config, "parsers");
for (Config parserConfig : parserConfigs) {
String parserClassName = getConfigs().getString(parserConfig, "parser");
Object obj;
try {
obj = Class.forName(parserClassName).newInstance();
} catch (Throwable e) {
throw new MorphlineCompilationException("Cannot instantiate Tika parser: " + parserClassName, config, e);
}
if (!(obj instanceof Parser)) {
throw new MorphlineCompilationException("Tika parser " + obj.getClass().getName()
+ " must be an instance of class " + Parser.class.getName(), config);
}
Parser parser = (Parser) obj;
this.parsers.add(parser);
List<String> mediaTypes = getConfigs().getStringList(parserConfig, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
for (String mediaTypeStr : mediaTypes) {
MediaType mediaType = parseMediaType(mediaTypeStr);
addSupportedMimeType(mediaTypeStr);
this.mediaTypeToParserMap.put(mediaType, parser);
}
if (!parserConfig.hasPath(SUPPORTED_MIME_TYPES)) {
for (MediaType mediaType : parser.getSupportedTypes(new ParseContext())) {
mediaType = mediaType.getBaseType();
addSupportedMimeType(mediaType.toString());
this.mediaTypeToParserMap.put(mediaType, parser);
}
List<String> extras = getConfigs().getStringList(parserConfig, ADDITIONAL_SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
for (String mediaTypeStr : extras) {
MediaType mediaType = parseMediaType(mediaTypeStr);
addSupportedMimeType(mediaTypeStr);
this.mediaTypeToParserMap.put(mediaType, parser);
}
}
}
//LOG.info("mediaTypeToParserMap="+mediaTypeToParserMap);
Map<String, String[]> tmp = new HashMap<>();
for (Map.Entry<String,Collection<String>> entry : cellParams.asMap().entrySet()) {
tmp.put(entry.getKey(), entry.getValue().toArray(new String[entry.getValue().size()]));
}
this.solrParams = new MultiMapSolrParams(tmp);
validateArguments();
}
@Override
protected boolean doProcess(Record record, InputStream inputStream) {
Parser parser = detectParser(record);
if (parser == null) {
return false;
}
ParseContext parseContext = new ParseContext();
parseContext.set(Locale.class, locale);
Metadata metadata = new Metadata();
for (Entry<String, Object> entry : record.getFields().entries()) {
metadata.add(entry.getKey(), entry.getValue().toString());
}
SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
try {
inputStream = TikaInputStream.get(inputStream);
ContentHandler parsingHandler = handler;
// String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
if (xpathExpr != null) {
Matcher matcher = PARSER.parse(xpathExpr);
parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
}
try {
parser.parse(inputStream, parsingHandler, metadata, parseContext);
} catch (IOException | TikaException | SAXException e) {
throw new MorphlineRuntimeException("Cannot parse", e);
}
} finally {
if (inputStream != null) {
Closeables.closeQuietly(inputStream);
}
}
SolrInputDocument doc = handler.newDocument();
LOG.debug("solr doc: {}", doc);
Record outputRecord = toRecord(doc);
return getChild().process(outputRecord);
}
private Parser detectParser(Record record) {
if (!hasAtLeastOneMimeType(record)) {
return null;
}
String mediaTypeStr = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); //ExtractingParams.STREAM_TYPE);
assert mediaTypeStr != null;
MediaType mediaType = parseMediaType(mediaTypeStr).getBaseType();
Parser parser = mediaTypeToParserMap.get(mediaType); // fast path
if (parser != null) {
return parser;
}
// wildcard matching
for (Map.Entry<MediaType, Parser> entry : mediaTypeToParserMap.entrySet()) {
if (isMediaTypeMatch(mediaType, entry.getKey())) {
return entry.getValue();
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mediaTypeStr);
}
return null;
}
private boolean hasAtLeastOneMimeType(Record record) {
if (!record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
LOG.debug("Command failed because of missing MIME type for record: {}", record);
return false;
}
return true;
}
private MediaType parseMediaType(String mediaTypeStr) {
MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
return mediaType.getBaseType();
};
/** Returns true if mediaType falls within the given range (pattern), false otherwise */
private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
String WILDCARD = "*";
String rangePatternType = rangePattern.getType();
String rangePatternSubtype = rangePattern.getSubtype();
return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
&& (rangePatternSubtype.equals(WILDCARD) || rangePatternSubtype.equals(mediaType.getSubtype()));
}
private static SolrContentHandlerFactory getSolrContentHandlerFactory(
Class<? extends SolrContentHandlerFactory> factoryClass, Collection<String> dateFormats, Config config) {
try {
return factoryClass.getConstructor(Collection.class).newInstance(dateFormats);
} catch (NoSuchMethodException nsme) {
throw new MorphlineCompilationException("Unable to find valid constructor of type "
+ factoryClass.getName() + " for creating SolrContentHandler", config, nsme);
} catch (Exception e) {
throw new MorphlineCompilationException("Unexpected exception when trying to create SolrContentHandlerFactory of type "
+ factoryClass.getName(), config, e);
}
}
private Record toRecord(SolrInputDocument doc) {
Record record = new Record();
for (Entry<String, SolrInputField> entry : doc.entrySet()) {
record.getFields().putAll(entry.getKey(), entry.getValue().getValues());
}
return record;
}
@SuppressForbidden(reason = "Usage of outdated locale parsing with Locale#toString() because of backwards compatibility")
private Locale getLocale(String name) {
if (name == null) {
return Locale.ROOT;
}
for (Locale locale : Locale.getAvailableLocales()) {
if (locale.toString().equals(name)) {
return locale;
}
}
try {
return new Locale.Builder().setLanguageTag(name).build();
} catch (IllformedLocaleException ex) {
throw new MorphlineCompilationException("Malformed / non-existent locale: " + name, getConfig(), ex);
}
}
}
}
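To make the parser selection above concrete, the following self-contained sketch mirrors the base-type plus wildcard matching performed by detectParser() and isMediaTypeMatch(); it is illustrative only and operates on plain strings rather than Tika MediaType objects.
// Illustrative sketch only: the same wildcard rule as isMediaTypeMatch(), on plain type/subtype strings.
final class MediaTypeRangeDemo {
  static boolean matches(String type, String subtype, String rangeType, String rangeSubtype) {
    return ("*".equals(rangeType) || rangeType.equals(type))
        && ("*".equals(rangeSubtype) || rangeSubtype.equals(subtype));
  }
  public static void main(String[] args) {
    System.out.println(matches("application", "xhtml+xml", "application", "*")); // true: subtype wildcard
    System.out.println(matches("image", "jpeg", "application", "*"));            // false: type differs
    System.out.println(matches("image", "jpeg", "*", "*"));                      // true: full wildcard
  }
}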

View File

@ -1,81 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.cell;
import java.util.Collection;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.tika.metadata.Metadata;
/**
* {@link SolrContentHandler} and associated factory that strips non-characters and trims on output.
* This prevents exceptions when parsing integer fields inside the Solr server.
*/
public class StripNonCharSolrContentHandlerFactory extends SolrContentHandlerFactory {
public StripNonCharSolrContentHandlerFactory(Collection<String> dateFormats) {
super(dateFormats);
}
@Override
public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
return new StripNonCharSolrContentHandler(metadata, params, schema, dateFormats);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class StripNonCharSolrContentHandler extends SolrContentHandler {
public StripNonCharSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) {
super(metadata, params, schema, dateFormats);
}
/**
* Strip all non-characters, which can cause SolrReducer problems if present.
* This is borrowed from Apache Nutch.
*/
private static String stripNonCharCodepoints(String input) {
StringBuilder stripped = new StringBuilder(input.length());
char ch;
for (int i = 0; i < input.length(); i++) {
ch = input.charAt(i);
// Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
// and non-printable control characters except tabulator, new line and carriage return
if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
(ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
(ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
stripped.append(ch);
}
}
return stripped.toString();
}
@Override
protected String transformValue(String val, SchemaField schemaField) {
String ret = super.transformValue(val, schemaField).trim();
ret = stripNonCharCodepoints(ret);
return ret;
}
}
}
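For reference, a standalone sketch of the same non-character filtering applied by stripNonCharCodepoints(); the demo class and sample string are assumptions for illustration.
// Illustrative sketch only; U+FFFF is a Unicode non-character, as used by the contrib's own test.
public class StripNonCharDemo {
  public static void main(String[] args) {
    String input = "\uFFFF\uFFFFfoo\uFFFF\uFFFFbar\uFFFF\uFFFF";
    StringBuilder stripped = new StringBuilder(input.length());
    for (int i = 0; i < input.length(); i++) {
      char ch = input.charAt(i);
      if (ch % 0x10000 != 0xffff && ch % 0x10000 != 0xfffe
          && (ch <= 0xfdd0 || ch >= 0xfdef)
          && (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
        stripped.append(ch);
      }
    }
    System.out.println(stripped); // prints "foobar"
  }
}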

View File

@ -1,58 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.cell;
import java.util.Collection;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.tika.metadata.Metadata;
/**
* {@link SolrContentHandler} and associated factory that trims field values on output.
* This prevents exceptions when parsing integer fields inside the Solr server.
*/
public class TrimSolrContentHandlerFactory extends SolrContentHandlerFactory {
public TrimSolrContentHandlerFactory(Collection<String> dateFormats) {
super(dateFormats);
}
@Override
public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
return new TrimSolrContentHandler(metadata, params, schema, dateFormats);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class TrimSolrContentHandler extends SolrContentHandler {
public TrimSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) {
super(metadata, params, schema, dateFormats);
}
@Override
protected String transformValue(String val, SchemaField schemaField) {
return super.transformValue(val, schemaField).trim();
}
}
}

View File

@ -1,25 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Morphlines Solr Cell related code.
*/
package org.apache.solr.morphlines.cell;

View File

@ -1,21 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Apache Solr Search Server: Solr Cell Morphline Commands
</body>
</html>

View File

@ -1 +0,0 @@
The test files used by this module are located in the morphlines-core module.

View File

@ -1,292 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.cell;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
private Map<String,Integer> expectedRecords = new HashMap<>();
private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<>();
@BeforeClass
public static void beforeClass2() {
assumeFalse("FIXME: Morphlines currently has issues with Windows paths", Constants.WINDOWS);
}
@Before
public void setUp() throws Exception {
super.setUp();
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
expectedRecords.put(path + "sample-statuses-20120906-141433.avro", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433.gz", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433.bz2", 2);
expectedRecords.put(path + "cars.csv", 6);
expectedRecords.put(path + "cars.csv.gz", 6);
expectedRecords.put(path + "cars.tar.gz", 4);
expectedRecords.put(path + "cars.tsv", 6);
expectedRecords.put(path + "cars.ssv", 6);
expectedRecords.put(path + "test-documents.7z", 9);
expectedRecords.put(path + "test-documents.cpio", 9);
expectedRecords.put(path + "test-documents.tar", 9);
expectedRecords.put(path + "test-documents.tbz2", 9);
expectedRecords.put(path + "test-documents.tgz", 9);
expectedRecords.put(path + "test-documents.zip", 9);
expectedRecords.put(path + "multiline-stacktrace.log", 4);
{
Map<String, Object> record = new LinkedHashMap<>();
record.put("ignored__attachment_mimetype", "image/jpeg");
record.put("ignored_exif_isospeedratings", "400");
record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
record.put("ignored_tiff_model", "Canon EOS 40D");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
}
{
String file = path + "testWORD_various.doc";
Map<String, Object> record = new LinkedHashMap<>();
record.put("ignored__attachment_mimetype", "application/msword");
record.put("ignored_author", "Michael McCandless");
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
record.put("ignored_title", "");
record.put("ignored_keywords", "Keyword1 Keyword2");
record.put("ignored_subject", "Subject is here");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "testPDF.pdf";
Map<String, Object> record = new LinkedHashMap<>();
record.put("ignored__attachment_mimetype", "application/pdf");
record.put("ignored_author", "Bertrand Delacrétaz");
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
record.put("ignored_title", "Apache Tika - Apache Tika");
record.put("ignored_xmp_creatortool", "Firefox");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "email.eml";
Map<String, Object> record = new LinkedHashMap<>();
String name = "Patrick Foo <foo@cloudera.com>";
record.put("ignored__attachment_mimetype", "message/rfc822");
record.put("ignored_author", name);
//record.put("ignored_content_length", "1068");
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_message_from", name);
record.put("ignored_message_to", name);
record.put("ignored_creator", name);
record.put("ignored_dc_creator", name);
record.put("ignored_dc_title", "Test EML");
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
record.put("ignored_meta_author", name);
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_subject", "Test EML");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "testEXCEL.xlsx";
Map<String, Object> record = new LinkedHashMap<>();
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
record.put("ignored_author", "Keith Bennett");
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
record.put("ignored_title", "Simple Excel document");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
public void testSolrCellJPGCompressed() throws Exception {
morphline = createMorphline("test-morphlines" + File.separator + "solrCellJPGCompressed");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testJPEG_EXIF.jpg",
path + "testJPEG_EXIF.jpg.gz",
path + "testJPEG_EXIF.jpg.tar.gz",
//path + "jpeg2000.jp2",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
public void testSolrCellXML() throws Exception {
morphline = createMorphline("test-morphlines" + File.separator + "solrCellXML");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testXML2.xml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
public void testSolrCellDocumentTypes() throws Exception {
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testBMPfp.txt",
path + "boilerplate.html",
path + "NullHeader.docx",
path + "testWORD_various.doc",
path + "testPDF.pdf",
path + "testJPEG_EXIF.jpg",
path + "testJPEG_EXIF.jpg.gz",
path + "testJPEG_EXIF.jpg.tar.gz",
path + "testXML.xml",
path + "cars.csv",
// path + "cars.tsv",
// path + "cars.ssv",
path + "cars.csv.gz",
path + "cars.tar.gz",
path + "sample-statuses-20120906-141433.avro",
path + "sample-statuses-20120906-141433",
path + "sample-statuses-20120906-141433.gz",
path + "sample-statuses-20120906-141433.bz2",
path + "email.eml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
public void testSolrCellDocumentTypes2() throws Exception {
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testPPT_various.ppt",
path + "testPPT_various.pptx",
path + "testEXCEL.xlsx",
path + "testEXCEL.xls",
path + "testPages.pages",
//path + "testNumbers.numbers",
//path + "testKeynote.key",
path + "testRTFVarious.rtf",
path + "complex.mbox",
path + "test-outlook.msg",
path + "testEMLX.emlx",
path + "testRFC822",
path + "rsstest.rss",
// path + "testDITA.dita",
path + "testMP3i18n.mp3",
path + "testAIFF.aif",
path + "testFLAC.flac",
// path + "testFLAC.oga",
// path + "testVORBIS.ogg",
path + "testMP4.m4a",
path + "testWAV.wav",
// path + "testWMA.wma",
path + "testFLV.flv",
// path + "testWMV.wmv",
path + "testBMP.bmp",
path + "testPNG.png",
path + "testPSD.psd",
path + "testSVG.svg",
path + "testTIFF.tif",
// path + "test-documents.7z",
// path + "test-documents.cpio",
// path + "test-documents.tar",
// path + "test-documents.tbz2",
// path + "test-documents.tgz",
// path + "test-documents.zip",
// path + "test-zip-of-zip.zip",
// path + "testJAR.jar",
// path + "testKML.kml",
// path + "testRDF.rdf",
path + "testVISIO.vsd",
// path + "testWAR.war",
// path + "testWindows-x86-32.exe",
// path + "testWINMAIL.dat",
// path + "testWMF.wmf",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
/**
* Test that the ContentHandler properly strips the illegal characters
*/
@Test
public void testTransformValue() {
String fieldName = "user_name";
assertFalse("foobar".equals(getFoobarWithNonChars()));
Metadata metadata = new Metadata();
// load illegal char string into a metadata field and generate a new document,
// which will cause the ContentHandler to be invoked.
metadata.set(fieldName, getFoobarWithNonChars());
StripNonCharSolrContentHandlerFactory contentHandlerFactory =
new StripNonCharSolrContentHandlerFactory(ExtractionDateUtil.DEFAULT_DATE_FORMATS);
IndexSchema schema = h.getCore().getLatestSchema();
SolrContentHandler contentHandler =
contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
SolrInputDocument doc = contentHandler.newDocument();
String foobar = doc.getFieldValue(fieldName).toString();
assertTrue("foobar".equals(foobar));
}
/**
* Returns string "foobar" with illegal characters interspersed.
*/
private String getFoobarWithNonChars() {
char illegalChar = '\uffff';
StringBuilder builder = new StringBuilder();
builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
.append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
return builder.toString();
}
}

View File

@ -1,6 +0,0 @@
Apache Solr Morphlines-Core
*Experimental* - This contrib is currently subject to change in ways that may
break back compatibility.
This contrib provides a variety of Kite Morphlines features for Solr.
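
For orientation, the sketch below shows how a morphline that uses these Solr commands is typically compiled and fed records from Java. It is only a sketch: the config file name, morphline id, and sample input are hypothetical, and the Compiler/Record/Notifications calls reflect the upstream Kite SDK API rather than anything defined in this contrib.

import java.io.ByteArrayInputStream;
import java.io.File;
import java.nio.charset.StandardCharsets;

import com.codahale.metrics.MetricRegistry;
import org.apache.solr.morphlines.solr.SolrMorphlineContext;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Compiler;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications;

public class MorphlineDriverSketch {
  public static void main(String[] args) throws Exception {
    // Solr-aware context; a DocumentLoader and IndexSchema could also be injected via the Builder.
    MorphlineContext context = new SolrMorphlineContext.Builder()
        .setMetricRegistry(new MetricRegistry())
        .build();

    // Trivial final command that accepts whatever reaches the end of the pipeline.
    Command collector = new Command() {
      @Override public Command getParent() { return null; }
      @Override public void notify(Record notification) {}
      @Override public boolean process(Record record) { return true; }
    };

    // Compile a morphline definition; "morphlines.conf" and "morphline1" are hypothetical names.
    Command morphline = new Compiler().compile(
        new File("morphlines.conf"), "morphline1", context, collector);

    Notifications.notifyBeginTransaction(morphline);
    Record record = new Record();
    record.put(Fields.ATTACHMENT_BODY,
        new ByteArrayInputStream("hello morphlines".getBytes(StandardCharsets.UTF_8)));
    morphline.process(record);
    Notifications.notifyCommitTransaction(morphline);
    Notifications.notifyShutdown(morphline);
  }
}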

View File

@ -1,105 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="solr-morphlines-core" default="default" xmlns:ivy="antlib:org.apache.ivy.ant">
<description>
Solr Morphlines commands.
</description>
<import file="../contrib-build.xml"/>
<solr-contrib-uptodate name="extraction"
property="solr-extraction.uptodate"
classpath.property="solr-cell.jar"/>
<target name="compile-solr-extraction" unless="solr-extraction.uptodate">
<ant dir="${common-solr.dir}/contrib/extraction" target="compile-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve-extraction-libs">
<ant dir="${common-solr.dir}/contrib/extraction" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<path id="classpath.additions">
<pathelement location="${common-solr.dir}/build/contrib/solr-cell/classes/java"/>
<fileset dir="${common-solr.dir}/contrib/extraction/lib" excludes="${common.classpath.excludes}"/>
</path>
<path id="classpath">
<path refid="solr.base.classpath"/>
<path refid="classpath.additions"/>
</path>
<path id="test.classpath">
<path refid="solr.test.base.classpath"/>
<path refid="classpath.additions"/>
<fileset dir="${test.lib.dir}" includes="*.jar"/>
</path>
<path id="javadoc.classpath">
<path refid="junit-path"/>
<path refid="classpath"/>
<pathelement location="${ant.home}/lib/ant.jar"/>
<fileset dir=".">
<exclude name="build/**/*.jar"/>
<include name="**/lib/*.jar"/>
</fileset>
</path>
<!-- TODO: make this nicer like lucene? -->
<target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,javadocs-extraction,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
<sequential>
<mkdir dir="${javadoc.dir}/${name}"/>
<solr-invoke-javadoc>
<solrsources>
<packageset dir="${src.dir}"/>
</solrsources>
<links>
<link href="../solr-solrj"/>
<link href="../solr-core"/>
<link href="../solr-cell"/>
</links>
</solr-invoke-javadoc>
<solr-jarify basedir="${javadoc.dir}/${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
</sequential>
</target>
<target name="javadocs-extraction">
<ant dir="${common-solr.dir}/contrib/extraction" target="javadocs" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="resolve" depends="ivy-availability-check,ivy-fail,ivy-configure">
<sequential>
<ivy:retrieve conf="compile" type="jar,bundle" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"/>
<ivy:retrieve conf="test,test.DfsMiniCluster" type="jar,bundle,test" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"
pattern="${test.lib.dir}/[artifact]-[revision](-[classifier]).[ext]"/>
</sequential>
</target>
<target name="compile-core" depends="resolve-extraction-libs, compile-solr-extraction, solr-contrib-build.compile-core"/>
<target name="dist" depends="common-solr.dist"/>
</project>

View File

@ -1,128 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<ivy-module version="2.0" xmlns:maven="http://ant.apache.org/ivy/maven">
<info organisation="org.apache.solr" module="morphlines-core" />
<configurations defaultconfmapping="compile->master;test->master;test.DfsMiniCluster->master">
<!-- artifacts in the "compile" configuration will go into morphlines-core/lib/ -->
<conf name="compile" transitive="false" />
<!-- artifacts in the "test" and "test.DfsMiniCluster" configuration will go into morphlines-core/test-lib/ -->
<conf name="test" transitive="false" />
<conf name="test.DfsMiniCluster" transitive="false" />
</configurations>
<dependencies>
<dependency org="org.kitesdk" name="kite-morphlines-core" rev="${/org.kitesdk/kite-morphlines-core}" conf="compile;test">
<artifact name="kite-morphlines-core" ext="jar" />
<artifact name="kite-morphlines-core" type="test" ext="jar" maven:classifier="tests" />
</dependency>
<dependency org="org.kitesdk" name="kite-morphlines-avro" rev="${/org.kitesdk/kite-morphlines-avro}" conf="compile" />
<dependency org="io.dropwizard.metrics" name="metrics-core" rev="${/io.dropwizard.metrics/metrics-core}" conf="compile" />
<dependency org="io.dropwizard.metrics" name="metrics-healthchecks" rev="${/io.dropwizard.metrics/metrics-healthchecks}" conf="compile" />
<dependency org="com.typesafe" name="config" rev="${/com.typesafe/config}" conf="compile" />
<!-- Test Dependencies -->
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="${/org.apache.hadoop/hadoop-mapreduce-client-core}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-common" rev="${/org.apache.hadoop/hadoop-yarn-common}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-api" rev="${/org.apache.hadoop/hadoop-yarn-api}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-client" rev="${/org.apache.hadoop/hadoop-yarn-client}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-tests" rev="${/org.apache.hadoop/hadoop-yarn-server-tests}" conf="test">
<artifact name="hadoop-yarn-server-tests" type="test" ext="jar" maven:classifier="tests" />
</dependency>
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-common" rev="${/org.apache.hadoop/hadoop-yarn-server-common}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-nodemanager" rev="${/org.apache.hadoop/hadoop-yarn-server-nodemanager}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-app" rev="${/org.apache.hadoop/hadoop-mapreduce-client-app}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-common" rev="${/org.apache.hadoop/hadoop-mapreduce-client-common}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-hs" rev="${/org.apache.hadoop/hadoop-mapreduce-client-hs}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-resourcemanager" rev="${/org.apache.hadoop/hadoop-yarn-server-resourcemanager}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-shuffle" rev="${/org.apache.hadoop/hadoop-mapreduce-client-shuffle}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-web-proxy" rev="${/org.apache.hadoop/hadoop-yarn-server-web-proxy}" conf="test" />
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="${/org.apache.hadoop/hadoop-mapreduce-client-jobclient}" conf="test">
<artifact name="hadoop-mapreduce-client-jobclient" type="jar" ext="jar" />
<artifact name="hadoop-mapreduce-client-jobclient" type="test" ext="jar" maven:classifier="tests" />
</dependency>
<dependency org="org.apache.hadoop" name="hadoop-yarn-server-applicationhistoryservice" rev="${/org.apache.hadoop/hadoop-yarn-server-applicationhistoryservice}" conf="test"/>
<dependency org="org.fusesource.leveldbjni" name="leveldbjni" rev="${/org.fusesource.leveldbjni/leveldbjni}" conf="test"/>
<dependency org="org.iq80.leveldb" name="leveldb" rev="${/org.iq80.leveldb/leveldb}" conf="test.DfsMiniCluster"/>
<dependency org="org.iq80.leveldb" name="leveldb-api" rev="${/org.iq80.leveldb/leveldb-api}" conf="test.DfsMiniCluster"/>
<dependency org="org.apache.curator" name="curator-framework" rev="${/org.apache.curator/curator-framework}" conf="test"/>
<dependency org="org.apache.curator" name="curator-client" rev="${/org.apache.curator/curator-client}" conf="test"/>
<dependency org="aopalliance" name="aopalliance" rev="${/aopalliance/aopalliance}" conf="test" />
<dependency org="com.sun.xml.bind" name="jaxb-impl" rev="${/com.sun.xml.bind/jaxb-impl}" conf="test" />
<dependency org="io.netty" name="netty-all" rev="${/io.netty/netty-all}" conf="test" />
<dependency org="org.apache.mrunit" name="mrunit" rev="${/org.apache.mrunit/mrunit}" conf="test">
<artifact name="mrunit" maven:classifier="hadoop2" />
<exclude org="log4j" module="log4j" />
</dependency>
<!-- Mocking -->
<dependency org="org.mockito" name="mockito-core" rev="${/org.mockito/mockito-core}" conf="test"/>
<dependency org="net.bytebuddy" name="byte-buddy" rev="${/net.bytebuddy/byte-buddy}" conf="test"/>
<dependency org="org.objenesis" name="objenesis" rev="${/org.objenesis/objenesis}" conf="test"/>
<dependency org="commons-collections" name="commons-collections" rev="${/commons-collections/commons-collections}" conf="test" />
<!-- FasterXml Jackson Dependencies -->
<dependency org="com.fasterxml.jackson.core" name="jackson-core" rev="${/com.fasterxml.jackson.core/jackson-core}" conf="test" />
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="${/com.fasterxml.jackson.core/jackson-databind}" conf="test" />
<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="${/com.fasterxml.jackson.core/jackson-annotations}" conf="test" />
<!-- CodeHaus Jackson Dependencies -->
<dependency org="org.codehaus.jackson" name="jackson-jaxrs" rev="${/org.codehaus.jackson/jackson-jaxrs}" conf="test" />
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="${/org.codehaus.jackson/jackson-mapper-asl}" conf="test" />
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="${/org.codehaus.jackson/jackson-core-asl}" conf="test" />
<!-- Jersey Dependencies -->
<dependency org="com.sun.jersey.contribs" name="jersey-guice" rev="${/com.sun.jersey.contribs/jersey-guice}" conf="test" />
<dependency org="com.sun.jersey" name="jersey-core" rev="${/com.sun.jersey/jersey-core}" conf="test" />
<dependency org="com.sun.jersey" name="jersey-json" rev="${/com.sun.jersey/jersey-json}" conf="test" />
<dependency org="com.sun.jersey" name="jersey-server" rev="${/com.sun.jersey/jersey-server}" conf="test" />
<dependency org="com.sun.jersey" name="jersey-bundle" rev="${/com.sun.jersey/jersey-bundle}" conf="test" />
<!-- Guice Dependencies -->
<dependency org="com.google.inject" name="guice" rev="${/com.google.inject/guice}" conf="test" />
<dependency org="com.google.inject.extensions" name="guice-servlet" rev="${/com.google.inject.extensions/guice-servlet}" conf="test" />
<dependency org="javax.inject" name="javax.inject" rev="${/javax.inject/javax.inject}" conf="test" />
<!-- Avro Dependencies -->
<dependency org="org.apache.avro" name="avro" rev="${/org.apache.avro/avro}" conf="test" />
<dependency org="com.thoughtworks.paranamer" name="paranamer" rev="${/com.thoughtworks.paranamer/paranamer}" conf="test" />
<dependency org="org.xerial.snappy" name="snappy-java" rev="${/org.xerial.snappy/snappy-java}" conf="test" />
<!-- Hadoop DfsMiniCluster Dependencies -->
<dependency org="org.apache.hadoop" name="hadoop-common" rev="${/org.apache.hadoop/hadoop-common}" conf="test.DfsMiniCluster">
<artifact name="hadoop-common" type="jar" ext="jar" />
<artifact name="hadoop-common" type="test" ext="jar" maven:classifier="tests" />
</dependency>
<dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="${/org.apache.hadoop/hadoop-hdfs}" conf="test.DfsMiniCluster">
<artifact name="hadoop-hdfs" type="test" ext="jar" maven:classifier="tests" />
</dependency>
<dependency org="org.mortbay.jetty" name="jetty" rev="${/org.mortbay.jetty/jetty}" conf="test.DfsMiniCluster" />
<dependency org="org.mortbay.jetty" name="jetty-util" rev="${/org.mortbay.jetty/jetty-util}" conf="test.DfsMiniCluster" />
<dependency org="com.sun.jersey" name="jersey-core" rev="${/com.sun.jersey/jersey-core}" conf="test.DfsMiniCluster" />
<dependency org="org.apache.htrace" name="htrace-core" rev="${/org.apache.htrace/htrace-core}" conf="test.DfsMiniCluster"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
</dependencies>
</ivy-module>

View File

@ -1,73 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.io.IOException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
/**
* A vehicle to load a list of Solr documents into some kind of destination,
* such as a SolrServer or MapReduce RecordWriter.
*/
public interface DocumentLoader {
/** Begins a transaction */
public void beginTransaction() throws IOException, SolrServerException;
/** Loads the given document into the destination */
public void load(SolrInputDocument doc) throws IOException, SolrServerException;
/**
* Sends any outstanding documents to the destination and waits for a positive
* or negative ack (i.e. exception). Depending on the outcome the caller
* should then commit or roll back the current transaction
* accordingly.
*
* @throws IOException
* If there is a low-level I/O error.
*/
public void commitTransaction() throws IOException, SolrServerException;
/**
* Performs a rollback of all non-committed documents pending.
* <p>
* Note that this is not a true rollback as in databases. Content you have
* previously added may have already been committed due to autoCommit, buffer
* full, other client performing a commit etc. So this is only a best-effort
* rollback.
*
* @throws IOException
* If there is a low-level I/O error.
*/
public UpdateResponse rollbackTransaction() throws IOException, SolrServerException;
/** Releases allocated resources */
public void shutdown() throws IOException, SolrServerException;
/**
* Issues a ping request to check if the server is alive
*
* @throws IOException
* If there is a low-level I/O error.
*/
public SolrPingResponse ping() throws IOException, SolrServerException;
}
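
For illustration, a minimal in-memory implementation of this interface is sketched below. The class is hypothetical (not part of the removed contrib); it simply collects documents, which can be handy in unit tests.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.morphlines.solr.DocumentLoader;

/** Hypothetical DocumentLoader that only collects documents in memory. */
public class CollectingDocumentLoader implements DocumentLoader {

  private final List<SolrInputDocument> pending = new ArrayList<>();
  private final List<SolrInputDocument> committed = new ArrayList<>();

  @Override
  public void beginTransaction() {
    pending.clear();
  }

  @Override
  public void load(SolrInputDocument doc) {
    pending.add(doc);
  }

  @Override
  public void commitTransaction() {
    committed.addAll(pending);
    pending.clear();
  }

  @Override
  public UpdateResponse rollbackTransaction() {
    pending.clear();
    return new UpdateResponse(); // nothing was sent anywhere, so an empty response suffices
  }

  @Override
  public void shutdown() {
    pending.clear();
  }

  @Override
  public SolrPingResponse ping() {
    return new SolrPingResponse(); // this in-memory stub is always "alive"
  }

  public List<SolrInputDocument> getCommittedDocs() {
    return committed;
  }
}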

View File

@ -1,140 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
class FileUtils {
//-----------------------------------------------------------------------
/**
* Deletes a directory recursively.
*
* @param directory directory to delete
* @throws IOException in case deletion is unsuccessful
*/
public static void deleteDirectory(File directory) throws IOException {
if (!directory.exists()) {
return;
}
if (!isSymlink(directory)) {
cleanDirectory(directory);
}
Files.delete(directory.toPath());
}
/**
* Determines whether the specified file is a Symbolic Link rather than an actual file.
* <p>
* Will not return true if there is a Symbolic Link anywhere in the path,
* only if the specific file is.
*
* @param file the file to check
* @return true if the file is a Symbolic Link
* @throws IOException if an IO error occurs while checking the file
* @since Commons IO 2.0
*/
public static boolean isSymlink(File file) throws IOException {
if (file == null) {
throw new NullPointerException("File must not be null");
}
// if (FilenameUtils.isSystemWindows()) {
if (File.separatorChar == '\\') {
return false;
}
File fileInCanonicalDir = null;
if (file.getParent() == null) {
fileInCanonicalDir = file;
} else {
File canonicalDir = file.getParentFile().getCanonicalFile();
fileInCanonicalDir = new File(canonicalDir, file.getName());
}
if (fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile())) {
return false;
} else {
return true;
}
}
/**
* Cleans a directory without deleting it.
*
* @param directory directory to clean
* @throws IOException in case cleaning is unsuccessful
*/
public static void cleanDirectory(File directory) throws IOException {
if (!directory.exists()) {
String message = directory + " does not exist";
throw new IllegalArgumentException(message);
}
if (!directory.isDirectory()) {
String message = directory + " is not a directory";
throw new IllegalArgumentException(message);
}
File[] files = directory.listFiles();
if (files == null) { // null if security restricted
throw new IOException("Failed to list contents of " + directory);
}
IOException exception = null;
for (File file : files) {
try {
forceDelete(file);
} catch (IOException ioe) {
exception = ioe;
}
}
if (null != exception) {
throw exception;
}
}
//-----------------------------------------------------------------------
/**
* Deletes a file. If file is a directory, delete it and all sub-directories.
* <p>
* The differences between File.delete() and this method are:
* <ul>
* <li>A directory to be deleted does not have to be empty.</li>
* <li>You get exceptions when a file or directory cannot be deleted.
* (java.io.File methods returns a boolean)</li>
* </ul>
*
* @param file file or directory to delete, must not be <code>null</code>
* @throws NullPointerException if the directory is <code>null</code>
* @throws FileNotFoundException if the file was not found
* @throws IOException in case deletion is unsuccessful
*/
public static void forceDelete(File file) throws IOException {
if (file.isDirectory()) {
deleteDirectory(file);
} else {
Files.delete(file.toPath());
}
}
}

View File

@ -1,143 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.security.SecureRandom;
import java.util.Arrays;
import java.util.Collection;
import java.util.Random;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications;
import com.typesafe.config.Config;
/**
* A command that assigns a record unique key that is the concatenation of the given
* <code>baseIdField</code> record field, followed by a running count of the record number within
* the current session. The count is reset to zero whenever a "startSession" notification is
* received.
* <p>
* For example, assume a CSV file containing multiple records but no unique ids, and the
* <code>baseIdField</code> field is the filesystem path of the file. Now this command can be used
* to assign the following record values to Solr's unique key field:
* <code>$path#0, $path#1, ... $path#N</code>.
* <p>
* The name of the unique key field is fetched from Solr's schema.xml file, as directed by the
* <code>solrLocator</code> configuration parameter.
*/
public final class GenerateSolrSequenceKeyBuilder implements CommandBuilder {
@Override
public Collection<String> getNames() {
return Arrays.asList(
"generateSolrSequenceKey",
"sanitizeUniqueSolrKey" // old name (retained for backwards compatibility)
);
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new GenerateSolrSequenceKey(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class GenerateSolrSequenceKey extends AbstractCommand {
private final boolean preserveExisting;
private final String baseIdFieldName;
private final String uniqueKeyName;
private long recordCounter = 0;
private final String idPrefix; // for load testing only; enables adding same document many times with a different unique key
private final Random randomIdPrefix; // for load testing only; enables adding same document many times with a different unique key
public GenerateSolrSequenceKey(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
this.baseIdFieldName = getConfigs().getString(config, "baseIdField", Fields.BASE_ID);
this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);
Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
LOG.debug("solrLocator: {}", locator);
IndexSchema schema = locator.getIndexSchema();
SchemaField uniqueKey = schema.getUniqueKeyField();
uniqueKeyName = uniqueKey == null ? null : uniqueKey.getName();
String tmpIdPrefix = getConfigs().getString(config, "idPrefix", null); // for load testing only
Random tmpRandomIdPrefx = null;
if ("random".equals(tmpIdPrefix)) { // for load testing only
tmpRandomIdPrefx = new Random(new SecureRandom().nextLong());
tmpIdPrefix = null;
}
idPrefix = tmpIdPrefix;
randomIdPrefix = tmpRandomIdPrefx;
validateArguments();
}
@Override
protected boolean doProcess(Record doc) {
long num = recordCounter++;
// LOG.debug("record #{} id before sanitizing doc: {}", num, doc);
if (uniqueKeyName == null || (preserveExisting && doc.getFields().containsKey(uniqueKeyName))) {
; // we must preserve the existing id
} else {
Object baseId = doc.getFirstValue(baseIdFieldName);
if (baseId == null) {
throw new MorphlineRuntimeException("Record field " + baseIdFieldName
+ " must not be null as it is needed as a basis for a unique key for solr doc: " + doc);
}
doc.replaceValues(uniqueKeyName, baseId.toString() + "#" + num);
}
// for load testing only; enables adding same document many times with a different unique key
if (idPrefix != null) {
String id = doc.getFirstValue(uniqueKeyName).toString();
id = idPrefix + id;
doc.replaceValues(uniqueKeyName, id);
} else if (randomIdPrefix != null) {
String id = doc.getFirstValue(uniqueKeyName).toString();
id = String.valueOf(Math.abs(randomIdPrefix.nextInt())) + "#" + id;
doc.replaceValues(uniqueKeyName, id);
}
LOG.debug("record #{} unique key sanitized to this: {}", num, doc);
return super.doProcess(doc);
}
@Override
protected void doNotify(Record notification) {
if (Notifications.containsLifecycleEvent(notification, Notifications.LifecycleEvent.START_SESSION)) {
recordCounter = 0; // reset
}
super.doNotify(notification);
}
}
}

View File

@ -1,153 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Metrics;
import org.kitesdk.morphline.base.Notifications;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Timer;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
* A command that loads a record into a SolrServer or MapReduce SolrOutputFormat.
*/
public final class LoadSolrBuilder implements CommandBuilder {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final AtomicBoolean WARNED_ABOUT_INDEX_TIME_BOOSTS = new AtomicBoolean();
@Override
public Collection<String> getNames() {
return Collections.singletonList("loadSolr");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new LoadSolr(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class LoadSolr extends AbstractCommand {
private final DocumentLoader loader;
private final Timer elapsedTime;
public LoadSolr(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
LOG.debug("solrLocator: {}", locator);
this.loader = locator.getLoader();
Config boostsConfig = getConfigs().getConfig(config, "boosts", ConfigFactory.empty());
if (new Configs().getEntrySet(boostsConfig).isEmpty() == false) {
String message = "Ignoring field boosts: as index-time boosts are not supported anymore";
if (WARNED_ABOUT_INDEX_TIME_BOOSTS.compareAndSet(false, true)) {
log.warn(message);
} else {
log.debug(message);
}
}
validateArguments();
this.elapsedTime = getTimer(Metrics.ELAPSED_TIME);
}
@Override
protected void doNotify(Record notification) {
for (Object event : Notifications.getLifecycleEvents(notification)) {
if (event == Notifications.LifecycleEvent.BEGIN_TRANSACTION) {
try {
loader.beginTransaction();
} catch (SolrServerException | IOException e) {
throw new MorphlineRuntimeException(e);
}
} else if (event == Notifications.LifecycleEvent.COMMIT_TRANSACTION) {
try {
loader.commitTransaction();
} catch (SolrServerException | IOException e) {
throw new MorphlineRuntimeException(e);
}
}
else if (event == Notifications.LifecycleEvent.ROLLBACK_TRANSACTION) {
try {
loader.rollbackTransaction();
} catch (SolrServerException | IOException e) {
throw new MorphlineRuntimeException(e);
}
}
else if (event == Notifications.LifecycleEvent.SHUTDOWN) {
try {
loader.shutdown();
} catch (SolrServerException | IOException e) {
throw new MorphlineRuntimeException(e);
}
}
}
super.doNotify(notification);
}
@Override
protected boolean doProcess(Record record) {
Timer.Context timerContext = elapsedTime.time();
SolrInputDocument doc = convert(record);
try {
loader.load(doc);
} catch (IOException | SolrServerException e) {
throw new MorphlineRuntimeException(e);
} finally {
timerContext.stop();
}
// pass record to next command in chain:
return super.doProcess(record);
}
private SolrInputDocument convert(Record record) {
Map<String, Collection<Object>> map = record.getFields().asMap();
SolrInputDocument doc = new SolrInputDocument(new HashMap(2 * map.size()));
for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) {
String key = entry.getKey();
doc.setField(key, entry.getValue());
}
return doc;
}
}
}

View File

@ -1,70 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.lang.invoke.MethodHandles;
import org.apache.http.client.HttpClient;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* ConcurrentUpdateSolrClient that propagates exceptions up to the submitter of
* requests on blockUntilFinished()
*/
final class SafeConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient {
private Throwable currentException = null;
private final Object myLock = new Object();
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public SafeConcurrentUpdateSolrClient(String solrServerUrl, int queueSize, int threadCount) {
this(solrServerUrl, null, queueSize, threadCount);
}
public SafeConcurrentUpdateSolrClient(String solrServerUrl, HttpClient client, int queueSize, int threadCount) {
super(solrServerUrl, client, queueSize, threadCount, null, false);
}
@Override
public void handleError(Throwable ex) {
assert ex != null;
synchronized (myLock) {
currentException = ex;
}
LOGGER.error("handleError", ex);
}
@Override
public void blockUntilFinished() {
super.blockUntilFinished();
synchronized (myLock) {
if (currentException != null) {
throw new RuntimeException(currentException);
}
}
}
public void clearException() {
synchronized (myLock) {
currentException = null;
}
}
}
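
A brief usage sketch follows; it is hypothetical, assumes same-package code (the class is package-private), and the URL, queue size, and thread count are illustrative.

package org.apache.solr.morphlines.solr; // required: the client class is package-private

import org.apache.solr.common.SolrInputDocument;

class SafeClientUsageSketch {
  void indexOne(SolrInputDocument doc) throws Exception {
    SafeConcurrentUpdateSolrClient client =
        new SafeConcurrentUpdateSolrClient("http://localhost:8983/solr/collection1", 100, 2);
    try {
      client.add(doc);              // queued and flushed asynchronously by worker threads
      client.blockUntilFinished();  // rethrows anything captured by handleError()
    } catch (RuntimeException e) {
      client.clearException();      // reset so the client instance can be reused
      throw e;
    } finally {
      client.close();
    }
  }
}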

View File

@ -1,101 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.solr.schema.IndexSchema;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import com.typesafe.config.Config;
/**
* Command that sanitizes record fields that are unknown to Solr schema.xml by either deleting them
* (renameToPrefix is absent or a zero length string), or by moving them to a field prefixed with
* the given renameToPrefix (e.g. renameToPrefix = "ignored_" to use typical dynamic Solr fields).
* <p>
* Recall that Solr throws an exception on any attempt to load a document that contains a field that
* isn't specified in schema.xml.
*/
public final class SanitizeUnknownSolrFieldsBuilder implements CommandBuilder {
@Override
public Collection<String> getNames() {
return Collections.singletonList("sanitizeUnknownSolrFields");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new SanitizeUnknownSolrFields(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class SanitizeUnknownSolrFields extends AbstractCommand {
private final IndexSchema schema;
private final String renameToPrefix;
public SanitizeUnknownSolrFields(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
LOG.debug("solrLocator: {}", locator);
this.schema = Objects.requireNonNull(locator.getIndexSchema());
if (LOG.isTraceEnabled()) {
LOG.trace("Solr schema: \n" +
schema.getFields().entrySet().stream().sorted(Map.Entry.comparingByKey())
.map(Map.Entry::getValue).map(Object::toString).collect(Collectors.joining("\n"))
);
}
String str = getConfigs().getString(config, "renameToPrefix", "").trim();
this.renameToPrefix = str.length() > 0 ? str : null;
validateArguments();
}
@Override
protected boolean doProcess(Record record) {
Collection<Map.Entry> entries = new ArrayList<Map.Entry>(record.getFields().asMap().entrySet());
for (Map.Entry<String, Collection<Object>> entry : entries) {
String key = entry.getKey();
if (schema.getFieldOrNull(key) == null) {
LOG.debug("Sanitizing unknown Solr field: {}", key);
Collection values = entry.getValue();
if (renameToPrefix != null) {
record.getFields().putAll(renameToPrefix + key, values);
}
values.clear(); // implicitly removes key from record
}
}
return super.doProcess(record);
}
}
}

View File

@ -1,124 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
/**
* A vehicle to load a list of Solr documents into a local or remote {@link org.apache.solr.client.solrj.SolrClient}.
*/
public class SolrClientDocumentLoader implements DocumentLoader {
private final SolrClient client; // proxy to local or remote solr server
private long numLoadedDocs = 0; // number of documents loaded in the current transaction
private final int batchSize;
private final List<SolrInputDocument> batch = new ArrayList();
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public SolrClientDocumentLoader(SolrClient client, int batchSize) {
if (client == null) {
throw new IllegalArgumentException("solr server must not be null");
}
this.client = client;
if (batchSize <= 0) {
throw new IllegalArgumentException("batchSize must be a positive number: " + batchSize);
}
this.batchSize = batchSize;
}
@Override
public void beginTransaction() {
LOGGER.trace("beginTransaction");
batch.clear();
numLoadedDocs = 0;
if (client instanceof SafeConcurrentUpdateSolrClient) {
((SafeConcurrentUpdateSolrClient) client).clearException();
}
}
@Override
public void load(SolrInputDocument doc) throws IOException, SolrServerException {
LOGGER.trace("load doc: {}", doc);
batch.add(doc);
if (batch.size() >= batchSize) {
loadBatch();
}
}
@Override
public void commitTransaction() throws SolrServerException, IOException {
LOGGER.trace("commitTransaction");
if (batch.size() > 0) {
loadBatch();
}
if (numLoadedDocs > 0) {
if (client instanceof ConcurrentUpdateSolrClient) {
((ConcurrentUpdateSolrClient) client).blockUntilFinished();
}
}
}
private void loadBatch() throws SolrServerException, IOException {
numLoadedDocs += batch.size();
try {
UpdateResponse rsp = client.add(batch);
} finally {
batch.clear();
}
}
@Override
public UpdateResponse rollbackTransaction() throws SolrServerException, IOException {
LOGGER.trace("rollback");
if (!(client instanceof CloudSolrClient)) {
return client.rollback();
} else {
return new UpdateResponse();
}
}
@Override
public void shutdown() throws IOException {
LOGGER.trace("shutdown");
client.close();
}
@Override
public SolrPingResponse ping() throws SolrServerException, IOException {
LOGGER.trace("ping");
return client.ping();
}
public SolrClient getSolrClient() {
return client;
}
}
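
A short usage sketch for this loader follows; the HttpSolrClient, URL, and field names are illustrative assumptions, and error handling is omitted for brevity. Note that commitTransaction() only flushes buffered documents; it does not issue a Solr commit.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.morphlines.solr.SolrClientDocumentLoader;

public class LoaderUsageSketch {
  public static void main(String[] args) throws Exception {
    SolrClient solr = new HttpSolrClient.Builder()
        .withBaseSolrUrl("http://localhost:8983/solr/collection1")  // illustrative URL
        .build();
    SolrClientDocumentLoader loader = new SolrClientDocumentLoader(solr, 100); // batches of 100 docs

    loader.beginTransaction();
    SolrInputDocument doc = new SolrInputDocument();
    doc.setField("id", "doc-1");
    doc.setField("text", "hello morphlines");
    loader.load(doc);            // buffered; sent once the batch size is reached
    loader.commitTransaction();  // flushes any remaining buffered documents
    loader.shutdown();           // closes the underlying SolrClient
  }
}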

View File

@ -1,254 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.Objects;
import com.google.common.io.Files;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;
import com.typesafe.config.ConfigUtil;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient.Builder;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.SystemIdResolver;
import org.apache.zookeeper.KeeperException;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.base.Configs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Set of configuration parameters that identify the location and schema of a Solr server or
* SolrCloud; based on this information this class can return the schema and a corresponding
* {@link DocumentLoader}.
*/
public class SolrLocator {
private Config config;
private MorphlineContext context;
private String collectionName;
private String zkHost;
private String solrUrl;
private String solrHomeDir;
private int batchSize = 1000;
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
protected SolrLocator(MorphlineContext context) {
this.context = Objects.requireNonNull(context);
}
public SolrLocator(Config config, MorphlineContext context) {
this(context);
this.config = config;
Configs configs = new Configs();
collectionName = configs.getString(config, "collection", null);
zkHost = configs.getString(config, "zkHost", null);
solrHomeDir = configs.getString(config, "solrHomeDir", null);
solrUrl = configs.getString(config, "solrUrl", null);
batchSize = configs.getInt(config, "batchSize", batchSize);
LOG.trace("Constructed solrLocator: {}", this);
configs.validateArguments(config);
}
public DocumentLoader getLoader() {
if (context instanceof SolrMorphlineContext) {
DocumentLoader loader = ((SolrMorphlineContext)context).getDocumentLoader();
if (loader != null) {
return loader;
}
}
if (zkHost != null && zkHost.length() > 0) {
if (collectionName == null || collectionName.length() == 0) {
throw new MorphlineCompilationException("Parameter 'zkHost' requires that you also pass parameter 'collection'", config);
}
CloudSolrClient cloudSolrClient = new Builder()
.withZkHost(zkHost)
.build();
cloudSolrClient.setDefaultCollection(collectionName);
cloudSolrClient.connect();
return new SolrClientDocumentLoader(cloudSolrClient, batchSize);
} else {
if (solrUrl == null || solrUrl.length() == 0) {
throw new MorphlineCompilationException("Missing parameter 'solrUrl'", config);
}
int solrServerNumThreads = 2;
int solrServerQueueLength = solrServerNumThreads;
SolrClient server = new SafeConcurrentUpdateSolrClient(solrUrl, solrServerQueueLength, solrServerNumThreads);
// SolrServer server = new HttpSolrServer(solrServerUrl);
// SolrServer server = new ConcurrentUpdateSolrServer(solrServerUrl, solrServerQueueLength, solrServerNumThreads);
// server.setParser(new XMLResponseParser()); // binary parser is used by default
return new SolrClientDocumentLoader(server, batchSize);
}
}
public IndexSchema getIndexSchema() {
if (context instanceof SolrMorphlineContext) {
IndexSchema schema = ((SolrMorphlineContext)context).getIndexSchema();
if (schema != null) {
validateSchema(schema);
return schema;
}
}
File downloadedSolrHomeDir = null;
try {
// If solrHomeDir isn't defined and zkHost and collectionName are defined
// then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir
String mySolrHomeDir = solrHomeDir;
if (solrHomeDir == null || solrHomeDir.length() == 0) {
if (zkHost == null || zkHost.length() == 0) {
// TODO: implement download from solrUrl if specified
throw new MorphlineCompilationException(
"Downloading a Solr schema requires either parameter 'solrHomeDir' or parameters 'zkHost' and 'collection'",
config);
}
if (collectionName == null || collectionName.length() == 0) {
throw new MorphlineCompilationException(
"Parameter 'zkHost' requires that you also pass parameter 'collection'", config);
}
ZooKeeperDownloader zki = new ZooKeeperDownloader();
SolrZkClient zkClient = zki.getZkClient(zkHost);
try {
String configName = zki.readConfigName(zkClient, collectionName);
downloadedSolrHomeDir = Files.createTempDir();
downloadedSolrHomeDir = zki.downloadConfigDir(zkClient, configName, downloadedSolrHomeDir);
mySolrHomeDir = downloadedSolrHomeDir.getAbsolutePath();
} catch (KeeperException | InterruptedException | IOException e) {
throw new MorphlineCompilationException("Cannot download schema.xml from ZooKeeper", config, e);
} finally {
zkClient.close();
}
}
LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir);
try {
SolrResourceLoader loader = new SolrResourceLoader(Paths.get(mySolrHomeDir));
SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null);
InputSource is = new InputSource(loader.openSchema("schema.xml"));
is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));
IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is);
validateSchema(schema);
return schema;
} catch (ParserConfigurationException | IOException | SAXException e) {
throw new MorphlineRuntimeException(e);
}
} finally {
if (downloadedSolrHomeDir != null) {
try {
FileUtils.deleteDirectory(downloadedSolrHomeDir);
} catch (IOException e) {
LOG.warn("Cannot delete tmp directory", e);
}
}
}
}
private void validateSchema(IndexSchema schema) {
if (schema.getUniqueKeyField() == null) {
throw new MorphlineCompilationException("Solr schema.xml is missing unique key field", config);
}
if (!schema.getUniqueKeyField().isRequired()) {
throw new MorphlineCompilationException("Solr schema.xml must contain a required unique key field", config);
}
}
@Override
public String toString() {
return toConfig(null).root().render(ConfigRenderOptions.concise());
}
public Config toConfig(String key) {
String json = "";
if (key != null) {
json = toJson(key) + " : ";
}
json +=
"{" +
" collection : " + toJson(collectionName) + ", " +
" zkHost : " + toJson(zkHost) + ", " +
" solrUrl : " + toJson(solrUrl) + ", " +
" solrHomeDir : " + toJson(solrHomeDir) + ", " +
" batchSize : " + toJson(batchSize) + " " +
"}";
return ConfigFactory.parseString(json);
}
private String toJson(Object key) {
String str = key == null ? "" : key.toString();
str = ConfigUtil.quoteString(str);
return str;
}
public String getCollectionName() {
return this.collectionName;
}
public void setCollectionName(String collectionName) {
this.collectionName = collectionName;
}
public String getZkHost() {
return this.zkHost;
}
public void setZkHost(String zkHost) {
this.zkHost = zkHost;
}
public String getSolrHomeDir() {
return this.solrHomeDir;
}
public void setSolrHomeDir(String solrHomeDir) {
this.solrHomeDir = solrHomeDir;
}
public String getServerUrl() {
return this.solrUrl;
}
public void setServerUrl(String solrUrl) {
this.solrUrl = solrUrl;
}
public int getBatchSize() {
return this.batchSize;
}
public void setBatchSize(int batchSize) {
this.batchSize = batchSize;
}
}
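
As a sketch, a SolrLocator can also be built programmatically from a small HOCON snippet; the URL and batch size below are illustrative, and the keys match the ones the constructor reads.

import com.codahale.metrics.MetricRegistry;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.morphlines.solr.SolrMorphlineContext;
import org.kitesdk.morphline.api.MorphlineContext;

public class SolrLocatorSketch {
  public static void main(String[] args) throws Exception {
    // Keys the constructor reads: collection, zkHost, solrUrl, solrHomeDir, batchSize.
    Config config = ConfigFactory.parseString(
        "{ solrUrl : \"http://localhost:8983/solr/collection1\", batchSize : 100 }");

    MorphlineContext context = new SolrMorphlineContext.Builder()
        .setMetricRegistry(new MetricRegistry())
        .build();
    SolrLocator locator = new SolrLocator(config, context);

    // With solrUrl set (and no zkHost) this returns a SafeConcurrentUpdateSolrClient-backed loader.
    DocumentLoader loader = locator.getLoader();
    loader.beginTransaction();
    // ... loader.load(doc) for each document ...
    loader.commitTransaction();
    loader.shutdown();
  }
}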

View File

@ -1,80 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import org.apache.solr.schema.IndexSchema;
import org.kitesdk.morphline.api.MorphlineContext;
/**
* A context that is specific to Solr.
*/
public class SolrMorphlineContext extends MorphlineContext {
private DocumentLoader loader;
private IndexSchema schema;
/** For public access use {@link Builder#build()} instead */
protected SolrMorphlineContext() {}
public DocumentLoader getDocumentLoader() {
return loader;
}
public IndexSchema getIndexSchema() {
return schema;
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* Helper to construct a {@link SolrMorphlineContext} instance.
*/
public static class Builder extends MorphlineContext.Builder {
private DocumentLoader loader;
private IndexSchema schema;
public Builder() {}
public Builder setDocumentLoader(DocumentLoader loader) {
this.loader = loader;
return this;
}
public Builder setIndexSchema(IndexSchema schema) {
this.schema = schema;
return this;
}
@Override
public SolrMorphlineContext build() {
((SolrMorphlineContext)context).loader = loader;
((SolrMorphlineContext)context).schema = schema;
return (SolrMorphlineContext) super.build();
}
@Override
protected SolrMorphlineContext create() {
return new SolrMorphlineContext();
}
}
}

Some files were not shown because too many files have changed in this diff.