From 3a9aadc3c76595d2c2cfc9abc2a5d328f05e936b Mon Sep 17 00:00:00 2001 From: Suresh Srinivas Date: Fri, 12 Oct 2012 04:48:40 +0000 Subject: [PATCH] HADOOP-8911. CRLF characters in source and text files (trunk equivalent patch). Contributed Raja Aluri. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1397435 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 3 + .../src/main/docs/releasenotes.html | 464 ++-- .../apache/hadoop/metrics/ContextFactory.java | 422 +-- .../apache/hadoop/metrics/MetricsContext.java | 244 +- .../hadoop/metrics/MetricsException.java | 94 +- .../apache/hadoop/metrics/MetricsRecord.java | 502 ++-- .../hadoop/metrics/file/FileContext.java | 308 +-- .../metrics/spi/AbstractMetricsContext.java | 962 +++---- .../hadoop/metrics/spi/MetricsRecordImpl.java | 562 ++-- .../org/apache/hadoop/util/DataChecksum.java | 920 +++---- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../documentation/content/xdocs/libhdfs.xml | 220 +- hadoop-mapreduce-project/CHANGES.txt | 3 + .../TestClientProtocolProviderImpls.java | 240 +- .../TestYarnClientProtocolProvider.java | 258 +- .../org/apache/hadoop/examples/WordMean.java | 390 +-- .../apache/hadoop/examples/WordMedian.java | 416 +-- .../examples/WordStandardDeviation.java | 418 +-- .../apache/hadoop/examples/TestWordStats.java | 544 ++-- .../src/contrib/index/sample/data.txt | 20 +- .../src/contrib/index/sample/data2.txt | 20 +- .../example/HashingDistributionPolicy.java | 112 +- .../index/example/IdentityLocalAnalysis.java | 114 +- .../index/example/LineDocInputFormat.java | 92 +- .../index/example/LineDocLocalAnalysis.java | 160 +- .../index/example/LineDocRecordReader.java | 462 ++-- .../index/example/LineDocTextAndOp.java | 184 +- .../example/RoundRobinDistributionPolicy.java | 116 +- .../lucene/LuceneIndexFileNameFilter.java | 110 +- .../contrib/index/lucene/LuceneUtil.java | 224 +- .../index/lucene/MixedDeletionPolicy.java | 98 +- .../contrib/index/lucene/MixedDirectory.java | 370 +-- .../index/lucene/RAMDirectoryUtil.java | 238 +- .../contrib/index/lucene/ShardWriter.java | 466 ++-- .../contrib/index/main/UpdateIndex.java | 548 ++-- .../contrib/index/mapred/DocumentAndOp.java | 416 +-- .../contrib/index/mapred/DocumentID.java | 178 +- .../index/mapred/IDistributionPolicy.java | 100 +- .../contrib/index/mapred/IIndexUpdater.java | 92 +- .../contrib/index/mapred/ILocalAnalysis.java | 64 +- .../index/mapred/IndexUpdateCombiner.java | 222 +- .../mapred/IndexUpdateConfiguration.java | 512 ++-- .../index/mapred/IndexUpdateMapper.java | 398 +-- .../index/mapred/IndexUpdatePartitioner.java | 120 +- .../index/mapred/IndexUpdateReducer.java | 286 +-- .../index/mapred/IntermediateForm.java | 504 ++-- .../index/lucene/TestMixedDirectory.java | 210 +- .../index/mapred/TestDistributionPolicy.java | 466 ++-- .../index/mapred/TestIndexUpdater.java | 510 ++-- hadoop-yarn-project/CHANGES.txt | 3 + .../src/site/apt/ClusterSetup.apt.vm | 2252 ++++++++--------- 51 files changed, 8326 insertions(+), 8314 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 827a58d37ee..864ece0b171 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -41,6 +41,9 @@ Release 2.0.3-alpha - Unreleased HADOOP-8909. Hadoop Common Maven protoc calls must not depend on external sh script. (Chris Nauroth via suresh) + HADOOP-8911. CRLF characters in source and text files. 
+ (Raja Aluri via suresh) + OPTIMIZATIONS HADOOP-8866. SampleQuantiles#query is O(N^2) instead of O(N). (Andrew Wang diff --git a/hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html b/hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html index e3915caf83a..bd13cfb9b61 100644 --- a/hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html +++ b/hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html @@ -15,8 +15,8 @@ These release notes include new developer and user-facing incompatibilities, fea
  • YARN-137. Major improvement reported by Siddharth Seth and fixed by Siddharth Seth (scheduler)
    Change the default scheduler to the CapacityScheduler
    -
    There's some bugs in the FifoScheduler atm - doesn't distribute tasks across nodes and some headroom (available resource) issues. -That's not the best experience for users trying out the 2.0 branch. The CS with the default configuration of a single queue behaves the same as the FifoScheduler and doesn't have these issues. +
    There's some bugs in the FifoScheduler atm - doesn't distribute tasks across nodes and some headroom (available resource) issues. +That's not the best experience for users trying out the 2.0 branch. The CS with the default configuration of a single queue behaves the same as the FifoScheduler and doesn't have these issues.
  • YARN-108. Critical bug reported by Jason Lowe and fixed by Jason Lowe (nodemanager)
    @@ -45,73 +45,73 @@ That's not the best experience for users trying out the 2.0 branch. The CS with
  • YARN-79. Major bug reported by Bikas Saha and fixed by Vinod Kumar Vavilapalli (client)
    Calling YarnClientImpl.close throws Exception
    -
    The following exception is thrown -=========== -*org.apache.hadoop.HadoopIllegalArgumentException: Cannot close proxy - is not Closeable or does not provide closeable invocation handler class org.apache.hadoop.yarn.api.impl.pb.client.ClientRMProtocolPBClientImpl* - *at org.apache.hadoop.ipc.RPC.stopProxy(RPC.java:624)* - *at org.hadoop.yarn.client.YarnClientImpl.stop(YarnClientImpl.java:102)* - at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.run(UnmanagedAMLauncher.java:336) - at org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher.testDSShell(TestUnmanagedAMLauncher.java:156) - at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) - at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) - at java.lang.reflect.Method.invoke(Method.java:597) - at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44) - at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15) - at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41) - at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20) - at org.junit.runners.BlockJUnit4ClassRunner.runNotIgnored(BlockJUnit4ClassRunner.java:79) - at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:71) - at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:49) - at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193) - at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52) - at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191) - at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42) - at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184) - at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:28) - at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:31) - at org.junit.runners.ParentRunner.run(ParentRunner.java:236) - at org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:236) - at org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:134) - at org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:113) - at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) - at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) - at java.lang.reflect.Method.invoke(Method.java:597) - at org.apache.maven.surefire.util.ReflectionUtils.invokeMethodWithArray(ReflectionUtils.java:189) - at org.apache.maven.surefire.booter.ProviderFactory$ProviderProxy.invoke(ProviderFactory.java:165) - at org.apache.maven.surefire.booter.ProviderFactory.invokeProvider(ProviderFactory.java:85) - at org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:103) - at org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:74) +
    The following exception is thrown +=========== +*org.apache.hadoop.HadoopIllegalArgumentException: Cannot close proxy - is not Closeable or does not provide closeable invocation handler class org.apache.hadoop.yarn.api.impl.pb.client.ClientRMProtocolPBClientImpl* + *at org.apache.hadoop.ipc.RPC.stopProxy(RPC.java:624)* + *at org.hadoop.yarn.client.YarnClientImpl.stop(YarnClientImpl.java:102)* + at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.run(UnmanagedAMLauncher.java:336) + at org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher.testDSShell(TestUnmanagedAMLauncher.java:156) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) + at java.lang.reflect.Method.invoke(Method.java:597) + at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44) + at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15) + at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41) + at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20) + at org.junit.runners.BlockJUnit4ClassRunner.runNotIgnored(BlockJUnit4ClassRunner.java:79) + at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:71) + at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:49) + at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193) + at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52) + at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191) + at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42) + at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184) + at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:28) + at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:31) + at org.junit.runners.ParentRunner.run(ParentRunner.java:236) + at org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:236) + at org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:134) + at org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:113) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) + at java.lang.reflect.Method.invoke(Method.java:597) + at org.apache.maven.surefire.util.ReflectionUtils.invokeMethodWithArray(ReflectionUtils.java:189) + at org.apache.maven.surefire.booter.ProviderFactory$ProviderProxy.invoke(ProviderFactory.java:165) + at org.apache.maven.surefire.booter.ProviderFactory.invokeProvider(ProviderFactory.java:85) + at org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:103) + at org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:74) ===========
  • YARN-75. Major bug reported by Siddharth Seth and fixed by Siddharth Seth
    RMContainer should handle a RELEASE event while RUNNING
    -
    An AppMaster can send a container release at any point. Currently this results in an exception, if this is done while the RM considers the container to be RUNNING. +
    An AppMaster can send a container release at any point. Currently this results in an exception, if this is done while the RM considers the container to be RUNNING. The event not being processed correctly also implies that these containers do not show up in the Completed Container List seen by the AM (AMRMProtocol). MR-3902 depends on this set being complete.
  • YARN-68. Major bug reported by patrick white and fixed by Daryn Sharp (nodemanager)
    NodeManager will refuse to shutdown indefinitely due to container log aggregation
    -
    The nodemanager is able to get into a state where containermanager.logaggregation.AppLogAggregatorImpl will apparently wait -indefinitely for log aggregation to complete for an application, even if that application has abnormally terminated and is no longer present. - -Observed behavior is that an attempt to stop the nodemanager daemon will return but have no effect, the nm log continually displays messages similar to this: - -[Thread-1]2012-08-21 17:44:07,581 INFO -org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl: -Waiting for aggregation to complete for application_1345221477405_2733 - -The only recovery we found to work was to 'kill -9' the nm process. - -What exactly causes the NM to enter this state is unclear but we do see this behavior reliably when the NM has run a task which failed, for example when debugging oozie distcp actions and having a distcp map task fail, the NM that was running the container will now enter this state where a shutdown on said NM will never complete, 'never' in this case was waiting for 2 hours before killing the nodemanager process. +
    The nodemanager is able to get into a state where containermanager.logaggregation.AppLogAggregatorImpl will apparently wait +indefinitely for log aggregation to complete for an application, even if that application has abnormally terminated and is no longer present. + +Observed behavior is that an attempt to stop the nodemanager daemon will return but have no effect, the nm log continually displays messages similar to this: + +[Thread-1]2012-08-21 17:44:07,581 INFO +org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl: +Waiting for aggregation to complete for application_1345221477405_2733 + +The only recovery we found to work was to 'kill -9' the nm process. + +What exactly causes the NM to enter this state is unclear but we do see this behavior reliably when the NM has run a task which failed, for example when debugging oozie distcp actions and having a distcp map task fail, the NM that was running the container will now enter this state where a shutdown on said NM will never complete, 'never' in this case was waiting for 2 hours before killing the nodemanager process.
  • YARN-66. Critical bug reported by Thomas Graves and fixed by Thomas Graves (nodemanager)
    aggregated logs permissions not set properly
    -
    If the default file permissions are set to something restrictive - like 700, application logs get aggregated and created with those restrictive file permissions which doesn't allow the history server to serve them up. - - -They need to be created with group readable similar to how log aggregation sets up the directory permissions. +
    If the default file permissions are set to something restrictive - like 700, application logs get aggregated and created with those restrictive file permissions which doesn't allow the history server to serve them up. + + +They need to be created with group readable similar to how log aggregation sets up the directory permissions.
  • YARN-63. Major bug reported by Jason Lowe and fixed by Jason Lowe (resourcemanager)
    @@ -128,47 +128,47 @@ They need to be created with group readable similar to how log aggregation sets
  • YARN-42. Major bug reported by Devaraj K and fixed by Devaraj K (nodemanager)
    Node Manager throws NPE on startup
    -
    NM throws NPE on startup if it doesn't have persmission's on nm local dir's - - -{code:xml} -2012-05-14 16:32:13,468 FATAL org.apache.hadoop.yarn.server.nodemanager.NodeManager: Error starting NodeManager -org.apache.hadoop.yarn.YarnException: Failed to initialize LocalizationService - at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.init(ResourceLocalizationService.java:202) - at org.apache.hadoop.yarn.service.CompositeService.init(CompositeService.java:58) - at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.init(ContainerManagerImpl.java:183) - at org.apache.hadoop.yarn.service.CompositeService.init(CompositeService.java:58) - at org.apache.hadoop.yarn.server.nodemanager.NodeManager.init(NodeManager.java:166) - at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:268) - at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:284) -Caused by: java.io.IOException: mkdir of /mrv2/tmp/nm-local-dir/usercache failed - at org.apache.hadoop.fs.FileSystem.primitiveMkdir(FileSystem.java:907) - at org.apache.hadoop.fs.DelegateToFileSystem.mkdir(DelegateToFileSystem.java:143) - at org.apache.hadoop.fs.FilterFs.mkdir(FilterFs.java:189) - at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:706) - at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:703) - at org.apache.hadoop.fs.FileContext$FSLinkResolver.resolve(FileContext.java:2325) - at org.apache.hadoop.fs.FileContext.mkdir(FileContext.java:703) - at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.init(ResourceLocalizationService.java:188) - ... 6 more -2012-05-14 16:32:13,472 INFO org.apache.hadoop.yarn.service.CompositeService: Error stopping org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler -java.lang.NullPointerException - at org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler.stop(NonAggregatingLogHandler.java:82) - at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:99) - at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:89) - at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.stop(ContainerManagerImpl.java:266) - at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:99) - at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:89) - at org.apache.hadoop.yarn.server.nodemanager.NodeManager.stop(NodeManager.java:182) - at org.apache.hadoop.yarn.service.CompositeService$CompositeServiceShutdownHook.run(CompositeService.java:122) - at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54) -{code} +
    NM throws NPE on startup if it doesn't have persmission's on nm local dir's + + +{code:xml} +2012-05-14 16:32:13,468 FATAL org.apache.hadoop.yarn.server.nodemanager.NodeManager: Error starting NodeManager +org.apache.hadoop.yarn.YarnException: Failed to initialize LocalizationService + at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.init(ResourceLocalizationService.java:202) + at org.apache.hadoop.yarn.service.CompositeService.init(CompositeService.java:58) + at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.init(ContainerManagerImpl.java:183) + at org.apache.hadoop.yarn.service.CompositeService.init(CompositeService.java:58) + at org.apache.hadoop.yarn.server.nodemanager.NodeManager.init(NodeManager.java:166) + at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:268) + at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:284) +Caused by: java.io.IOException: mkdir of /mrv2/tmp/nm-local-dir/usercache failed + at org.apache.hadoop.fs.FileSystem.primitiveMkdir(FileSystem.java:907) + at org.apache.hadoop.fs.DelegateToFileSystem.mkdir(DelegateToFileSystem.java:143) + at org.apache.hadoop.fs.FilterFs.mkdir(FilterFs.java:189) + at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:706) + at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:703) + at org.apache.hadoop.fs.FileContext$FSLinkResolver.resolve(FileContext.java:2325) + at org.apache.hadoop.fs.FileContext.mkdir(FileContext.java:703) + at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.init(ResourceLocalizationService.java:188) + ... 6 more +2012-05-14 16:32:13,472 INFO org.apache.hadoop.yarn.service.CompositeService: Error stopping org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler +java.lang.NullPointerException + at org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler.stop(NonAggregatingLogHandler.java:82) + at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:99) + at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:89) + at org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl.stop(ContainerManagerImpl.java:266) + at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:99) + at org.apache.hadoop.yarn.service.CompositeService.stop(CompositeService.java:89) + at org.apache.hadoop.yarn.server.nodemanager.NodeManager.stop(NodeManager.java:182) + at org.apache.hadoop.yarn.service.CompositeService$CompositeServiceShutdownHook.run(CompositeService.java:122) + at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54) +{code}
  • YARN-39. Critical sub-task reported by Vinod Kumar Vavilapalli and fixed by Vinod Kumar Vavilapalli
    RM-NM secret-keys should be randomly generated and rolled every so often
    -
    - RM should generate the master-key randomly - - The master-key should roll every so often +
    - RM should generate the master-key randomly + - The master-key should roll every so often - NM should remember old expired keys so that already doled out container-requests can be satisfied.
  • YARN-37. Minor bug reported by Jason Lowe and fixed by Mayank Bansal (resourcemanager)
    @@ -177,42 +177,42 @@ java.lang.NullPointerException
  • YARN-36. Blocker bug reported by Eli Collins and fixed by Radim Kolar
    branch-2.1.0-alpha doesn't build
    -
    branch-2.1.0-alpha doesn't build due to the following. Per YARN-1 I updated the mvn version to be 2.1.0-SNAPSHOT, before I hit this issue it didn't compile due to the bogus version. - -{noformat} -hadoop-branch-2.1.0-alpha $ mvn compile -[INFO] Scanning for projects... -[ERROR] The build could not read 1 project -> [Help 1] -[ERROR] -[ERROR] The project org.apache.hadoop:hadoop-yarn-project:2.1.0-SNAPSHOT (/home/eli/src/hadoop-branch-2.1.0-alpha/hadoop-yarn-project/pom.xml) has 1 error -[ERROR] 'dependencies.dependency.version' for org.hsqldb:hsqldb:jar is missing. @ line 160, column 17 +
    branch-2.1.0-alpha doesn't build due to the following. Per YARN-1 I updated the mvn version to be 2.1.0-SNAPSHOT, before I hit this issue it didn't compile due to the bogus version. + +{noformat} +hadoop-branch-2.1.0-alpha $ mvn compile +[INFO] Scanning for projects... +[ERROR] The build could not read 1 project -> [Help 1] +[ERROR] +[ERROR] The project org.apache.hadoop:hadoop-yarn-project:2.1.0-SNAPSHOT (/home/eli/src/hadoop-branch-2.1.0-alpha/hadoop-yarn-project/pom.xml) has 1 error +[ERROR] 'dependencies.dependency.version' for org.hsqldb:hsqldb:jar is missing. @ line 160, column 17 {noformat}
  • YARN-31. Major bug reported by Thomas Graves and fixed by Thomas Graves
    TestDelegationTokenRenewer fails on jdk7
    -
    TestDelegationTokenRenewer fails when run with jdk7. - +
    TestDelegationTokenRenewer fails when run with jdk7. + With JDK7, test methods run in an undefined order. Here it is expecting that testDTRenewal runs first but it no longer is.
  • YARN-29. Major bug reported by Vinod Kumar Vavilapalli and fixed by Vinod Kumar Vavilapalli (client)
    Add a yarn-client module
    -
    I see that we are duplicating (some) code for talking to RM via client API. In this light, a yarn-client module will be useful so that clients of all frameworks can use/extend it. - +
    I see that we are duplicating (some) code for talking to RM via client API. In this light, a yarn-client module will be useful so that clients of all frameworks can use/extend it. + And that same module can be the destination for all the YARN's command line tools.
  • YARN-27. Major bug reported by Ramya Sunil and fixed by Arun C Murthy
    Failed refreshQueues due to misconfiguration prevents further refreshing of queues
    -
    Stumbled upon this problem while refreshing queues with incorrect configuration. The exact scenario was: -1. Added a new queue "newQueue" without defining its capacity. -2. "bin/mapred queue -refreshQueues" fails correctly with "Illegal capacity of -1 for queue root.newQueue" -3. However, after defining the capacity of "newQueue" followed by a second "bin/mapred queue -refreshQueues" throws "org.apache.hadoop.metrics2.MetricsException: Metrics source QueueMetrics,q0=root,q1=newQueue already exists!" Also see Hadoop:name=QueueMetrics,q0=root,q1=newQueue,service=ResourceManager metrics being available even though the queue was not added. - +
    Stumbled upon this problem while refreshing queues with incorrect configuration. The exact scenario was: +1. Added a new queue "newQueue" without defining its capacity. +2. "bin/mapred queue -refreshQueues" fails correctly with "Illegal capacity of -1 for queue root.newQueue" +3. However, after defining the capacity of "newQueue" followed by a second "bin/mapred queue -refreshQueues" throws "org.apache.hadoop.metrics2.MetricsException: Metrics source QueueMetrics,q0=root,q1=newQueue already exists!" Also see Hadoop:name=QueueMetrics,q0=root,q1=newQueue,service=ResourceManager metrics being available even though the queue was not added. + The expected behavior would be to refresh the queues correctly and allow addition of "newQueue".
  • YARN-25. Major bug reported by Thomas Graves and fixed by Robert Joseph Evans
    remove old aggregated logs
    -
    Currently the aggregated user logs under NM_REMOTE_APP_LOG_DIR are never removed. We should have mechanism to remove them after certain period. - +
    Currently the aggregated user logs under NM_REMOTE_APP_LOG_DIR are never removed. We should have mechanism to remove them after certain period. + It might make sense for job history server to remove them.
  • YARN-22. Minor bug reported by Eli Collins and fixed by Mayank Bansal
    @@ -221,29 +221,29 @@ It might make sense for job history server to remove them.
  • YARN-15. Critical bug reported by Alejandro Abdelnur and fixed by Arun C Murthy (nodemanager)
    YarnConfiguration DEFAULT_YARN_APPLICATION_CLASSPATH should be updated
    -
    -{code} - /** - * Default CLASSPATH for YARN applications. A comma-separated list of - * CLASSPATH entries - */ - public static final String[] DEFAULT_YARN_APPLICATION_CLASSPATH = { - "$HADOOP_CONF_DIR", "$HADOOP_COMMON_HOME/share/hadoop/common/*", - "$HADOOP_COMMON_HOME/share/hadoop/common/lib/*", - "$HADOOP_HDFS_HOME/share/hadoop/hdfs/*", - "$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*", - "$YARN_HOME/share/hadoop/mapreduce/*", - "$YARN_HOME/share/hadoop/mapreduce/lib/*"}; -{code} - +
    +{code} + /** + * Default CLASSPATH for YARN applications. A comma-separated list of + * CLASSPATH entries + */ + public static final String[] DEFAULT_YARN_APPLICATION_CLASSPATH = { + "$HADOOP_CONF_DIR", "$HADOOP_COMMON_HOME/share/hadoop/common/*", + "$HADOOP_COMMON_HOME/share/hadoop/common/lib/*", + "$HADOOP_HDFS_HOME/share/hadoop/hdfs/*", + "$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*", + "$YARN_HOME/share/hadoop/mapreduce/*", + "$YARN_HOME/share/hadoop/mapreduce/lib/*"}; +{code} + It should have {{share/yarn/}} and MR should add the {{share/mapreduce/}} (another JIRA?)
  • YARN-14. Major bug reported by Jason Lowe and fixed by Jason Lowe (nodemanager)
    Symlinks to peer distributed cache files no longer work
    -
    Trying to create a symlink to another file that is specified for the distributed cache will fail to create the link. For example: - -hadoop jar ... -files "x,y,x#z" - +
    Trying to create a symlink to another file that is specified for the distributed cache will fail to create the link. For example: + +hadoop jar ... -files "x,y,x#z" + will localize the files x and y as x and y, but the z symlink for x will not be created. This is a regression from 1.x behavior.
  • YARN-13. Critical bug reported by Todd Lipcon and fixed by
    @@ -252,13 +252,13 @@ will localize the files x and y as x and y, but the z symlink for x will not be
  • YARN-12. Major bug reported by Junping Du and fixed by Junping Du (scheduler)
    Several Findbugs issues with new FairScheduler in YARN
    -
    The good feature of FairScheduler is added recently to YARN. As recently PreCommit test from MAPREDUCE-4309, there are several bugs found by Findbugs related to FairScheduler: -org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerEventLog.shutdown() might ignore java.lang.Exception -Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerEventLog.logDisabled; locked 50% of time -Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.queueMaxAppsDefault; locked 50% of time -Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.userMaxAppsDefault; locked 50% of time -The details are in:https://builds.apache.org/job/PreCommit-MAPREDUCE-Build/2612//artifact/trunk/patchprocess/newPatchFindbugsWarningshadoop-yarn-server-resourcemanager.html#DE_MIGHT_IGNORE - +
    The good feature of FairScheduler is added recently to YARN. As recently PreCommit test from MAPREDUCE-4309, there are several bugs found by Findbugs related to FairScheduler: +org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerEventLog.shutdown() might ignore java.lang.Exception +Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerEventLog.logDisabled; locked 50% of time +Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.queueMaxAppsDefault; locked 50% of time +Inconsistent synchronization of org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.userMaxAppsDefault; locked 50% of time +The details are in:https://builds.apache.org/job/PreCommit-MAPREDUCE-Build/2612//artifact/trunk/patchprocess/newPatchFindbugsWarningshadoop-yarn-server-resourcemanager.html#DE_MIGHT_IGNORE +
  • YARN-10. Major improvement reported by Arun C Murthy and fixed by Hitesh Shah
    @@ -991,18 +991,18 @@ The details are in:https://builds.apache.org/job/PreCommit-MAPREDUCE-Build/2612/
  • MAPREDUCE-3812. Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Harsh J (mrv2 , performance)
    Lower default allocation sizes, fix allocation configurations and document them
    -
    Removes two sets of previously available config properties: - -1. ( yarn.scheduler.fifo.minimum-allocation-mb and yarn.scheduler.fifo.maximum-allocation-mb ) and, -2. ( yarn.scheduler.capacity.minimum-allocation-mb and yarn.scheduler.capacity.maximum-allocation-mb ) - -In favor of two new, generically named properties: - -1. yarn.scheduler.minimum-allocation-mb - This acts as the floor value of memory resource requests for containers. -2. yarn.scheduler.maximum-allocation-mb - This acts as the ceiling value of memory resource requests for containers. - -Both these properties need to be set at the ResourceManager (RM) to take effect, as the RM is where the scheduler resides. - +
    Removes two sets of previously available config properties: + +1. ( yarn.scheduler.fifo.minimum-allocation-mb and yarn.scheduler.fifo.maximum-allocation-mb ) and, +2. ( yarn.scheduler.capacity.minimum-allocation-mb and yarn.scheduler.capacity.maximum-allocation-mb ) + +In favor of two new, generically named properties: + +1. yarn.scheduler.minimum-allocation-mb - This acts as the floor value of memory resource requests for containers. +2. yarn.scheduler.maximum-allocation-mb - This acts as the ceiling value of memory resource requests for containers. + +Both these properties need to be set at the ResourceManager (RM) to take effect, as the RM is where the scheduler resides. + Also changes the default minimum and maximums to 128 MB and 10 GB respectively.
  • MAPREDUCE-3782. Critical bug reported by Arpit Gupta and fixed by Jason Lowe (mrv2)
    @@ -1043,8 +1043,8 @@ Also changes the default minimum and maximums to 128 MB and 10 GB respectively.<
  • MAPREDUCE-3543. Critical bug reported by Mahadev konar and fixed by Thomas Graves (mrv2)
    Mavenize Gridmix.
    -
    Note that to apply this you should first run the script - ./MAPREDUCE-3543v3.sh svn, then apply the patch. - +
    Note that to apply this you should first run the script - ./MAPREDUCE-3543v3.sh svn, then apply the patch. + If this is merged to more then trunk, the version inside of hadoop-tools/hadoop-gridmix/pom.xml will need to be udpated accordingly.
  • MAPREDUCE-3506. Minor bug reported by Ratandeep Ratti and fixed by Jason Lowe (client , mrv2)
    @@ -1613,10 +1613,10 @@ If this is merged to more then trunk, the version inside of hadoop-tools/hadoop-
  • HDFS-3475. Trivial improvement reported by Harsh J and fixed by Harsh J
    Make the replication and invalidation rates configurable
    -
    This change adds two new configuration parameters. -# {{dfs.namenode.invalidate.work.pct.per.iteration}} for controlling deletion rate of blocks. -# {{dfs.namenode.replication.work.multiplier.per.iteration}} for controlling replication rate. This in turn allows controlling the time it takes for decommissioning. - +
    This change adds two new configuration parameters. +# {{dfs.namenode.invalidate.work.pct.per.iteration}} for controlling deletion rate of blocks. +# {{dfs.namenode.replication.work.multiplier.per.iteration}} for controlling replication rate. This in turn allows controlling the time it takes for decommissioning. + Please see hdfs-default.xml for detailed description.
  • HDFS-3474. Major sub-task reported by Ivan Kelly and fixed by Ivan Kelly
    @@ -4769,8 +4769,8 @@ These release notes include new developer and user-facing incompatibilities, fea
  • MAPREDUCE-3720. Major bug reported by Vinod Kumar Vavilapalli and fixed by Vinod Kumar Vavilapalli (client , mrv2)
    Command line listJobs should not visit each AM
    -
    Changed bin/mapred job -list to not print job-specific information not available at RM. - +
    Changed bin/mapred job -list to not print job-specific information not available at RM. + Very minor incompatibility in cmd-line output, inevitable due to MRv2 architecture.
  • MAPREDUCE-3718. Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Hitesh Shah (mrv2 , performance)
    @@ -4819,8 +4819,8 @@ Very minor incompatibility in cmd-line output, inevitable due to MRv2 architectu
  • MAPREDUCE-3703. Critical bug reported by Eric Payne and fixed by Eric Payne (mrv2 , resourcemanager)
    ResourceManager should provide node lists in JMX output
    -
    New JMX Bean in ResourceManager to provide list of live node managers: - +
    New JMX Bean in ResourceManager to provide list of live node managers: + Hadoop:service=ResourceManager,name=RMNMInfo LiveNodeManagers
  • MAPREDUCE-3702. Critical bug reported by Thomas Graves and fixed by Thomas Graves (mrv2)
    @@ -5037,12 +5037,12 @@ Hadoop:service=ResourceManager,name=RMNMInfo LiveNodeManagers
  • MAPREDUCE-3549. Blocker bug reported by Thomas Graves and fixed by Thomas Graves (mrv2)
    write api documentation for web service apis for RM, NM, mapreduce app master, and job history server
    -
    new files added: A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/WebServicesIntro.apt.vm -A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/NodeManagerRest.apt.vm -A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ResourceManagerRest.apt.vm -A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/MapredAppMasterRest.apt.vm -A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HistoryServerRest.apt.vm - +
    new files added: A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/WebServicesIntro.apt.vm +A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/NodeManagerRest.apt.vm +A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ResourceManagerRest.apt.vm +A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/MapredAppMasterRest.apt.vm +A hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HistoryServerRest.apt.vm + The hadoop-project/src/site/site.xml is split into separate patch.
  • MAPREDUCE-3548. Critical sub-task reported by Thomas Graves and fixed by Thomas Graves (mrv2)
    @@ -5471,7 +5471,7 @@ The hadoop-project/src/site/site.xml is split into separate patch.<
  • MAPREDUCE-3297. Major task reported by Siddharth Seth and fixed by Siddharth Seth (mrv2)
    Move Log Related components from yarn-server-nodemanager to yarn-common
    -
    Moved log related components into yarn-common so that HistoryServer and clients can use them without depending on the yarn-server-nodemanager module. +
    Moved log related components into yarn-common so that HistoryServer and clients can use them without depending on the yarn-server-nodemanager module.
  • MAPREDUCE-3291. Blocker bug reported by Ramya Sunil and fixed by Robert Joseph Evans (mrv2)
    @@ -5504,17 +5504,17 @@ The hadoop-project/src/site/site.xml is split into separate patch.<
  • MAPREDUCE-3219. Minor sub-task reported by Hitesh Shah and fixed by Hitesh Shah (mrv2 , test)
    ant test TestDelegationToken failing on trunk
    -
    Reenabled and fixed bugs in the failing test TestDelegationToken. +
    Reenabled and fixed bugs in the failing test TestDelegationToken.
  • MAPREDUCE-3217. Minor sub-task reported by Hitesh Shah and fixed by Devaraj K (mrv2 , test)
    ant test TestAuditLogger fails on trunk
    -
    Reenabled and fixed bugs in the failing ant test TestAuditLogger. +
    Reenabled and fixed bugs in the failing ant test TestAuditLogger.
  • MAPREDUCE-3215. Minor sub-task reported by Hitesh Shah and fixed by Hitesh Shah (mrv2)
    org.apache.hadoop.mapreduce.TestNoJobSetupCleanup failing on trunk
    -
    Reneabled and fixed bugs in the failing test TestNoJobSetupCleanup. +
    Reneabled and fixed bugs in the failing test TestNoJobSetupCleanup.
  • MAPREDUCE-3194. Major bug reported by Siddharth Seth and fixed by Jason Lowe (mrv2)
    @@ -5875,12 +5875,12 @@ The hadoop-project/src/site/site.xml is split into separate patch.<
  • HDFS-2246. Major improvement reported by Sanjay Radia and fixed by Jitendra Nath Pandey
    Shortcut a local client reads to a Datanodes files directly
    -
    1. New configurations -a. dfs.block.local-path-access.user is the key in datanode configuration to specify the user allowed to do short circuit read. -b. dfs.client.read.shortcircuit is the key to enable short circuit read at the client side configuration. -c. dfs.client.read.shortcircuit.skip.checksum is the key to bypass checksum check at the client side. -2. By default none of the above are enabled and short circuit read will not kick in. -3. If security is on, the feature can be used only for user that has kerberos credentials at the client, therefore map reduce tasks cannot benefit from it in general. +
    1. New configurations +a. dfs.block.local-path-access.user is the key in datanode configuration to specify the user allowed to do short circuit read. +b. dfs.client.read.shortcircuit is the key to enable short circuit read at the client side configuration. +c. dfs.client.read.shortcircuit.skip.checksum is the key to bypass checksum check at the client side. +2. By default none of the above are enabled and short circuit read will not kick in. +3. If security is on, the feature can be used only for user that has kerberos credentials at the client, therefore map reduce tasks cannot benefit from it in general.
  • HDFS-2178. Major improvement reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur
    @@ -6161,7 +6161,7 @@ c. dfs.client.read.shortcircuit.skip.checksum is the key to bypass checksum chec
  • HADOOP-7802. Major bug reported by Bruno Mahé and fixed by Bruno Mahé
    Hadoop scripts unconditionally source "$bin"/../libexec/hadoop-config.sh.
    -
    Here is a patch to enable this behavior +
    Here is a patch to enable this behavior
  • HADOOP-7801. Major bug reported by Bruno Mahé and fixed by Bruno Mahé (build)
    @@ -6486,9 +6486,9 @@ These release notes include new developer and user-facing incompatibilities, fea
  • MAPREDUCE-3186. Blocker bug reported by Ramgopal N and fixed by Eric Payne (mrv2)
    User jobs are getting hanged if the Resource manager process goes down and comes up while job is getting executed.
    -
    New Yarn configuration property: - -Name: yarn.app.mapreduce.am.scheduler.connection.retries +
    New Yarn configuration property: + +Name: yarn.app.mapreduce.am.scheduler.connection.retries Description: Number of times AM should retry to contact RM if connection is lost.
  • MAPREDUCE-3185. Critical bug reported by Mahadev konar and fixed by Jonathan Eagles (mrv2)
    @@ -6641,7 +6641,7 @@ Description: Number of times AM should retry to contact RM if connection is lost
  • MAPREDUCE-3112. Major bug reported by Eric Yang and fixed by Eric Yang (contrib/streaming)
    Calling hadoop cli inside mapreduce job leads to errors
    -
    Removed inheritance of certain server environment variables (HADOOP_OPTS and HADOOP_ROOT_LOGGER) in task attempt process. +
    Removed inheritance of certain server environment variables (HADOOP_OPTS and HADOOP_ROOT_LOGGER) in task attempt process.
  • MAPREDUCE-3110. Major bug reported by Devaraj K and fixed by Vinod Kumar Vavilapalli (mrv2 , test)
    @@ -7114,16 +7114,16 @@ Description: Number of times AM should retry to contact RM if connection is lost
  • MAPREDUCE-2858. Blocker sub-task reported by Luke Lu and fixed by Robert Joseph Evans (applicationmaster , mrv2 , security)
    MRv2 WebApp Security
    -
    A new server has been added to yarn. It is a web proxy that sits in front of the AM web UI. The server is controlled by the yarn.web-proxy.address config. If that config is set, and it points to an address that is different then the RM web interface then a separate proxy server needs to be launched. - -This can be done by running - -yarn-daemon.sh start proxyserver - -If a separate proxy server is needed other configs also may need to be set, if security is enabled. -yarn.web-proxy.principal -yarn.web-proxy.keytab - +
    A new server has been added to yarn. It is a web proxy that sits in front of the AM web UI. The server is controlled by the yarn.web-proxy.address config. If that config is set, and it points to an address that is different then the RM web interface then a separate proxy server needs to be launched. + +This can be done by running + +yarn-daemon.sh start proxyserver + +If a separate proxy server is needed other configs also may need to be set, if security is enabled. +yarn.web-proxy.principal +yarn.web-proxy.keytab + The proxy server is stateless and should be able to support a VIP or other load balancing sitting in front of multiple instances of this server.
  • MAPREDUCE-2854. Major bug reported by Thomas Graves and fixed by Thomas Graves
    @@ -8061,12 +8061,12 @@ mapreduce.reduce.shuffle.catch.exception.message.regex
  • MAPREDUCE-2037. Major new feature reported by Dick King and fixed by Dick King
    Capturing interim progress times, CPU usage, and memory usage, when tasks reach certain progress thresholds
    -
    Capture intermediate task resource consumption information: -* Time taken so far -* CPU load [either at the time the data are taken, or exponentially smoothed] -* Memory load [also either at the time the data are taken, or exponentially smoothed] - -This would be taken at intervals that depend on the task progress plateaus. For example, reducers have three progress ranges - [0-1/3], (1/3-2/3], and (2/3-3/3] - where fundamentally different activities happen. Mappers have different boundaries that are not symmetrically placed [0-9/10], (9/10-1]. Data capture boundaries should coincide with activity boundaries. For the state information capture [CPU and memory] we should average over the covered interval. +
    Capture intermediate task resource consumption information: +* Time taken so far +* CPU load [either at the time the data are taken, or exponentially smoothed] +* Memory load [also either at the time the data are taken, or exponentially smoothed] + +This would be taken at intervals that depend on the task progress plateaus. For example, reducers have three progress ranges - [0-1/3], (1/3-2/3], and (2/3-3/3] - where fundamentally different activities happen. Mappers have different boundaries that are not symmetrically placed [0-9/10], (9/10-1]. Data capture boundaries should coincide with activity boundaries. For the state information capture [CPU and memory] we should average over the covered interval.
  • MAPREDUCE-2033. Major task reported by Vinay Kumar Thota and fixed by Vinay Kumar Thota (contrib/gridmix)
    @@ -8175,24 +8175,24 @@ This would be taken at intervals that depend on the task progress plateaus. For
  • MAPREDUCE-279. Major improvement reported by Arun C Murthy and fixed by (mrv2)
    Map-Reduce 2.0
    -
    MapReduce has undergone a complete re-haul in hadoop-0.23 and we now have, what we call, MapReduce 2.0 (MRv2). - -The fundamental idea of MRv2 is to split up the two major functionalities of the JobTracker, resource management and job scheduling/monitoring, into separate daemons. The idea is to have a global ResourceManager (RM) and per-application ApplicationMaster (AM). An application is either a single job in the classical sense of Map-Reduce jobs or a DAG of jobs. The ResourceManager and per-node slave, the NodeManager (NM), form the data-computation framework. The ResourceManager is the ultimate authority that arbitrates resources among all the applications in the system. The per-application ApplicationMaster is, in effect, a framework specific library and is tasked with negotiating resources from the ResourceManager and working with the NodeManager(s) to execute and monitor the tasks. - -The ResourceManager has two main components: -* Scheduler (S) -* ApplicationsManager (ASM) - -The Scheduler is responsible for allocating resources to the various running applications subject to familiar constraints of capacities, queues etc. The Scheduler is pure scheduler in the sense that it performs no monitoring or tracking of status for the application. Also, it offers no guarantees on restarting failed tasks either due to application failure or hardware failures. The Scheduler performs its scheduling function based the resource requirements of the applications; it does so based on the abstract notion of a Resource Container which incorporates elements such as memory, cpu, disk, network etc. - -The Scheduler has a pluggable policy plug-in, which is responsible for partitioning the cluster resources among the various queues, applications etc. The current Map-Reduce schedulers such as the CapacityScheduler and the FairScheduler would be some examples of the plug-in. - -The CapacityScheduler supports hierarchical queues to allow for more predictable sharing of cluster resources. -The ApplicationsManager is responsible for accepting job-submissions, negotiating the first container for executing the application specific ApplicationMaster and provides the service for restarting the ApplicationMaster container on failure. - -The NodeManager is the per-machine framework agent who is responsible for launching the applications' containers, monitoring their resource usage (cpu, memory, disk, network) and reporting the same to the Scheduler. - -The per-application ApplicationMaster has the responsibility of negotiating appropriate resource containers from the Scheduler, tracking their status and monitoring for progress. +
    MapReduce has undergone a complete re-haul in hadoop-0.23 and we now have, what we call, MapReduce 2.0 (MRv2). + +The fundamental idea of MRv2 is to split up the two major functionalities of the JobTracker, resource management and job scheduling/monitoring, into separate daemons. The idea is to have a global ResourceManager (RM) and per-application ApplicationMaster (AM). An application is either a single job in the classical sense of Map-Reduce jobs or a DAG of jobs. The ResourceManager and per-node slave, the NodeManager (NM), form the data-computation framework. The ResourceManager is the ultimate authority that arbitrates resources among all the applications in the system. The per-application ApplicationMaster is, in effect, a framework specific library and is tasked with negotiating resources from the ResourceManager and working with the NodeManager(s) to execute and monitor the tasks. + +The ResourceManager has two main components: +* Scheduler (S) +* ApplicationsManager (ASM) + +The Scheduler is responsible for allocating resources to the various running applications subject to familiar constraints of capacities, queues etc. The Scheduler is pure scheduler in the sense that it performs no monitoring or tracking of status for the application. Also, it offers no guarantees on restarting failed tasks either due to application failure or hardware failures. The Scheduler performs its scheduling function based the resource requirements of the applications; it does so based on the abstract notion of a Resource Container which incorporates elements such as memory, cpu, disk, network etc. + +The Scheduler has a pluggable policy plug-in, which is responsible for partitioning the cluster resources among the various queues, applications etc. The current Map-Reduce schedulers such as the CapacityScheduler and the FairScheduler would be some examples of the plug-in. + +The CapacityScheduler supports hierarchical queues to allow for more predictable sharing of cluster resources. +The ApplicationsManager is responsible for accepting job-submissions, negotiating the first container for executing the application specific ApplicationMaster and provides the service for restarting the ApplicationMaster container on failure. + +The NodeManager is the per-machine framework agent who is responsible for launching the applications' containers, monitoring their resource usage (cpu, memory, disk, network) and reporting the same to the Scheduler. + +The per-application ApplicationMaster has the responsibility of negotiating appropriate resource containers from the Scheduler, tracking their status and monitoring for progress.
  • HDFS-2540. Major sub-task reported by Tsz Wo (Nicholas), SZE and fixed by Tsz Wo (Nicholas), SZE
    @@ -8253,10 +8253,10 @@ The per-application ApplicationMaster has the responsibility of negotiating appr
  • HDFS-2465. Major improvement reported by Todd Lipcon and fixed by Todd Lipcon (data-node , performance)
    Add HDFS support for fadvise readahead and drop-behind
    -
    HDFS now has the ability to use posix_fadvise and sync_data_range syscalls to manage the OS buffer cache. This support is currently considered experimental, and may be enabled by configuring the following keys: -dfs.datanode.drop.cache.behind.writes - set to true to drop data out of the buffer cache after writing -dfs.datanode.drop.cache.behind.reads - set to true to drop data out of the buffer cache when performing sequential reads -dfs.datanode.sync.behind.writes - set to true to trigger dirty page writeback immediately after writing data +
    HDFS now has the ability to use posix_fadvise and sync_data_range syscalls to manage the OS buffer cache. This support is currently considered experimental, and may be enabled by configuring the following keys: +dfs.datanode.drop.cache.behind.writes - set to true to drop data out of the buffer cache after writing +dfs.datanode.drop.cache.behind.reads - set to true to drop data out of the buffer cache when performing sequential reads +dfs.datanode.sync.behind.writes - set to true to trigger dirty page writeback immediately after writing data dfs.datanode.readahead.bytes - set to a non-zero value to trigger readahead for sequential reads
  • HDFS-2453. Major sub-task reported by Arpit Gupta and fixed by Tsz Wo (Nicholas), SZE (webhdfs)
    @@ -9331,7 +9331,7 @@ This is an incompatible change in 0.23. The versions of ClientProtocol and Data
  • HDFS-1594. Major bug reported by Devaraj K and fixed by Aaron T. Myers (name-node)
    When the disk becomes full Namenode is getting shutdown and not able to recover
    -
    Implemented a daemon thread to monitor the disk usage for periodically and if the disk usage reaches the threshold value, put the name node into Safe mode so that no modification to file system will occur. Once the disk usage reaches below the threshold, name node will be put out of the safe mode. Here threshold value and interval to check the disk usage are configurable. +
    Implemented a daemon thread to monitor the disk usage for periodically and if the disk usage reaches the threshold value, put the name node into Safe mode so that no modification to file system will occur. Once the disk usage reaches below the threshold, name node will be put out of the safe mode. Here threshold value and interval to check the disk usage are configurable.
  • HDFS-1592. Major bug reported by Bharath Mundlapudi and fixed by Bharath Mundlapudi
    @@ -9376,9 +9376,9 @@ This is an incompatible change in 0.23. The versions of ClientProtocol and Data
  • HDFS-1547. Major improvement reported by Suresh Srinivas and fixed by Suresh Srinivas (name-node)
    Improve decommission mechanism
    -
    Summary of changes to the decommissioning process: -# After nodes are decommissioned, they are not shutdown. The decommissioned nodes are not used for writes. For reads, the decommissioned nodes are given as the last location to read from. -# Number of live and dead decommissioned nodes are displayed in the namenode webUI. +
    Summary of changes to the decommissioning process: +# After nodes are decommissioned, they are not shutdown. The decommissioned nodes are not used for writes. For reads, the decommissioned nodes are given as the last location to read from. +# Number of live and dead decommissioned nodes are displayed in the namenode webUI. # Decommissioned nodes free capacity is not count towards the the cluster free capacity.
  • HDFS-1541. Major sub-task reported by Hairong Kuang and fixed by Hairong Kuang (name-node)
    @@ -9491,10 +9491,10 @@ This is an incompatible change in 0.23. The versions of ClientProtocol and Data
  • HDFS-1448. Major new feature reported by Erik Steffl and fixed by Erik Steffl (tools)
    Create multi-format parser for edits logs file, support binary and XML formats initially
    -
    Offline edits viewer feature adds oev tool to hdfs script. Oev makes it possible to convert edits logs to/from native binary and XML formats. It uses the same framework as Offline image viewer. - -Example usage: - +
    Offline edits viewer feature adds oev tool to hdfs script. Oev makes it possible to convert edits logs to/from native binary and XML formats. It uses the same framework as Offline image viewer. + +Example usage: + $HADOOP_HOME/bin/hdfs oev -i edits -o output.xml
  • HDFS-1445. Major sub-task reported by Matt Foley and fixed by Matt Foley (data-node)
    @@ -9762,7 +9762,7 @@ This change requires an upgrade at deployment.
  • HADOOP-7681. Minor bug reported by Arpit Gupta and fixed by Arpit Gupta (conf)
    log4j.properties is missing properties for security audit and hdfs audit should be changed to info
    -
    HADOOP-7681. Fixed security and hdfs audit log4j properties +
    HADOOP-7681. Fixed security and hdfs audit log4j properties (Arpit Gupta via Eric Yang)
  • HADOOP-7671. Major bug reported by Ravi Prakash and fixed by Ravi Prakash
    @@ -10363,8 +10363,8 @@ This change requires an upgrade at deployment.
  • HADOOP-7227. Major improvement reported by Jitendra Nath Pandey and fixed by Jitendra Nath Pandey (ipc)
    Remove protocol version check at proxy creation in Hadoop RPC.
    -
    1. Protocol version check is removed from proxy creation, instead version check is performed at server in every rpc call. -2. This change is backward incompatible because format of the rpc messages is changed to include client version, client method hash and rpc version. +
    1. Protocol version check is removed from proxy creation, instead version check is performed at server in every rpc call. +2. This change is backward incompatible because format of the rpc messages is changed to include client version, client method hash and rpc version. 3. rpc version is introduced which should change when the format of rpc messages is changed.
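    As a toy illustration of the idea in point 1 -- checking the protocol version on every call rather than once at proxy creation -- here is a self-contained sketch; it has nothing to do with Hadoop's actual RPC wire format, and all names below are invented:

/** Toy sketch of per-call protocol version checking; not Hadoop's RPC implementation. */
public class VersionCheckedServer {

  private static final long SERVER_PROTOCOL_VERSION = 2L;

  /** Hypothetical per-call header carrying the client's protocol version and the method being invoked. */
  static class CallHeader {
    final long clientProtocolVersion;
    final String methodName;
    CallHeader(long clientProtocolVersion, String methodName) {
      this.clientProtocolVersion = clientProtocolVersion;
      this.methodName = methodName;
    }
  }

  /** Each incoming call is checked; a mismatch fails only that call, not proxy creation. */
  String handle(CallHeader header, String argument) {
    if (header.clientProtocolVersion != SERVER_PROTOCOL_VERSION) {
      throw new IllegalStateException("Protocol version mismatch: client="
          + header.clientProtocolVersion + ", server=" + SERVER_PROTOCOL_VERSION);
    }
    return header.methodName + "(" + argument + ") handled";
  }

  public static void main(String[] args) {
    VersionCheckedServer server = new VersionCheckedServer();
    System.out.println(server.handle(new CallHeader(2L, "someMethod"), "x"));  // accepted
    try {
      server.handle(new CallHeader(1L, "someMethod"), "x");                    // stale client
    } catch (IllegalStateException expected) {
      System.out.println(expected.getMessage());
    }
  }
}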
  • HADOOP-7223. Major bug reported by Suresh Srinivas and fixed by Suresh Srinivas (fs)
    diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/ContextFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/ContextFactory.java index 25efef9b659..034ea3589a5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/ContextFactory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/ContextFactory.java @@ -1,211 +1,211 @@ -/* - * ContextFactory.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Properties; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.metrics.spi.NullContext; - -/** - * Factory class for creating MetricsContext objects. To obtain an instance - * of this class, use the static getFactory() method. - */ -@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) -@InterfaceStability.Evolving -public class ContextFactory { - - private static final String PROPERTIES_FILE = - "/hadoop-metrics.properties"; - private static final String CONTEXT_CLASS_SUFFIX = - ".class"; - private static final String DEFAULT_CONTEXT_CLASSNAME = - "org.apache.hadoop.metrics.spi.NullContext"; - - private static ContextFactory theFactory = null; - - private Map attributeMap = new HashMap(); - private Map contextMap = - new HashMap(); - - // Used only when contexts, or the ContextFactory itself, cannot be - // created. - private static Map nullContextMap = - new HashMap(); - - /** Creates a new instance of ContextFactory */ - protected ContextFactory() { - } - - /** - * Returns the value of the named attribute, or null if there is no - * attribute of that name. - * - * @param attributeName the attribute name - * @return the attribute value - */ - public Object getAttribute(String attributeName) { - return attributeMap.get(attributeName); - } - - /** - * Returns the names of all the factory's attributes. - * - * @return the attribute names - */ - public String[] getAttributeNames() { - String[] result = new String[attributeMap.size()]; - int i = 0; - // for (String attributeName : attributeMap.keySet()) { - Iterator it = attributeMap.keySet().iterator(); - while (it.hasNext()) { - result[i++] = (String) it.next(); - } - return result; - } - - /** - * Sets the named factory attribute to the specified value, creating it - * if it did not already exist. If the value is null, this is the same as - * calling removeAttribute. 
- * - * @param attributeName the attribute name - * @param value the new attribute value - */ - public void setAttribute(String attributeName, Object value) { - attributeMap.put(attributeName, value); - } - - /** - * Removes the named attribute if it exists. - * - * @param attributeName the attribute name - */ - public void removeAttribute(String attributeName) { - attributeMap.remove(attributeName); - } - - /** - * Returns the named MetricsContext instance, constructing it if necessary - * using the factory's current configuration attributes.

    - * - * When constructing the instance, if the factory property - * contextName.class exists, - * its value is taken to be the name of the class to instantiate. Otherwise, - * the default is to create an instance of - * org.apache.hadoop.metrics.spi.NullContext, which is a - * dummy "no-op" context which will cause all metric data to be discarded. - * - * @param contextName the name of the context - * @return the named MetricsContext - */ - public synchronized MetricsContext getContext(String refName, String contextName) - throws IOException, ClassNotFoundException, - InstantiationException, IllegalAccessException { - MetricsContext metricsContext = contextMap.get(refName); - if (metricsContext == null) { - String classNameAttribute = refName + CONTEXT_CLASS_SUFFIX; - String className = (String) getAttribute(classNameAttribute); - if (className == null) { - className = DEFAULT_CONTEXT_CLASSNAME; - } - Class contextClass = Class.forName(className); - metricsContext = (MetricsContext) contextClass.newInstance(); - metricsContext.init(contextName, this); - contextMap.put(contextName, metricsContext); - } - return metricsContext; - } - - public synchronized MetricsContext getContext(String contextName) - throws IOException, ClassNotFoundException, InstantiationException, - IllegalAccessException { - return getContext(contextName, contextName); - } - - /** - * Returns all MetricsContexts built by this factory. - */ - public synchronized Collection getAllContexts() { - // Make a copy to avoid race conditions with creating new contexts. - return new ArrayList(contextMap.values()); - } - - /** - * Returns a "null" context - one which does nothing. - */ - public static synchronized MetricsContext getNullContext(String contextName) { - MetricsContext nullContext = nullContextMap.get(contextName); - if (nullContext == null) { - nullContext = new NullContext(); - nullContextMap.put(contextName, nullContext); - } - return nullContext; - } - - /** - * Returns the singleton ContextFactory instance, constructing it if - * necessary.

    - * - * When the instance is constructed, this method checks if the file - * hadoop-metrics.properties exists on the class path. If it - * exists, it must be in the format defined by java.util.Properties, and all - * the properties in the file are set as attributes on the newly created - * ContextFactory instance. - * - * @return the singleton ContextFactory instance - */ - public static synchronized ContextFactory getFactory() throws IOException { - if (theFactory == null) { - theFactory = new ContextFactory(); - theFactory.setAttributes(); - } - return theFactory; - } - - private void setAttributes() throws IOException { - InputStream is = getClass().getResourceAsStream(PROPERTIES_FILE); - if (is != null) { - try { - Properties properties = new Properties(); - properties.load(is); - //for (Object propertyNameObj : properties.keySet()) { - Iterator it = properties.keySet().iterator(); - while (it.hasNext()) { - String propertyName = (String) it.next(); - String propertyValue = properties.getProperty(propertyName); - setAttribute(propertyName, propertyValue); - } - } finally { - is.close(); - } - } - } - -} +/* + * ContextFactory.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.metrics.spi.NullContext; + +/** + * Factory class for creating MetricsContext objects. To obtain an instance + * of this class, use the static getFactory() method. + */ +@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) +@InterfaceStability.Evolving +public class ContextFactory { + + private static final String PROPERTIES_FILE = + "/hadoop-metrics.properties"; + private static final String CONTEXT_CLASS_SUFFIX = + ".class"; + private static final String DEFAULT_CONTEXT_CLASSNAME = + "org.apache.hadoop.metrics.spi.NullContext"; + + private static ContextFactory theFactory = null; + + private Map attributeMap = new HashMap(); + private Map contextMap = + new HashMap(); + + // Used only when contexts, or the ContextFactory itself, cannot be + // created. + private static Map nullContextMap = + new HashMap(); + + /** Creates a new instance of ContextFactory */ + protected ContextFactory() { + } + + /** + * Returns the value of the named attribute, or null if there is no + * attribute of that name. 
+ * + * @param attributeName the attribute name + * @return the attribute value + */ + public Object getAttribute(String attributeName) { + return attributeMap.get(attributeName); + } + + /** + * Returns the names of all the factory's attributes. + * + * @return the attribute names + */ + public String[] getAttributeNames() { + String[] result = new String[attributeMap.size()]; + int i = 0; + // for (String attributeName : attributeMap.keySet()) { + Iterator it = attributeMap.keySet().iterator(); + while (it.hasNext()) { + result[i++] = (String) it.next(); + } + return result; + } + + /** + * Sets the named factory attribute to the specified value, creating it + * if it did not already exist. If the value is null, this is the same as + * calling removeAttribute. + * + * @param attributeName the attribute name + * @param value the new attribute value + */ + public void setAttribute(String attributeName, Object value) { + attributeMap.put(attributeName, value); + } + + /** + * Removes the named attribute if it exists. + * + * @param attributeName the attribute name + */ + public void removeAttribute(String attributeName) { + attributeMap.remove(attributeName); + } + + /** + * Returns the named MetricsContext instance, constructing it if necessary + * using the factory's current configuration attributes.

    + * + * When constructing the instance, if the factory property + * contextName.class exists, + * its value is taken to be the name of the class to instantiate. Otherwise, + * the default is to create an instance of + * org.apache.hadoop.metrics.spi.NullContext, which is a + * dummy "no-op" context which will cause all metric data to be discarded. + * + * @param contextName the name of the context + * @return the named MetricsContext + */ + public synchronized MetricsContext getContext(String refName, String contextName) + throws IOException, ClassNotFoundException, + InstantiationException, IllegalAccessException { + MetricsContext metricsContext = contextMap.get(refName); + if (metricsContext == null) { + String classNameAttribute = refName + CONTEXT_CLASS_SUFFIX; + String className = (String) getAttribute(classNameAttribute); + if (className == null) { + className = DEFAULT_CONTEXT_CLASSNAME; + } + Class contextClass = Class.forName(className); + metricsContext = (MetricsContext) contextClass.newInstance(); + metricsContext.init(contextName, this); + contextMap.put(contextName, metricsContext); + } + return metricsContext; + } + + public synchronized MetricsContext getContext(String contextName) + throws IOException, ClassNotFoundException, InstantiationException, + IllegalAccessException { + return getContext(contextName, contextName); + } + + /** + * Returns all MetricsContexts built by this factory. + */ + public synchronized Collection getAllContexts() { + // Make a copy to avoid race conditions with creating new contexts. + return new ArrayList(contextMap.values()); + } + + /** + * Returns a "null" context - one which does nothing. + */ + public static synchronized MetricsContext getNullContext(String contextName) { + MetricsContext nullContext = nullContextMap.get(contextName); + if (nullContext == null) { + nullContext = new NullContext(); + nullContextMap.put(contextName, nullContext); + } + return nullContext; + } + + /** + * Returns the singleton ContextFactory instance, constructing it if + * necessary.

    + * + * When the instance is constructed, this method checks if the file + * hadoop-metrics.properties exists on the class path. If it + * exists, it must be in the format defined by java.util.Properties, and all + * the properties in the file are set as attributes on the newly created + * ContextFactory instance. + * + * @return the singleton ContextFactory instance + */ + public static synchronized ContextFactory getFactory() throws IOException { + if (theFactory == null) { + theFactory = new ContextFactory(); + theFactory.setAttributes(); + } + return theFactory; + } + + private void setAttributes() throws IOException { + InputStream is = getClass().getResourceAsStream(PROPERTIES_FILE); + if (is != null) { + try { + Properties properties = new Properties(); + properties.load(is); + //for (Object propertyNameObj : properties.keySet()) { + Iterator it = properties.keySet().iterator(); + while (it.hasNext()) { + String propertyName = (String) it.next(); + String propertyValue = properties.getProperty(propertyName); + setAttribute(propertyName, propertyValue); + } + } finally { + is.close(); + } + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsContext.java index b84d5265036..e297e3738b1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsContext.java @@ -1,122 +1,122 @@ -/* - * MetricsContext.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics; - -import java.io.IOException; -import java.util.Collection; -import java.util.Map; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.metrics.spi.OutputRecord; - -/** - * The main interface to the metrics package. - */ -@InterfaceAudience.Private -@InterfaceStability.Evolving -public interface MetricsContext { - - /** - * Default period in seconds at which data is sent to the metrics system. - */ - public static final int DEFAULT_PERIOD = 5; - - /** - * Initialize this context. - * @param contextName The given name for this context - * @param factory The creator of this context - */ - public void init(String contextName, ContextFactory factory); - - /** - * Returns the context name. - * - * @return the context name - */ - public abstract String getContextName(); - - /** - * Starts or restarts monitoring, the emitting of metrics records as they are - * updated. - */ - public abstract void startMonitoring() - throws IOException; - - /** - * Stops monitoring. 
This does not free any data that the implementation - * may have buffered for sending at the next timer event. It - * is OK to call startMonitoring() again after calling - * this. - * @see #close() - */ - public abstract void stopMonitoring(); - - /** - * Returns true if monitoring is currently in progress. - */ - public abstract boolean isMonitoring(); - - /** - * Stops monitoring and also frees any buffered data, returning this - * object to its initial state. - */ - public abstract void close(); - - /** - * Creates a new MetricsRecord instance with the given recordName. - * Throws an exception if the metrics implementation is configured with a fixed - * set of record names and recordName is not in that set. - * - * @param recordName the name of the record - * @throws MetricsException if recordName conflicts with configuration data - */ - public abstract MetricsRecord createRecord(String recordName); - - /** - * Registers a callback to be called at regular time intervals, as - * determined by the implementation-class specific configuration. - * - * @param updater object to be run periodically; it should updated - * some metrics records and then return - */ - public abstract void registerUpdater(Updater updater); - - /** - * Removes a callback, if it exists. - * - * @param updater object to be removed from the callback list - */ - public abstract void unregisterUpdater(Updater updater); - - /** - * Returns the timer period. - */ - public abstract int getPeriod(); - - /** - * Retrieves all the records managed by this MetricsContext. - * Useful for monitoring systems that are polling-based. - * - * @return A non-null map from all record names to the records managed. - */ - Map> getAllRecords(); -} +/* + * MetricsContext.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.metrics.spi.OutputRecord; + +/** + * The main interface to the metrics package. + */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public interface MetricsContext { + + /** + * Default period in seconds at which data is sent to the metrics system. + */ + public static final int DEFAULT_PERIOD = 5; + + /** + * Initialize this context. + * @param contextName The given name for this context + * @param factory The creator of this context + */ + public void init(String contextName, ContextFactory factory); + + /** + * Returns the context name. 
+ * + * @return the context name + */ + public abstract String getContextName(); + + /** + * Starts or restarts monitoring, the emitting of metrics records as they are + * updated. + */ + public abstract void startMonitoring() + throws IOException; + + /** + * Stops monitoring. This does not free any data that the implementation + * may have buffered for sending at the next timer event. It + * is OK to call startMonitoring() again after calling + * this. + * @see #close() + */ + public abstract void stopMonitoring(); + + /** + * Returns true if monitoring is currently in progress. + */ + public abstract boolean isMonitoring(); + + /** + * Stops monitoring and also frees any buffered data, returning this + * object to its initial state. + */ + public abstract void close(); + + /** + * Creates a new MetricsRecord instance with the given recordName. + * Throws an exception if the metrics implementation is configured with a fixed + * set of record names and recordName is not in that set. + * + * @param recordName the name of the record + * @throws MetricsException if recordName conflicts with configuration data + */ + public abstract MetricsRecord createRecord(String recordName); + + /** + * Registers a callback to be called at regular time intervals, as + * determined by the implementation-class specific configuration. + * + * @param updater object to be run periodically; it should updated + * some metrics records and then return + */ + public abstract void registerUpdater(Updater updater); + + /** + * Removes a callback, if it exists. + * + * @param updater object to be removed from the callback list + */ + public abstract void unregisterUpdater(Updater updater); + + /** + * Returns the timer period. + */ + public abstract int getPeriod(); + + /** + * Retrieves all the records managed by this MetricsContext. + * Useful for monitoring systems that are polling-based. + * + * @return A non-null map from all record names to the records managed. + */ + Map> getAllRecords(); +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsException.java index 7a19d1bec73..de7139549f1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsException.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsException.java @@ -1,47 +1,47 @@ -/* - * MetricsException.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; - -/** - * General-purpose, unchecked metrics exception. 
- */ -@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) -@InterfaceStability.Evolving -public class MetricsException extends RuntimeException { - - private static final long serialVersionUID = -1643257498540498497L; - - /** Creates a new instance of MetricsException */ - public MetricsException() { - } - - /** Creates a new instance of MetricsException - * - * @param message an error message - */ - public MetricsException(String message) { - super(message); - } - -} +/* + * MetricsException.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * General-purpose, unchecked metrics exception. + */ +@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) +@InterfaceStability.Evolving +public class MetricsException extends RuntimeException { + + private static final long serialVersionUID = -1643257498540498497L; + + /** Creates a new instance of MetricsException */ + public MetricsException() { + } + + /** Creates a new instance of MetricsException + * + * @param message an error message + */ + public MetricsException(String message) { + super(message); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsRecord.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsRecord.java index cbe3ea48c20..45701c570f9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsRecord.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/MetricsRecord.java @@ -1,251 +1,251 @@ -/* - * MetricsRecord.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.metrics; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; - -/** - * A named and optionally tagged set of records to be sent to the metrics - * system.

    - * - * A record name identifies the kind of data to be reported. For example, a - * program reporting statistics relating to the disks on a computer might use - * a record name "diskStats".

    - * - * A record has zero or more tags. A tag has a name and a value. To - * continue the example, the "diskStats" record might use a tag named - * "diskName" to identify a particular disk. Sometimes it is useful to have - * more than one tag, so there might also be a "diskType" with value "ide" or - * "scsi" or whatever.

    - * - * A record also has zero or more metrics. These are the named - * values that are to be reported to the metrics system. In the "diskStats" - * example, possible metric names would be "diskPercentFull", "diskPercentBusy", - * "kbReadPerSecond", etc.

    - * - * The general procedure for using a MetricsRecord is to fill in its tag and - * metric values, and then call update() to pass the record to the - * client library. - * Metric data is not immediately sent to the metrics system - * each time that update() is called. - * An internal table is maintained, identified by the record name. This - * table has columns - * corresponding to the tag and the metric names, and rows - * corresponding to each unique set of tag values. An update - * either modifies an existing row in the table, or adds a new row with a set of - * tag values that are different from all the other rows. Note that if there - * are no tags, then there can be at most one row in the table.

    - * - * Once a row is added to the table, its data will be sent to the metrics system - * on every timer period, whether or not it has been updated since the previous - * timer period. If this is inappropriate, for example if metrics were being - * reported by some transient object in an application, the remove() - * method can be used to remove the row and thus stop the data from being - * sent.

    - * - * Note that the update() method is atomic. This means that it is - * safe for different threads to be updating the same metric. More precisely, - * it is OK for different threads to call update() on MetricsRecord instances - * with the same set of tag names and tag values. Different threads should - * not use the same MetricsRecord instance at the same time. - */ -@InterfaceAudience.Private -@InterfaceStability.Evolving -public interface MetricsRecord { - - /** - * Returns the record name. - * - * @return the record name - */ - public abstract String getRecordName(); - - /** - * Sets the named tag to the specified value. The tagValue may be null, - * which is treated the same as an empty String. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public abstract void setTag(String tagName, String tagValue); - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public abstract void setTag(String tagName, int tagValue); - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public abstract void setTag(String tagName, long tagValue); - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public abstract void setTag(String tagName, short tagValue); - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public abstract void setTag(String tagName, byte tagValue); - - /** - * Removes any tag of the specified name. - * - * @param tagName name of a tag - */ - public abstract void removeTag(String tagName); - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void setMetric(String metricName, int metricValue); - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void setMetric(String metricName, long metricValue); - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void setMetric(String metricName, short metricValue); - - /** - * Sets the named metric to the specified value. 
- * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void setMetric(String metricName, byte metricValue); - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void setMetric(String metricName, float metricValue); - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void incrMetric(String metricName, int metricValue); - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void incrMetric(String metricName, long metricValue); - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void incrMetric(String metricName, short metricValue); - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void incrMetric(String metricName, byte metricValue); - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public abstract void incrMetric(String metricName, float metricValue); - - /** - * Updates the table of buffered data which is to be sent periodically. - * If the tag values match an existing row, that row is updated; - * otherwise, a new row is added. - */ - public abstract void update(); - - /** - * Removes, from the buffered data table, all rows having tags - * that equal the tags that have been set on this record. For example, - * if there are no tags on this record, all rows for this record name - * would be removed. Or, if there is a single tag on this record, then - * just rows containing a tag with the same name and value would be removed. - */ - public abstract void remove(); - -} +/* + * MetricsRecord.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * A named and optionally tagged set of records to be sent to the metrics + * system.

    + * + * A record name identifies the kind of data to be reported. For example, a + * program reporting statistics relating to the disks on a computer might use + * a record name "diskStats".

    + * + * A record has zero or more tags. A tag has a name and a value. To + * continue the example, the "diskStats" record might use a tag named + * "diskName" to identify a particular disk. Sometimes it is useful to have + * more than one tag, so there might also be a "diskType" with value "ide" or + * "scsi" or whatever.

    + * + * A record also has zero or more metrics. These are the named + * values that are to be reported to the metrics system. In the "diskStats" + * example, possible metric names would be "diskPercentFull", "diskPercentBusy", + * "kbReadPerSecond", etc.

    + * + * The general procedure for using a MetricsRecord is to fill in its tag and + * metric values, and then call update() to pass the record to the + * client library. + * Metric data is not immediately sent to the metrics system + * each time that update() is called. + * An internal table is maintained, identified by the record name. This + * table has columns + * corresponding to the tag and the metric names, and rows + * corresponding to each unique set of tag values. An update + * either modifies an existing row in the table, or adds a new row with a set of + * tag values that are different from all the other rows. Note that if there + * are no tags, then there can be at most one row in the table.

    + * + * Once a row is added to the table, its data will be sent to the metrics system + * on every timer period, whether or not it has been updated since the previous + * timer period. If this is inappropriate, for example if metrics were being + * reported by some transient object in an application, the remove() + * method can be used to remove the row and thus stop the data from being + * sent.

    + * + * Note that the update() method is atomic. This means that it is + * safe for different threads to be updating the same metric. More precisely, + * it is OK for different threads to call update() on MetricsRecord instances + * with the same set of tag names and tag values. Different threads should + * not use the same MetricsRecord instance at the same time. + */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public interface MetricsRecord { + + /** + * Returns the record name. + * + * @return the record name + */ + public abstract String getRecordName(); + + /** + * Sets the named tag to the specified value. The tagValue may be null, + * which is treated the same as an empty String. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public abstract void setTag(String tagName, String tagValue); + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public abstract void setTag(String tagName, int tagValue); + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public abstract void setTag(String tagName, long tagValue); + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public abstract void setTag(String tagName, short tagValue); + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public abstract void setTag(String tagName, byte tagValue); + + /** + * Removes any tag of the specified name. + * + * @param tagName name of a tag + */ + public abstract void removeTag(String tagName); + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void setMetric(String metricName, int metricValue); + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void setMetric(String metricName, long metricValue); + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void setMetric(String metricName, short metricValue); + + /** + * Sets the named metric to the specified value. 
+ * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void setMetric(String metricName, byte metricValue); + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void setMetric(String metricName, float metricValue); + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void incrMetric(String metricName, int metricValue); + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void incrMetric(String metricName, long metricValue); + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void incrMetric(String metricName, short metricValue); + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void incrMetric(String metricName, byte metricValue); + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public abstract void incrMetric(String metricName, float metricValue); + + /** + * Updates the table of buffered data which is to be sent periodically. + * If the tag values match an existing row, that row is updated; + * otherwise, a new row is added. + */ + public abstract void update(); + + /** + * Removes, from the buffered data table, all rows having tags + * that equal the tags that have been set on this record. For example, + * if there are no tags on this record, all rows for this record name + * would be removed. Or, if there is a single tag on this record, then + * just rows containing a tag with the same name and value would be removed. 
+ */ + public abstract void remove(); + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/file/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/file/FileContext.java index f0aafa1c3a3..591b32c4914 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/file/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/file/FileContext.java @@ -1,154 +1,154 @@ -/* - * FileContext.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics.file; - -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.metrics.ContextFactory; -import org.apache.hadoop.metrics.spi.AbstractMetricsContext; -import org.apache.hadoop.metrics.spi.OutputRecord; - -/** - * Metrics context for writing metrics to a file.

    - * - * This class is configured by setting ContextFactory attributes which in turn - * are usually configured through a properties file. All the attributes are - * prefixed by the contextName. For example, the properties file might contain: - *

    - * myContextName.fileName=/tmp/metrics.log
    - * myContextName.period=5
    - * 
    - * @see org.apache.hadoop.metrics2.sink.FileSink for metrics 2.0. - */ -@InterfaceAudience.Public -@InterfaceStability.Evolving -@Deprecated -public class FileContext extends AbstractMetricsContext { - - /* Configuration attribute names */ - @InterfaceAudience.Private - protected static final String FILE_NAME_PROPERTY = "fileName"; - @InterfaceAudience.Private - protected static final String PERIOD_PROPERTY = "period"; - - private File file = null; // file for metrics to be written to - private PrintWriter writer = null; - - /** Creates a new instance of FileContext */ - @InterfaceAudience.Private - public FileContext() {} - - @InterfaceAudience.Private - public void init(String contextName, ContextFactory factory) { - super.init(contextName, factory); - - String fileName = getAttribute(FILE_NAME_PROPERTY); - if (fileName != null) { - file = new File(fileName); - } - - parseAndSetPeriod(PERIOD_PROPERTY); - } - - /** - * Returns the configured file name, or null. - */ - @InterfaceAudience.Private - public String getFileName() { - if (file == null) { - return null; - } else { - return file.getName(); - } - } - - /** - * Starts or restarts monitoring, by opening in append-mode, the - * file specified by the fileName attribute, - * if specified. Otherwise the data will be written to standard - * output. - */ - @InterfaceAudience.Private - public void startMonitoring() - throws IOException - { - if (file == null) { - writer = new PrintWriter(new BufferedOutputStream(System.out)); - } else { - writer = new PrintWriter(new FileWriter(file, true)); - } - super.startMonitoring(); - } - - /** - * Stops monitoring, closing the file. - * @see #close() - */ - @InterfaceAudience.Private - public void stopMonitoring() { - super.stopMonitoring(); - - if (writer != null) { - writer.close(); - writer = null; - } - } - - /** - * Emits a metrics record to a file. - */ - @InterfaceAudience.Private - public void emitRecord(String contextName, String recordName, OutputRecord outRec) { - writer.print(contextName); - writer.print("."); - writer.print(recordName); - String separator = ": "; - for (String tagName : outRec.getTagNames()) { - writer.print(separator); - separator = ", "; - writer.print(tagName); - writer.print("="); - writer.print(outRec.getTag(tagName)); - } - for (String metricName : outRec.getMetricNames()) { - writer.print(separator); - separator = ", "; - writer.print(metricName); - writer.print("="); - writer.print(outRec.getMetric(metricName)); - } - writer.println(); - } - - /** - * Flushes the output writer, forcing updates to disk. - */ - @InterfaceAudience.Private - public void flush() { - writer.flush(); - } -} +/* + * FileContext.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.metrics.file; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.metrics.ContextFactory; +import org.apache.hadoop.metrics.spi.AbstractMetricsContext; +import org.apache.hadoop.metrics.spi.OutputRecord; + +/** + * Metrics context for writing metrics to a file.

    + * + * This class is configured by setting ContextFactory attributes which in turn + * are usually configured through a properties file. All the attributes are + * prefixed by the contextName. For example, the properties file might contain: + *

    + * myContextName.fileName=/tmp/metrics.log
    + * myContextName.period=5
    + * 
    + * @see org.apache.hadoop.metrics2.sink.FileSink for metrics 2.0. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@Deprecated +public class FileContext extends AbstractMetricsContext { + + /* Configuration attribute names */ + @InterfaceAudience.Private + protected static final String FILE_NAME_PROPERTY = "fileName"; + @InterfaceAudience.Private + protected static final String PERIOD_PROPERTY = "period"; + + private File file = null; // file for metrics to be written to + private PrintWriter writer = null; + + /** Creates a new instance of FileContext */ + @InterfaceAudience.Private + public FileContext() {} + + @InterfaceAudience.Private + public void init(String contextName, ContextFactory factory) { + super.init(contextName, factory); + + String fileName = getAttribute(FILE_NAME_PROPERTY); + if (fileName != null) { + file = new File(fileName); + } + + parseAndSetPeriod(PERIOD_PROPERTY); + } + + /** + * Returns the configured file name, or null. + */ + @InterfaceAudience.Private + public String getFileName() { + if (file == null) { + return null; + } else { + return file.getName(); + } + } + + /** + * Starts or restarts monitoring, by opening in append-mode, the + * file specified by the fileName attribute, + * if specified. Otherwise the data will be written to standard + * output. + */ + @InterfaceAudience.Private + public void startMonitoring() + throws IOException + { + if (file == null) { + writer = new PrintWriter(new BufferedOutputStream(System.out)); + } else { + writer = new PrintWriter(new FileWriter(file, true)); + } + super.startMonitoring(); + } + + /** + * Stops monitoring, closing the file. + * @see #close() + */ + @InterfaceAudience.Private + public void stopMonitoring() { + super.stopMonitoring(); + + if (writer != null) { + writer.close(); + writer = null; + } + } + + /** + * Emits a metrics record to a file. + */ + @InterfaceAudience.Private + public void emitRecord(String contextName, String recordName, OutputRecord outRec) { + writer.print(contextName); + writer.print("."); + writer.print(recordName); + String separator = ": "; + for (String tagName : outRec.getTagNames()) { + writer.print(separator); + separator = ", "; + writer.print(tagName); + writer.print("="); + writer.print(outRec.getTag(tagName)); + } + for (String metricName : outRec.getMetricNames()) { + writer.print(separator); + separator = ", "; + writer.print(metricName); + writer.print("="); + writer.print(outRec.getMetric(metricName)); + } + writer.println(); + } + + /** + * Flushes the output writer, forcing updates to disk. + */ + @InterfaceAudience.Private + public void flush() { + writer.flush(); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/AbstractMetricsContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/AbstractMetricsContext.java index 947b0a12958..6e1e210e670 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/AbstractMetricsContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/AbstractMetricsContext.java @@ -1,481 +1,481 @@ -/* - * AbstractMetricsContext.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics.spi; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.Timer; -import java.util.TimerTask; -import java.util.TreeMap; -import java.util.Map.Entry; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.metrics.ContextFactory; -import org.apache.hadoop.metrics.MetricsContext; -import org.apache.hadoop.metrics.MetricsException; -import org.apache.hadoop.metrics.MetricsRecord; -import org.apache.hadoop.metrics.Updater; - -/** - * The main class of the Service Provider Interface. This class should be - * extended in order to integrate the Metrics API with a specific metrics - * client library.

    - * - * This class implements the internal table of metric data, and the timer - * on which data is to be sent to the metrics system. Subclasses must - * override the abstract emitRecord method in order to transmit - * the data.

    - */ -@InterfaceAudience.Public -@InterfaceStability.Evolving -public abstract class AbstractMetricsContext implements MetricsContext { - - private int period = MetricsContext.DEFAULT_PERIOD; - private Timer timer = null; - - private Set updaters = new HashSet(1); - private volatile boolean isMonitoring = false; - - private ContextFactory factory = null; - private String contextName = null; - - @InterfaceAudience.Private - public static class TagMap extends TreeMap { - private static final long serialVersionUID = 3546309335061952993L; - TagMap() { - super(); - } - TagMap(TagMap orig) { - super(orig); - } - /** - * Returns true if this tagmap contains every tag in other. - */ - public boolean containsAll(TagMap other) { - for (Map.Entry entry : other.entrySet()) { - Object value = get(entry.getKey()); - if (value == null || !value.equals(entry.getValue())) { - // either key does not exist here, or the value is different - return false; - } - } - return true; - } - } - - @InterfaceAudience.Private - public static class MetricMap extends TreeMap { - private static final long serialVersionUID = -7495051861141631609L; - MetricMap() { - super(); - } - MetricMap(MetricMap orig) { - super(orig); - } - } - - static class RecordMap extends HashMap { - private static final long serialVersionUID = 259835619700264611L; - } - - private Map bufferedData = new HashMap(); - - - /** - * Creates a new instance of AbstractMetricsContext - */ - protected AbstractMetricsContext() { - } - - /** - * Initializes the context. - */ - public void init(String contextName, ContextFactory factory) - { - this.contextName = contextName; - this.factory = factory; - } - - /** - * Convenience method for subclasses to access factory attributes. - */ - protected String getAttribute(String attributeName) { - String factoryAttribute = contextName + "." + attributeName; - return (String) factory.getAttribute(factoryAttribute); - } - - /** - * Returns an attribute-value map derived from the factory attributes - * by finding all factory attributes that begin with - * contextName.tableName. The returned map consists of - * those attributes with the contextName and tableName stripped off. - */ - protected Map getAttributeTable(String tableName) { - String prefix = contextName + "." + tableName + "."; - Map result = new HashMap(); - for (String attributeName : factory.getAttributeNames()) { - if (attributeName.startsWith(prefix)) { - String name = attributeName.substring(prefix.length()); - String value = (String) factory.getAttribute(attributeName); - result.put(name, value); - } - } - return result; - } - - /** - * Returns the context name. - */ - public String getContextName() { - return contextName; - } - - /** - * Returns the factory by which this context was created. - */ - public ContextFactory getContextFactory() { - return factory; - } - - /** - * Starts or restarts monitoring, the emitting of metrics records. - */ - public synchronized void startMonitoring() - throws IOException { - if (!isMonitoring) { - startTimer(); - isMonitoring = true; - } - } - - /** - * Stops monitoring. This does not free buffered data. - * @see #close() - */ - public synchronized void stopMonitoring() { - if (isMonitoring) { - stopTimer(); - isMonitoring = false; - } - } - - /** - * Returns true if monitoring is currently in progress. - */ - public boolean isMonitoring() { - return isMonitoring; - } - - /** - * Stops monitoring and frees buffered data, returning this - * object to its initial state. 
- */ - public synchronized void close() { - stopMonitoring(); - clearUpdaters(); - } - - /** - * Creates a new AbstractMetricsRecord instance with the given recordName. - * Throws an exception if the metrics implementation is configured with a fixed - * set of record names and recordName is not in that set. - * - * @param recordName the name of the record - * @throws MetricsException if recordName conflicts with configuration data - */ - public final synchronized MetricsRecord createRecord(String recordName) { - if (bufferedData.get(recordName) == null) { - bufferedData.put(recordName, new RecordMap()); - } - return newRecord(recordName); - } - - /** - * Subclasses should override this if they subclass MetricsRecordImpl. - * @param recordName the name of the record - * @return newly created instance of MetricsRecordImpl or subclass - */ - protected MetricsRecord newRecord(String recordName) { - return new MetricsRecordImpl(recordName, this); - } - - /** - * Registers a callback to be called at time intervals determined by - * the configuration. - * - * @param updater object to be run periodically; it should update - * some metrics records - */ - public synchronized void registerUpdater(final Updater updater) { - if (!updaters.contains(updater)) { - updaters.add(updater); - } - } - - /** - * Removes a callback, if it exists. - * - * @param updater object to be removed from the callback list - */ - public synchronized void unregisterUpdater(Updater updater) { - updaters.remove(updater); - } - - private synchronized void clearUpdaters() { - updaters.clear(); - } - - /** - * Starts timer if it is not already started - */ - private synchronized void startTimer() { - if (timer == null) { - timer = new Timer("Timer thread for monitoring " + getContextName(), - true); - TimerTask task = new TimerTask() { - public void run() { - try { - timerEvent(); - } - catch (IOException ioe) { - ioe.printStackTrace(); - } - } - }; - long millis = period * 1000; - timer.scheduleAtFixedRate(task, millis, millis); - } - } - - /** - * Stops timer if it is running - */ - private synchronized void stopTimer() { - if (timer != null) { - timer.cancel(); - timer = null; - } - } - - /** - * Timer callback. - */ - private void timerEvent() throws IOException { - if (isMonitoring) { - Collection myUpdaters; - synchronized (this) { - myUpdaters = new ArrayList(updaters); - } - // Run all the registered updates without holding a lock - // on this context - for (Updater updater : myUpdaters) { - try { - updater.doUpdates(this); - } - catch (Throwable throwable) { - throwable.printStackTrace(); - } - } - emitRecords(); - } - } - - /** - * Emits the records. - */ - private synchronized void emitRecords() throws IOException { - for (String recordName : bufferedData.keySet()) { - RecordMap recordMap = bufferedData.get(recordName); - synchronized (recordMap) { - Set> entrySet = recordMap.entrySet (); - for (Entry entry : entrySet) { - OutputRecord outRec = new OutputRecord(entry.getKey(), entry.getValue()); - emitRecord(contextName, recordName, outRec); - } - } - } - flush(); - } - - /** - * Retrieves all the records managed by this MetricsContext. - * Useful for monitoring systems that are polling-based. - * @return A non-null collection of all monitoring records. 
- */ - public synchronized Map> getAllRecords() { - Map> out = new TreeMap>(); - for (String recordName : bufferedData.keySet()) { - RecordMap recordMap = bufferedData.get(recordName); - synchronized (recordMap) { - List records = new ArrayList(); - Set> entrySet = recordMap.entrySet(); - for (Entry entry : entrySet) { - OutputRecord outRec = new OutputRecord(entry.getKey(), entry.getValue()); - records.add(outRec); - } - out.put(recordName, records); - } - } - return out; - } - - /** - * Sends a record to the metrics system. - */ - protected abstract void emitRecord(String contextName, String recordName, - OutputRecord outRec) throws IOException; - - /** - * Called each period after all records have been emitted, this method does nothing. - * Subclasses may override it in order to perform some kind of flush. - */ - protected void flush() throws IOException { - } - - /** - * Called by MetricsRecordImpl.update(). Creates or updates a row in - * the internal table of metric data. - */ - protected void update(MetricsRecordImpl record) { - String recordName = record.getRecordName(); - TagMap tagTable = record.getTagTable(); - Map metricUpdates = record.getMetricTable(); - - RecordMap recordMap = getRecordMap(recordName); - synchronized (recordMap) { - MetricMap metricMap = recordMap.get(tagTable); - if (metricMap == null) { - metricMap = new MetricMap(); - TagMap tagMap = new TagMap(tagTable); // clone tags - recordMap.put(tagMap, metricMap); - } - - Set> entrySet = metricUpdates.entrySet(); - for (Entry entry : entrySet) { - String metricName = entry.getKey (); - MetricValue updateValue = entry.getValue (); - Number updateNumber = updateValue.getNumber(); - Number currentNumber = metricMap.get(metricName); - if (currentNumber == null || updateValue.isAbsolute()) { - metricMap.put(metricName, updateNumber); - } - else { - Number newNumber = sum(updateNumber, currentNumber); - metricMap.put(metricName, newNumber); - } - } - } - } - - private synchronized RecordMap getRecordMap(String recordName) { - return bufferedData.get(recordName); - } - - /** - * Adds two numbers, coercing the second to the type of the first. - * - */ - private Number sum(Number a, Number b) { - if (a instanceof Integer) { - return Integer.valueOf(a.intValue() + b.intValue()); - } - else if (a instanceof Float) { - return new Float(a.floatValue() + b.floatValue()); - } - else if (a instanceof Short) { - return Short.valueOf((short)(a.shortValue() + b.shortValue())); - } - else if (a instanceof Byte) { - return Byte.valueOf((byte)(a.byteValue() + b.byteValue())); - } - else if (a instanceof Long) { - return Long.valueOf((a.longValue() + b.longValue())); - } - else { - // should never happen - throw new MetricsException("Invalid number type"); - } - - } - - /** - * Called by MetricsRecordImpl.remove(). Removes all matching rows in - * the internal table of metric data. A row matches if it has the same - * tag names and values as record, but it may also have additional - * tags. - */ - protected void remove(MetricsRecordImpl record) { - String recordName = record.getRecordName(); - TagMap tagTable = record.getTagTable(); - - RecordMap recordMap = getRecordMap(recordName); - synchronized (recordMap) { - Iterator it = recordMap.keySet().iterator(); - while (it.hasNext()) { - TagMap rowTags = it.next(); - if (rowTags.containsAll(tagTable)) { - it.remove(); - } - } - } - } - - /** - * Returns the timer period. 
- */ - public int getPeriod() { - return period; - } - - /** - * Sets the timer period - */ - protected void setPeriod(int period) { - this.period = period; - } - - /** - * If a period is set in the attribute passed in, override - * the default with it. - */ - protected void parseAndSetPeriod(String attributeName) { - String periodStr = getAttribute(attributeName); - if (periodStr != null) { - int period = 0; - try { - period = Integer.parseInt(periodStr); - } catch (NumberFormatException nfe) { - } - if (period <= 0) { - throw new MetricsException("Invalid period: " + periodStr); - } - setPeriod(period); - } - } -} +/* + * AbstractMetricsContext.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics.spi; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Timer; +import java.util.TimerTask; +import java.util.TreeMap; +import java.util.Map.Entry; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.metrics.ContextFactory; +import org.apache.hadoop.metrics.MetricsContext; +import org.apache.hadoop.metrics.MetricsException; +import org.apache.hadoop.metrics.MetricsRecord; +import org.apache.hadoop.metrics.Updater; + +/** + * The main class of the Service Provider Interface. This class should be + * extended in order to integrate the Metrics API with a specific metrics + * client library.
+ *
+ * This class implements the internal table of metric data, and the timer
+ * on which data is to be sent to the metrics system. Subclasses must
+ * override the abstract emitRecord method in order to transmit
+ * the data.
    + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public abstract class AbstractMetricsContext implements MetricsContext { + + private int period = MetricsContext.DEFAULT_PERIOD; + private Timer timer = null; + + private Set updaters = new HashSet(1); + private volatile boolean isMonitoring = false; + + private ContextFactory factory = null; + private String contextName = null; + + @InterfaceAudience.Private + public static class TagMap extends TreeMap { + private static final long serialVersionUID = 3546309335061952993L; + TagMap() { + super(); + } + TagMap(TagMap orig) { + super(orig); + } + /** + * Returns true if this tagmap contains every tag in other. + */ + public boolean containsAll(TagMap other) { + for (Map.Entry entry : other.entrySet()) { + Object value = get(entry.getKey()); + if (value == null || !value.equals(entry.getValue())) { + // either key does not exist here, or the value is different + return false; + } + } + return true; + } + } + + @InterfaceAudience.Private + public static class MetricMap extends TreeMap { + private static final long serialVersionUID = -7495051861141631609L; + MetricMap() { + super(); + } + MetricMap(MetricMap orig) { + super(orig); + } + } + + static class RecordMap extends HashMap { + private static final long serialVersionUID = 259835619700264611L; + } + + private Map bufferedData = new HashMap(); + + + /** + * Creates a new instance of AbstractMetricsContext + */ + protected AbstractMetricsContext() { + } + + /** + * Initializes the context. + */ + public void init(String contextName, ContextFactory factory) + { + this.contextName = contextName; + this.factory = factory; + } + + /** + * Convenience method for subclasses to access factory attributes. + */ + protected String getAttribute(String attributeName) { + String factoryAttribute = contextName + "." + attributeName; + return (String) factory.getAttribute(factoryAttribute); + } + + /** + * Returns an attribute-value map derived from the factory attributes + * by finding all factory attributes that begin with + * contextName.tableName. The returned map consists of + * those attributes with the contextName and tableName stripped off. + */ + protected Map getAttributeTable(String tableName) { + String prefix = contextName + "." + tableName + "."; + Map result = new HashMap(); + for (String attributeName : factory.getAttributeNames()) { + if (attributeName.startsWith(prefix)) { + String name = attributeName.substring(prefix.length()); + String value = (String) factory.getAttribute(attributeName); + result.put(name, value); + } + } + return result; + } + + /** + * Returns the context name. + */ + public String getContextName() { + return contextName; + } + + /** + * Returns the factory by which this context was created. + */ + public ContextFactory getContextFactory() { + return factory; + } + + /** + * Starts or restarts monitoring, the emitting of metrics records. + */ + public synchronized void startMonitoring() + throws IOException { + if (!isMonitoring) { + startTimer(); + isMonitoring = true; + } + } + + /** + * Stops monitoring. This does not free buffered data. + * @see #close() + */ + public synchronized void stopMonitoring() { + if (isMonitoring) { + stopTimer(); + isMonitoring = false; + } + } + + /** + * Returns true if monitoring is currently in progress. + */ + public boolean isMonitoring() { + return isMonitoring; + } + + /** + * Stops monitoring and frees buffered data, returning this + * object to its initial state. 
+ */ + public synchronized void close() { + stopMonitoring(); + clearUpdaters(); + } + + /** + * Creates a new AbstractMetricsRecord instance with the given recordName. + * Throws an exception if the metrics implementation is configured with a fixed + * set of record names and recordName is not in that set. + * + * @param recordName the name of the record + * @throws MetricsException if recordName conflicts with configuration data + */ + public final synchronized MetricsRecord createRecord(String recordName) { + if (bufferedData.get(recordName) == null) { + bufferedData.put(recordName, new RecordMap()); + } + return newRecord(recordName); + } + + /** + * Subclasses should override this if they subclass MetricsRecordImpl. + * @param recordName the name of the record + * @return newly created instance of MetricsRecordImpl or subclass + */ + protected MetricsRecord newRecord(String recordName) { + return new MetricsRecordImpl(recordName, this); + } + + /** + * Registers a callback to be called at time intervals determined by + * the configuration. + * + * @param updater object to be run periodically; it should update + * some metrics records + */ + public synchronized void registerUpdater(final Updater updater) { + if (!updaters.contains(updater)) { + updaters.add(updater); + } + } + + /** + * Removes a callback, if it exists. + * + * @param updater object to be removed from the callback list + */ + public synchronized void unregisterUpdater(Updater updater) { + updaters.remove(updater); + } + + private synchronized void clearUpdaters() { + updaters.clear(); + } + + /** + * Starts timer if it is not already started + */ + private synchronized void startTimer() { + if (timer == null) { + timer = new Timer("Timer thread for monitoring " + getContextName(), + true); + TimerTask task = new TimerTask() { + public void run() { + try { + timerEvent(); + } + catch (IOException ioe) { + ioe.printStackTrace(); + } + } + }; + long millis = period * 1000; + timer.scheduleAtFixedRate(task, millis, millis); + } + } + + /** + * Stops timer if it is running + */ + private synchronized void stopTimer() { + if (timer != null) { + timer.cancel(); + timer = null; + } + } + + /** + * Timer callback. + */ + private void timerEvent() throws IOException { + if (isMonitoring) { + Collection myUpdaters; + synchronized (this) { + myUpdaters = new ArrayList(updaters); + } + // Run all the registered updates without holding a lock + // on this context + for (Updater updater : myUpdaters) { + try { + updater.doUpdates(this); + } + catch (Throwable throwable) { + throwable.printStackTrace(); + } + } + emitRecords(); + } + } + + /** + * Emits the records. + */ + private synchronized void emitRecords() throws IOException { + for (String recordName : bufferedData.keySet()) { + RecordMap recordMap = bufferedData.get(recordName); + synchronized (recordMap) { + Set> entrySet = recordMap.entrySet (); + for (Entry entry : entrySet) { + OutputRecord outRec = new OutputRecord(entry.getKey(), entry.getValue()); + emitRecord(contextName, recordName, outRec); + } + } + } + flush(); + } + + /** + * Retrieves all the records managed by this MetricsContext. + * Useful for monitoring systems that are polling-based. + * @return A non-null collection of all monitoring records. 
+ */ + public synchronized Map> getAllRecords() { + Map> out = new TreeMap>(); + for (String recordName : bufferedData.keySet()) { + RecordMap recordMap = bufferedData.get(recordName); + synchronized (recordMap) { + List records = new ArrayList(); + Set> entrySet = recordMap.entrySet(); + for (Entry entry : entrySet) { + OutputRecord outRec = new OutputRecord(entry.getKey(), entry.getValue()); + records.add(outRec); + } + out.put(recordName, records); + } + } + return out; + } + + /** + * Sends a record to the metrics system. + */ + protected abstract void emitRecord(String contextName, String recordName, + OutputRecord outRec) throws IOException; + + /** + * Called each period after all records have been emitted, this method does nothing. + * Subclasses may override it in order to perform some kind of flush. + */ + protected void flush() throws IOException { + } + + /** + * Called by MetricsRecordImpl.update(). Creates or updates a row in + * the internal table of metric data. + */ + protected void update(MetricsRecordImpl record) { + String recordName = record.getRecordName(); + TagMap tagTable = record.getTagTable(); + Map metricUpdates = record.getMetricTable(); + + RecordMap recordMap = getRecordMap(recordName); + synchronized (recordMap) { + MetricMap metricMap = recordMap.get(tagTable); + if (metricMap == null) { + metricMap = new MetricMap(); + TagMap tagMap = new TagMap(tagTable); // clone tags + recordMap.put(tagMap, metricMap); + } + + Set> entrySet = metricUpdates.entrySet(); + for (Entry entry : entrySet) { + String metricName = entry.getKey (); + MetricValue updateValue = entry.getValue (); + Number updateNumber = updateValue.getNumber(); + Number currentNumber = metricMap.get(metricName); + if (currentNumber == null || updateValue.isAbsolute()) { + metricMap.put(metricName, updateNumber); + } + else { + Number newNumber = sum(updateNumber, currentNumber); + metricMap.put(metricName, newNumber); + } + } + } + } + + private synchronized RecordMap getRecordMap(String recordName) { + return bufferedData.get(recordName); + } + + /** + * Adds two numbers, coercing the second to the type of the first. + * + */ + private Number sum(Number a, Number b) { + if (a instanceof Integer) { + return Integer.valueOf(a.intValue() + b.intValue()); + } + else if (a instanceof Float) { + return new Float(a.floatValue() + b.floatValue()); + } + else if (a instanceof Short) { + return Short.valueOf((short)(a.shortValue() + b.shortValue())); + } + else if (a instanceof Byte) { + return Byte.valueOf((byte)(a.byteValue() + b.byteValue())); + } + else if (a instanceof Long) { + return Long.valueOf((a.longValue() + b.longValue())); + } + else { + // should never happen + throw new MetricsException("Invalid number type"); + } + + } + + /** + * Called by MetricsRecordImpl.remove(). Removes all matching rows in + * the internal table of metric data. A row matches if it has the same + * tag names and values as record, but it may also have additional + * tags. + */ + protected void remove(MetricsRecordImpl record) { + String recordName = record.getRecordName(); + TagMap tagTable = record.getTagTable(); + + RecordMap recordMap = getRecordMap(recordName); + synchronized (recordMap) { + Iterator it = recordMap.keySet().iterator(); + while (it.hasNext()) { + TagMap rowTags = it.next(); + if (rowTags.containsAll(tagTable)) { + it.remove(); + } + } + } + } + + /** + * Returns the timer period. 
+ */ + public int getPeriod() { + return period; + } + + /** + * Sets the timer period + */ + protected void setPeriod(int period) { + this.period = period; + } + + /** + * If a period is set in the attribute passed in, override + * the default with it. + */ + protected void parseAndSetPeriod(String attributeName) { + String periodStr = getAttribute(attributeName); + if (periodStr != null) { + int period = 0; + try { + period = Integer.parseInt(periodStr); + } catch (NumberFormatException nfe) { + } + if (period <= 0) { + throw new MetricsException("Invalid period: " + periodStr); + } + setPeriod(period); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/MetricsRecordImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/MetricsRecordImpl.java index 85cf00e0918..0c379b6a329 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/MetricsRecordImpl.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics/spi/MetricsRecordImpl.java @@ -1,281 +1,281 @@ -/* - * MetricsRecordImpl.java - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.metrics.spi; - -import java.util.LinkedHashMap; -import java.util.Map; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.metrics.MetricsException; -import org.apache.hadoop.metrics.MetricsRecord; -import org.apache.hadoop.metrics.spi.AbstractMetricsContext.TagMap; - -/** - * An implementation of MetricsRecord. Keeps a back-pointer to the context - * from which it was created, and delegates back to it on update - * and remove(). - */ -@InterfaceAudience.Public -@InterfaceStability.Evolving -public class MetricsRecordImpl implements MetricsRecord { - - private TagMap tagTable = new TagMap(); - private Map metricTable = new LinkedHashMap(); - - private String recordName; - private AbstractMetricsContext context; - - - /** Creates a new instance of FileRecord */ - protected MetricsRecordImpl(String recordName, AbstractMetricsContext context) - { - this.recordName = recordName; - this.context = context; - } - - /** - * Returns the record name. - * - * @return the record name - */ - public String getRecordName() { - return recordName; - } - - /** - * Sets the named tag to the specified value. 
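[Editor's note on the AbstractMetricsContext hunks above: the contract of the class is easiest to see from a subclass. The following is a minimal, hypothetical Java sketch, not part of this patch; the class name and the "period" attribute key are assumptions for illustration. It shows a context that simply prints each buffered OutputRecord when the timer fires.]

    package org.apache.hadoop.metrics.spi;

    import java.io.IOException;

    import org.apache.hadoop.metrics.ContextFactory;

    public class LoggingMetricsContext extends AbstractMetricsContext {

      // Assumed attribute name; the real key comes from the deployment's
      // metrics configuration (looked up as "<contextName>.period").
      private static final String PERIOD_PROPERTY = "period";

      public LoggingMetricsContext() {
        super();
      }

      @Override
      public void init(String contextName, ContextFactory factory) {
        super.init(contextName, factory);
        // Overrides DEFAULT_PERIOD when the period attribute is configured.
        parseAndSetPeriod(PERIOD_PROPERTY);
      }

      @Override
      protected void emitRecord(String contextName, String recordName,
                                OutputRecord outRec) throws IOException {
        // Called once per buffered row each period; a real context would
        // push the record to its metrics backend instead of printing it.
        System.out.println(contextName + "." + recordName + ": " + outRec);
      }
    }
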
- * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public void setTag(String tagName, String tagValue) { - if (tagValue == null) { - tagValue = ""; - } - tagTable.put(tagName, tagValue); - } - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public void setTag(String tagName, int tagValue) { - tagTable.put(tagName, Integer.valueOf(tagValue)); - } - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public void setTag(String tagName, long tagValue) { - tagTable.put(tagName, Long.valueOf(tagValue)); - } - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public void setTag(String tagName, short tagValue) { - tagTable.put(tagName, Short.valueOf(tagValue)); - } - - /** - * Sets the named tag to the specified value. - * - * @param tagName name of the tag - * @param tagValue new value of the tag - * @throws MetricsException if the tagName conflicts with the configuration - */ - public void setTag(String tagName, byte tagValue) { - tagTable.put(tagName, Byte.valueOf(tagValue)); - } - - /** - * Removes any tag of the specified name. - */ - public void removeTag(String tagName) { - tagTable.remove(tagName); - } - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void setMetric(String metricName, int metricValue) { - setAbsolute(metricName, Integer.valueOf(metricValue)); - } - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void setMetric(String metricName, long metricValue) { - setAbsolute(metricName, Long.valueOf(metricValue)); - } - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void setMetric(String metricName, short metricValue) { - setAbsolute(metricName, Short.valueOf(metricValue)); - } - - /** - * Sets the named metric to the specified value. - * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void setMetric(String metricName, byte metricValue) { - setAbsolute(metricName, Byte.valueOf(metricValue)); - } - - /** - * Sets the named metric to the specified value. 
- * - * @param metricName name of the metric - * @param metricValue new value of the metric - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void setMetric(String metricName, float metricValue) { - setAbsolute(metricName, new Float(metricValue)); - } - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void incrMetric(String metricName, int metricValue) { - setIncrement(metricName, Integer.valueOf(metricValue)); - } - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void incrMetric(String metricName, long metricValue) { - setIncrement(metricName, Long.valueOf(metricValue)); - } - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void incrMetric(String metricName, short metricValue) { - setIncrement(metricName, Short.valueOf(metricValue)); - } - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void incrMetric(String metricName, byte metricValue) { - setIncrement(metricName, Byte.valueOf(metricValue)); - } - - /** - * Increments the named metric by the specified value. - * - * @param metricName name of the metric - * @param metricValue incremental value - * @throws MetricsException if the metricName or the type of the metricValue - * conflicts with the configuration - */ - public void incrMetric(String metricName, float metricValue) { - setIncrement(metricName, new Float(metricValue)); - } - - private void setAbsolute(String metricName, Number metricValue) { - metricTable.put(metricName, new MetricValue(metricValue, MetricValue.ABSOLUTE)); - } - - private void setIncrement(String metricName, Number metricValue) { - metricTable.put(metricName, new MetricValue(metricValue, MetricValue.INCREMENT)); - } - - /** - * Updates the table of buffered data which is to be sent periodically. - * If the tag values match an existing row, that row is updated; - * otherwise, a new row is added. - */ - public void update() { - context.update(this); - } - - /** - * Removes the row, if it exists, in the buffered data table having tags - * that equal the tags that have been set on this record. - */ - public void remove() { - context.remove(this); - } - - TagMap getTagTable() { - return tagTable; - } - - Map getMetricTable() { - return metricTable; - } -} +/* + * MetricsRecordImpl.java + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.metrics.spi; + +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.metrics.MetricsException; +import org.apache.hadoop.metrics.MetricsRecord; +import org.apache.hadoop.metrics.spi.AbstractMetricsContext.TagMap; + +/** + * An implementation of MetricsRecord. Keeps a back-pointer to the context + * from which it was created, and delegates back to it on update + * and remove(). + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class MetricsRecordImpl implements MetricsRecord { + + private TagMap tagTable = new TagMap(); + private Map metricTable = new LinkedHashMap(); + + private String recordName; + private AbstractMetricsContext context; + + + /** Creates a new instance of FileRecord */ + protected MetricsRecordImpl(String recordName, AbstractMetricsContext context) + { + this.recordName = recordName; + this.context = context; + } + + /** + * Returns the record name. + * + * @return the record name + */ + public String getRecordName() { + return recordName; + } + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public void setTag(String tagName, String tagValue) { + if (tagValue == null) { + tagValue = ""; + } + tagTable.put(tagName, tagValue); + } + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public void setTag(String tagName, int tagValue) { + tagTable.put(tagName, Integer.valueOf(tagValue)); + } + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public void setTag(String tagName, long tagValue) { + tagTable.put(tagName, Long.valueOf(tagValue)); + } + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public void setTag(String tagName, short tagValue) { + tagTable.put(tagName, Short.valueOf(tagValue)); + } + + /** + * Sets the named tag to the specified value. + * + * @param tagName name of the tag + * @param tagValue new value of the tag + * @throws MetricsException if the tagName conflicts with the configuration + */ + public void setTag(String tagName, byte tagValue) { + tagTable.put(tagName, Byte.valueOf(tagValue)); + } + + /** + * Removes any tag of the specified name. 
+ */ + public void removeTag(String tagName) { + tagTable.remove(tagName); + } + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void setMetric(String metricName, int metricValue) { + setAbsolute(metricName, Integer.valueOf(metricValue)); + } + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void setMetric(String metricName, long metricValue) { + setAbsolute(metricName, Long.valueOf(metricValue)); + } + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void setMetric(String metricName, short metricValue) { + setAbsolute(metricName, Short.valueOf(metricValue)); + } + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void setMetric(String metricName, byte metricValue) { + setAbsolute(metricName, Byte.valueOf(metricValue)); + } + + /** + * Sets the named metric to the specified value. + * + * @param metricName name of the metric + * @param metricValue new value of the metric + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void setMetric(String metricName, float metricValue) { + setAbsolute(metricName, new Float(metricValue)); + } + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void incrMetric(String metricName, int metricValue) { + setIncrement(metricName, Integer.valueOf(metricValue)); + } + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void incrMetric(String metricName, long metricValue) { + setIncrement(metricName, Long.valueOf(metricValue)); + } + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void incrMetric(String metricName, short metricValue) { + setIncrement(metricName, Short.valueOf(metricValue)); + } + + /** + * Increments the named metric by the specified value. 
+ * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void incrMetric(String metricName, byte metricValue) { + setIncrement(metricName, Byte.valueOf(metricValue)); + } + + /** + * Increments the named metric by the specified value. + * + * @param metricName name of the metric + * @param metricValue incremental value + * @throws MetricsException if the metricName or the type of the metricValue + * conflicts with the configuration + */ + public void incrMetric(String metricName, float metricValue) { + setIncrement(metricName, new Float(metricValue)); + } + + private void setAbsolute(String metricName, Number metricValue) { + metricTable.put(metricName, new MetricValue(metricValue, MetricValue.ABSOLUTE)); + } + + private void setIncrement(String metricName, Number metricValue) { + metricTable.put(metricName, new MetricValue(metricValue, MetricValue.INCREMENT)); + } + + /** + * Updates the table of buffered data which is to be sent periodically. + * If the tag values match an existing row, that row is updated; + * otherwise, a new row is added. + */ + public void update() { + context.update(this); + } + + /** + * Removes the row, if it exists, in the buffered data table having tags + * that equal the tags that have been set on this record. + */ + public void remove() { + context.remove(this); + } + + TagMap getTagTable() { + return tagTable; + } + + Map getMetricTable() { + return metricTable; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java index 7d321e8a297..4a3424bad32 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java @@ -1,460 +1,460 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.util; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.zip.Checksum; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.fs.ChecksumException; - -/** - * This class provides inteface and utilities for processing checksums for - * DFS data transfers. 
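[Editor's note on the MetricsRecordImpl hunks above: records are normally reached through the MetricsContext API rather than constructed directly. A short, hypothetical usage sketch follows; record, tag, and metric names are invented for illustration, and how the context itself is obtained is left out.]

    import java.io.IOException;

    import org.apache.hadoop.metrics.MetricsContext;
    import org.apache.hadoop.metrics.MetricsRecord;
    import org.apache.hadoop.metrics.Updater;

    public class MetricsRecordExample {

      // Assumes the caller already obtained a MetricsContext elsewhere.
      public static void report(MetricsContext context) throws IOException {
        // createRecord() returns a MetricsRecordImpl (or subclass) bound to the context.
        MetricsRecord record = context.createRecord("rpc");
        record.setTag("port", 8020);           // tags identify the buffered row
        record.setMetric("queueLength", 42);   // ABSOLUTE: replaces the stored value
        record.incrMetric("callsQueued", 1);   // INCREMENT: added to the stored value
        record.update();                       // copies into the context's buffered table

        // Or refresh the record once per period from a registered callback.
        context.registerUpdater(new Updater() {
          public void doUpdates(MetricsContext ctx) {
            MetricsRecord r = ctx.createRecord("rpc");
            r.incrMetric("heartbeats", 1);
            r.update();
          }
        });
        context.startMonitoring();
      }
    }
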
- */ -@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) -@InterfaceStability.Evolving -public class DataChecksum implements Checksum { - - // Misc constants - public static final int HEADER_LEN = 5; /// 1 byte type and 4 byte len - - // checksum types - public static final int CHECKSUM_NULL = 0; - public static final int CHECKSUM_CRC32 = 1; - public static final int CHECKSUM_CRC32C = 2; - public static final int CHECKSUM_DEFAULT = 3; - public static final int CHECKSUM_MIXED = 4; - - /** The checksum types */ - public static enum Type { - NULL (CHECKSUM_NULL, 0), - CRC32 (CHECKSUM_CRC32, 4), - CRC32C(CHECKSUM_CRC32C, 4), - DEFAULT(CHECKSUM_DEFAULT, 0), // This cannot be used to create DataChecksum - MIXED (CHECKSUM_MIXED, 0); // This cannot be used to create DataChecksum - - public final int id; - public final int size; - - private Type(int id, int size) { - this.id = id; - this.size = size; - } - - /** @return the type corresponding to the id. */ - public static Type valueOf(int id) { - if (id < 0 || id >= values().length) { - throw new IllegalArgumentException("id=" + id - + " out of range [0, " + values().length + ")"); - } - return values()[id]; - } - } - - - public static DataChecksum newDataChecksum(Type type, int bytesPerChecksum ) { - if ( bytesPerChecksum <= 0 ) { - return null; - } - - switch ( type ) { - case NULL : - return new DataChecksum(type, new ChecksumNull(), bytesPerChecksum ); - case CRC32 : - return new DataChecksum(type, new PureJavaCrc32(), bytesPerChecksum ); - case CRC32C: - return new DataChecksum(type, new PureJavaCrc32C(), bytesPerChecksum); - default: - return null; - } - } - - /** - * Creates a DataChecksum from HEADER_LEN bytes from arr[offset]. - * @return DataChecksum of the type in the array or null in case of an error. - */ - public static DataChecksum newDataChecksum( byte bytes[], int offset ) { - if ( offset < 0 || bytes.length < offset + HEADER_LEN ) { - return null; - } - - // like readInt(): - int bytesPerChecksum = ( (bytes[offset+1] & 0xff) << 24 ) | - ( (bytes[offset+2] & 0xff) << 16 ) | - ( (bytes[offset+3] & 0xff) << 8 ) | - ( (bytes[offset+4] & 0xff) ); - return newDataChecksum( Type.valueOf(bytes[0]), bytesPerChecksum ); - } - - /** - * This constructucts a DataChecksum by reading HEADER_LEN bytes from - * input stream in - */ - public static DataChecksum newDataChecksum( DataInputStream in ) - throws IOException { - int type = in.readByte(); - int bpc = in.readInt(); - DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc ); - if ( summer == null ) { - throw new IOException( "Could not create DataChecksum of type " + - type + " with bytesPerChecksum " + bpc ); - } - return summer; - } - - /** - * Writes the checksum header to the output stream out. - */ - public void writeHeader( DataOutputStream out ) - throws IOException { - out.writeByte( type.id ); - out.writeInt( bytesPerChecksum ); - } - - public byte[] getHeader() { - byte[] header = new byte[DataChecksum.HEADER_LEN]; - header[0] = (byte) (type.id & 0xff); - // Writing in buffer just like DataOutput.WriteInt() - header[1+0] = (byte) ((bytesPerChecksum >>> 24) & 0xff); - header[1+1] = (byte) ((bytesPerChecksum >>> 16) & 0xff); - header[1+2] = (byte) ((bytesPerChecksum >>> 8) & 0xff); - header[1+3] = (byte) (bytesPerChecksum & 0xff); - return header; - } - - /** - * Writes the current checksum to the stream. - * If reset is true, then resets the checksum. - * @return number of bytes written. 
Will be equal to getChecksumSize(); - */ - public int writeValue( DataOutputStream out, boolean reset ) - throws IOException { - if ( type.size <= 0 ) { - return 0; - } - - if ( type.size == 4 ) { - out.writeInt( (int) summer.getValue() ); - } else { - throw new IOException( "Unknown Checksum " + type ); - } - - if ( reset ) { - reset(); - } - - return type.size; - } - - /** - * Writes the current checksum to a buffer. - * If reset is true, then resets the checksum. - * @return number of bytes written. Will be equal to getChecksumSize(); - */ - public int writeValue( byte[] buf, int offset, boolean reset ) - throws IOException { - if ( type.size <= 0 ) { - return 0; - } - - if ( type.size == 4 ) { - int checksum = (int) summer.getValue(); - buf[offset+0] = (byte) ((checksum >>> 24) & 0xff); - buf[offset+1] = (byte) ((checksum >>> 16) & 0xff); - buf[offset+2] = (byte) ((checksum >>> 8) & 0xff); - buf[offset+3] = (byte) (checksum & 0xff); - } else { - throw new IOException( "Unknown Checksum " + type ); - } - - if ( reset ) { - reset(); - } - - return type.size; - } - - /** - * Compares the checksum located at buf[offset] with the current checksum. - * @return true if the checksum matches and false otherwise. - */ - public boolean compare( byte buf[], int offset ) { - if ( type.size == 4 ) { - int checksum = ( (buf[offset+0] & 0xff) << 24 ) | - ( (buf[offset+1] & 0xff) << 16 ) | - ( (buf[offset+2] & 0xff) << 8 ) | - ( (buf[offset+3] & 0xff) ); - return checksum == (int) summer.getValue(); - } - return type.size == 0; - } - - private final Type type; - private final Checksum summer; - private final int bytesPerChecksum; - private int inSum = 0; - - private DataChecksum( Type type, Checksum checksum, int chunkSize ) { - this.type = type; - summer = checksum; - bytesPerChecksum = chunkSize; - } - - // Accessors - public Type getChecksumType() { - return type; - } - public int getChecksumSize() { - return type.size; - } - public int getBytesPerChecksum() { - return bytesPerChecksum; - } - public int getNumBytesInSum() { - return inSum; - } - - public static final int SIZE_OF_INTEGER = Integer.SIZE / Byte.SIZE; - static public int getChecksumHeaderSize() { - return 1 + SIZE_OF_INTEGER; // type byte, bytesPerChecksum int - } - //Checksum Interface. Just a wrapper around member summer. - @Override - public long getValue() { - return summer.getValue(); - } - @Override - public void reset() { - summer.reset(); - inSum = 0; - } - @Override - public void update( byte[] b, int off, int len ) { - if ( len > 0 ) { - summer.update( b, off, len ); - inSum += len; - } - } - @Override - public void update( int b ) { - summer.update( b ); - inSum += 1; - } - - /** - * Verify that the given checksums match the given data. - * - * The 'mark' of the ByteBuffer parameters may be modified by this function,. - * but the position is maintained. - * - * @param data the DirectByteBuffer pointing to the data to verify. 
- * @param checksums the DirectByteBuffer pointing to a series of stored - * checksums - * @param fileName the name of the file being read, for error-reporting - * @param basePos the file position to which the start of 'data' corresponds - * @throws ChecksumException if the checksums do not match - */ - public void verifyChunkedSums(ByteBuffer data, ByteBuffer checksums, - String fileName, long basePos) - throws ChecksumException { - if (type.size == 0) return; - - if (data.hasArray() && checksums.hasArray()) { - verifyChunkedSums( - data.array(), data.arrayOffset() + data.position(), data.remaining(), - checksums.array(), checksums.arrayOffset() + checksums.position(), - fileName, basePos); - return; - } - if (NativeCrc32.isAvailable()) { - NativeCrc32.verifyChunkedSums(bytesPerChecksum, type.id, checksums, data, - fileName, basePos); - return; - } - - int startDataPos = data.position(); - data.mark(); - checksums.mark(); - try { - byte[] buf = new byte[bytesPerChecksum]; - byte[] sum = new byte[type.size]; - while (data.remaining() > 0) { - int n = Math.min(data.remaining(), bytesPerChecksum); - checksums.get(sum); - data.get(buf, 0, n); - summer.reset(); - summer.update(buf, 0, n); - int calculated = (int)summer.getValue(); - int stored = (sum[0] << 24 & 0xff000000) | - (sum[1] << 16 & 0xff0000) | - (sum[2] << 8 & 0xff00) | - sum[3] & 0xff; - if (calculated != stored) { - long errPos = basePos + data.position() - startDataPos - n; - throw new ChecksumException( - "Checksum error: "+ fileName + " at "+ errPos + - " exp: " + stored + " got: " + calculated, errPos); - } - } - } finally { - data.reset(); - checksums.reset(); - } - } - - /** - * Implementation of chunked verification specifically on byte arrays. This - * is to avoid the copy when dealing with ByteBuffers that have array backing. - */ - private void verifyChunkedSums( - byte[] data, int dataOff, int dataLen, - byte[] checksums, int checksumsOff, String fileName, - long basePos) throws ChecksumException { - - int remaining = dataLen; - int dataPos = 0; - while (remaining > 0) { - int n = Math.min(remaining, bytesPerChecksum); - - summer.reset(); - summer.update(data, dataOff + dataPos, n); - dataPos += n; - remaining -= n; - - int calculated = (int)summer.getValue(); - int stored = (checksums[checksumsOff] << 24 & 0xff000000) | - (checksums[checksumsOff + 1] << 16 & 0xff0000) | - (checksums[checksumsOff + 2] << 8 & 0xff00) | - checksums[checksumsOff + 3] & 0xff; - checksumsOff += 4; - if (calculated != stored) { - long errPos = basePos + dataPos - n; - throw new ChecksumException( - "Checksum error: "+ fileName + " at "+ errPos + - " exp: " + stored + " got: " + calculated, errPos); - } - } - } - - /** - * Calculate checksums for the given data. - * - * The 'mark' of the ByteBuffer parameters may be modified by this function, - * but the position is maintained. - * - * @param data the DirectByteBuffer pointing to the data to checksum. - * @param checksums the DirectByteBuffer into which checksums will be - * stored. Enough space must be available in this - * buffer to put the checksums. 
- */ - public void calculateChunkedSums(ByteBuffer data, ByteBuffer checksums) { - if (type.size == 0) return; - - if (data.hasArray() && checksums.hasArray()) { - calculateChunkedSums(data.array(), data.arrayOffset() + data.position(), data.remaining(), - checksums.array(), checksums.arrayOffset() + checksums.position()); - return; - } - - data.mark(); - checksums.mark(); - try { - byte[] buf = new byte[bytesPerChecksum]; - while (data.remaining() > 0) { - int n = Math.min(data.remaining(), bytesPerChecksum); - data.get(buf, 0, n); - summer.reset(); - summer.update(buf, 0, n); - checksums.putInt((int)summer.getValue()); - } - } finally { - data.reset(); - checksums.reset(); - } - } - - /** - * Implementation of chunked calculation specifically on byte arrays. This - * is to avoid the copy when dealing with ByteBuffers that have array backing. - */ - private void calculateChunkedSums( - byte[] data, int dataOffset, int dataLength, - byte[] sums, int sumsOffset) { - - int remaining = dataLength; - while (remaining > 0) { - int n = Math.min(remaining, bytesPerChecksum); - summer.reset(); - summer.update(data, dataOffset, n); - dataOffset += n; - remaining -= n; - long calculated = summer.getValue(); - sums[sumsOffset++] = (byte) (calculated >> 24); - sums[sumsOffset++] = (byte) (calculated >> 16); - sums[sumsOffset++] = (byte) (calculated >> 8); - sums[sumsOffset++] = (byte) (calculated); - } - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof DataChecksum)) { - return false; - } - DataChecksum o = (DataChecksum)other; - return o.bytesPerChecksum == this.bytesPerChecksum && - o.type == this.type; - } - - @Override - public int hashCode() { - return (this.type.id + 31) * this.bytesPerChecksum; - } - - @Override - public String toString() { - return "DataChecksum(type=" + type + - ", chunkSize=" + bytesPerChecksum + ")"; - } - - /** - * This just provides a dummy implimentation for Checksum class - * This is used when there is no checksum available or required for - * data - */ - static class ChecksumNull implements Checksum { - - public ChecksumNull() {} - - //Dummy interface - @Override - public long getValue() { return 0; } - @Override - public void reset() {} - @Override - public void update(byte[] b, int off, int len) {} - @Override - public void update(int b) {} - }; -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.util; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.zip.Checksum; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.ChecksumException; + +/** + * This class provides inteface and utilities for processing checksums for + * DFS data transfers. + */ +@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) +@InterfaceStability.Evolving +public class DataChecksum implements Checksum { + + // Misc constants + public static final int HEADER_LEN = 5; /// 1 byte type and 4 byte len + + // checksum types + public static final int CHECKSUM_NULL = 0; + public static final int CHECKSUM_CRC32 = 1; + public static final int CHECKSUM_CRC32C = 2; + public static final int CHECKSUM_DEFAULT = 3; + public static final int CHECKSUM_MIXED = 4; + + /** The checksum types */ + public static enum Type { + NULL (CHECKSUM_NULL, 0), + CRC32 (CHECKSUM_CRC32, 4), + CRC32C(CHECKSUM_CRC32C, 4), + DEFAULT(CHECKSUM_DEFAULT, 0), // This cannot be used to create DataChecksum + MIXED (CHECKSUM_MIXED, 0); // This cannot be used to create DataChecksum + + public final int id; + public final int size; + + private Type(int id, int size) { + this.id = id; + this.size = size; + } + + /** @return the type corresponding to the id. */ + public static Type valueOf(int id) { + if (id < 0 || id >= values().length) { + throw new IllegalArgumentException("id=" + id + + " out of range [0, " + values().length + ")"); + } + return values()[id]; + } + } + + + public static DataChecksum newDataChecksum(Type type, int bytesPerChecksum ) { + if ( bytesPerChecksum <= 0 ) { + return null; + } + + switch ( type ) { + case NULL : + return new DataChecksum(type, new ChecksumNull(), bytesPerChecksum ); + case CRC32 : + return new DataChecksum(type, new PureJavaCrc32(), bytesPerChecksum ); + case CRC32C: + return new DataChecksum(type, new PureJavaCrc32C(), bytesPerChecksum); + default: + return null; + } + } + + /** + * Creates a DataChecksum from HEADER_LEN bytes from arr[offset]. + * @return DataChecksum of the type in the array or null in case of an error. + */ + public static DataChecksum newDataChecksum( byte bytes[], int offset ) { + if ( offset < 0 || bytes.length < offset + HEADER_LEN ) { + return null; + } + + // like readInt(): + int bytesPerChecksum = ( (bytes[offset+1] & 0xff) << 24 ) | + ( (bytes[offset+2] & 0xff) << 16 ) | + ( (bytes[offset+3] & 0xff) << 8 ) | + ( (bytes[offset+4] & 0xff) ); + return newDataChecksum( Type.valueOf(bytes[0]), bytesPerChecksum ); + } + + /** + * This constructucts a DataChecksum by reading HEADER_LEN bytes from + * input stream in + */ + public static DataChecksum newDataChecksum( DataInputStream in ) + throws IOException { + int type = in.readByte(); + int bpc = in.readInt(); + DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc ); + if ( summer == null ) { + throw new IOException( "Could not create DataChecksum of type " + + type + " with bytesPerChecksum " + bpc ); + } + return summer; + } + + /** + * Writes the checksum header to the output stream out. 
+ */ + public void writeHeader( DataOutputStream out ) + throws IOException { + out.writeByte( type.id ); + out.writeInt( bytesPerChecksum ); + } + + public byte[] getHeader() { + byte[] header = new byte[DataChecksum.HEADER_LEN]; + header[0] = (byte) (type.id & 0xff); + // Writing in buffer just like DataOutput.WriteInt() + header[1+0] = (byte) ((bytesPerChecksum >>> 24) & 0xff); + header[1+1] = (byte) ((bytesPerChecksum >>> 16) & 0xff); + header[1+2] = (byte) ((bytesPerChecksum >>> 8) & 0xff); + header[1+3] = (byte) (bytesPerChecksum & 0xff); + return header; + } + + /** + * Writes the current checksum to the stream. + * If reset is true, then resets the checksum. + * @return number of bytes written. Will be equal to getChecksumSize(); + */ + public int writeValue( DataOutputStream out, boolean reset ) + throws IOException { + if ( type.size <= 0 ) { + return 0; + } + + if ( type.size == 4 ) { + out.writeInt( (int) summer.getValue() ); + } else { + throw new IOException( "Unknown Checksum " + type ); + } + + if ( reset ) { + reset(); + } + + return type.size; + } + + /** + * Writes the current checksum to a buffer. + * If reset is true, then resets the checksum. + * @return number of bytes written. Will be equal to getChecksumSize(); + */ + public int writeValue( byte[] buf, int offset, boolean reset ) + throws IOException { + if ( type.size <= 0 ) { + return 0; + } + + if ( type.size == 4 ) { + int checksum = (int) summer.getValue(); + buf[offset+0] = (byte) ((checksum >>> 24) & 0xff); + buf[offset+1] = (byte) ((checksum >>> 16) & 0xff); + buf[offset+2] = (byte) ((checksum >>> 8) & 0xff); + buf[offset+3] = (byte) (checksum & 0xff); + } else { + throw new IOException( "Unknown Checksum " + type ); + } + + if ( reset ) { + reset(); + } + + return type.size; + } + + /** + * Compares the checksum located at buf[offset] with the current checksum. + * @return true if the checksum matches and false otherwise. + */ + public boolean compare( byte buf[], int offset ) { + if ( type.size == 4 ) { + int checksum = ( (buf[offset+0] & 0xff) << 24 ) | + ( (buf[offset+1] & 0xff) << 16 ) | + ( (buf[offset+2] & 0xff) << 8 ) | + ( (buf[offset+3] & 0xff) ); + return checksum == (int) summer.getValue(); + } + return type.size == 0; + } + + private final Type type; + private final Checksum summer; + private final int bytesPerChecksum; + private int inSum = 0; + + private DataChecksum( Type type, Checksum checksum, int chunkSize ) { + this.type = type; + summer = checksum; + bytesPerChecksum = chunkSize; + } + + // Accessors + public Type getChecksumType() { + return type; + } + public int getChecksumSize() { + return type.size; + } + public int getBytesPerChecksum() { + return bytesPerChecksum; + } + public int getNumBytesInSum() { + return inSum; + } + + public static final int SIZE_OF_INTEGER = Integer.SIZE / Byte.SIZE; + static public int getChecksumHeaderSize() { + return 1 + SIZE_OF_INTEGER; // type byte, bytesPerChecksum int + } + //Checksum Interface. Just a wrapper around member summer. + @Override + public long getValue() { + return summer.getValue(); + } + @Override + public void reset() { + summer.reset(); + inSum = 0; + } + @Override + public void update( byte[] b, int off, int len ) { + if ( len > 0 ) { + summer.update( b, off, len ); + inSum += len; + } + } + @Override + public void update( int b ) { + summer.update( b ); + inSum += 1; + } + + /** + * Verify that the given checksums match the given data. + * + * The 'mark' of the ByteBuffer parameters may be modified by this function,. 
+ * but the position is maintained. + * + * @param data the DirectByteBuffer pointing to the data to verify. + * @param checksums the DirectByteBuffer pointing to a series of stored + * checksums + * @param fileName the name of the file being read, for error-reporting + * @param basePos the file position to which the start of 'data' corresponds + * @throws ChecksumException if the checksums do not match + */ + public void verifyChunkedSums(ByteBuffer data, ByteBuffer checksums, + String fileName, long basePos) + throws ChecksumException { + if (type.size == 0) return; + + if (data.hasArray() && checksums.hasArray()) { + verifyChunkedSums( + data.array(), data.arrayOffset() + data.position(), data.remaining(), + checksums.array(), checksums.arrayOffset() + checksums.position(), + fileName, basePos); + return; + } + if (NativeCrc32.isAvailable()) { + NativeCrc32.verifyChunkedSums(bytesPerChecksum, type.id, checksums, data, + fileName, basePos); + return; + } + + int startDataPos = data.position(); + data.mark(); + checksums.mark(); + try { + byte[] buf = new byte[bytesPerChecksum]; + byte[] sum = new byte[type.size]; + while (data.remaining() > 0) { + int n = Math.min(data.remaining(), bytesPerChecksum); + checksums.get(sum); + data.get(buf, 0, n); + summer.reset(); + summer.update(buf, 0, n); + int calculated = (int)summer.getValue(); + int stored = (sum[0] << 24 & 0xff000000) | + (sum[1] << 16 & 0xff0000) | + (sum[2] << 8 & 0xff00) | + sum[3] & 0xff; + if (calculated != stored) { + long errPos = basePos + data.position() - startDataPos - n; + throw new ChecksumException( + "Checksum error: "+ fileName + " at "+ errPos + + " exp: " + stored + " got: " + calculated, errPos); + } + } + } finally { + data.reset(); + checksums.reset(); + } + } + + /** + * Implementation of chunked verification specifically on byte arrays. This + * is to avoid the copy when dealing with ByteBuffers that have array backing. + */ + private void verifyChunkedSums( + byte[] data, int dataOff, int dataLen, + byte[] checksums, int checksumsOff, String fileName, + long basePos) throws ChecksumException { + + int remaining = dataLen; + int dataPos = 0; + while (remaining > 0) { + int n = Math.min(remaining, bytesPerChecksum); + + summer.reset(); + summer.update(data, dataOff + dataPos, n); + dataPos += n; + remaining -= n; + + int calculated = (int)summer.getValue(); + int stored = (checksums[checksumsOff] << 24 & 0xff000000) | + (checksums[checksumsOff + 1] << 16 & 0xff0000) | + (checksums[checksumsOff + 2] << 8 & 0xff00) | + checksums[checksumsOff + 3] & 0xff; + checksumsOff += 4; + if (calculated != stored) { + long errPos = basePos + dataPos - n; + throw new ChecksumException( + "Checksum error: "+ fileName + " at "+ errPos + + " exp: " + stored + " got: " + calculated, errPos); + } + } + } + + /** + * Calculate checksums for the given data. + * + * The 'mark' of the ByteBuffer parameters may be modified by this function, + * but the position is maintained. + * + * @param data the DirectByteBuffer pointing to the data to checksum. + * @param checksums the DirectByteBuffer into which checksums will be + * stored. Enough space must be available in this + * buffer to put the checksums. 
+ */ + public void calculateChunkedSums(ByteBuffer data, ByteBuffer checksums) { + if (type.size == 0) return; + + if (data.hasArray() && checksums.hasArray()) { + calculateChunkedSums(data.array(), data.arrayOffset() + data.position(), data.remaining(), + checksums.array(), checksums.arrayOffset() + checksums.position()); + return; + } + + data.mark(); + checksums.mark(); + try { + byte[] buf = new byte[bytesPerChecksum]; + while (data.remaining() > 0) { + int n = Math.min(data.remaining(), bytesPerChecksum); + data.get(buf, 0, n); + summer.reset(); + summer.update(buf, 0, n); + checksums.putInt((int)summer.getValue()); + } + } finally { + data.reset(); + checksums.reset(); + } + } + + /** + * Implementation of chunked calculation specifically on byte arrays. This + * is to avoid the copy when dealing with ByteBuffers that have array backing. + */ + private void calculateChunkedSums( + byte[] data, int dataOffset, int dataLength, + byte[] sums, int sumsOffset) { + + int remaining = dataLength; + while (remaining > 0) { + int n = Math.min(remaining, bytesPerChecksum); + summer.reset(); + summer.update(data, dataOffset, n); + dataOffset += n; + remaining -= n; + long calculated = summer.getValue(); + sums[sumsOffset++] = (byte) (calculated >> 24); + sums[sumsOffset++] = (byte) (calculated >> 16); + sums[sumsOffset++] = (byte) (calculated >> 8); + sums[sumsOffset++] = (byte) (calculated); + } + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof DataChecksum)) { + return false; + } + DataChecksum o = (DataChecksum)other; + return o.bytesPerChecksum == this.bytesPerChecksum && + o.type == this.type; + } + + @Override + public int hashCode() { + return (this.type.id + 31) * this.bytesPerChecksum; + } + + @Override + public String toString() { + return "DataChecksum(type=" + type + + ", chunkSize=" + bytesPerChecksum + ")"; + } + + /** + * This just provides a dummy implimentation for Checksum class + * This is used when there is no checksum available or required for + * data + */ + static class ChecksumNull implements Checksum { + + public ChecksumNull() {} + + //Dummy interface + @Override + public long getValue() { return 0; } + @Override + public void reset() {} + @Override + public void update(byte[] b, int off, int len) {} + @Override + public void update(int b) {} + }; +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 5e642124f6c..dbf4c9d425c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -48,6 +48,9 @@ Release 2.0.3-alpha - Unreleased HDFS-4041. Hadoop HDFS Maven protoc calls must not depend on external sh script. (Chris Nauroth via suresh) + HADOOP-8911. CRLF characters in source and text files. + (Raja Aluri via suresh) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/libhdfs.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/libhdfs.xml index 44ab6c9c2b1..2d4091b98b6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/libhdfs.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/libhdfs.xml @@ -1,110 +1,110 @@ - - - - - - -
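The DataChecksum header above is one type byte followed by a four-byte, big-endian bytes-per-checksum value, HEADER_LEN = 5 bytes in total, written the same way DataOutputStream.writeInt would write the length. A standalone sketch of that layout (the class below is illustrative only and is not part of the patched file):

import java.nio.ByteBuffer;

/** Illustrative sketch of the 5-byte checksum header: 1 type byte + 4-byte big-endian chunk size. */
public class ChecksumHeaderSketch {
  static final int HEADER_LEN = 5;

  /** Encode a checksum type id and bytesPerChecksum into a 5-byte header. */
  static byte[] encode(byte typeId, int bytesPerChecksum) {
    ByteBuffer buf = ByteBuffer.allocate(HEADER_LEN); // big-endian by default, like DataOutputStream.writeInt
    buf.put(typeId);
    buf.putInt(bytesPerChecksum);
    return buf.array();
  }

  /** Decode the header back into its two fields. */
  static int[] decode(byte[] header, int offset) {
    ByteBuffer buf = ByteBuffer.wrap(header, offset, HEADER_LEN);
    int typeId = buf.get() & 0xff;
    int bytesPerChecksum = buf.getInt();
    return new int[] { typeId, bytesPerChecksum };
  }

  public static void main(String[] args) {
    byte[] header = encode((byte) 1 /* CRC32 */, 512);
    int[] decoded = decode(header, 0);
    System.out.println("type=" + decoded[0] + ", bytesPerChecksum=" + decoded[1]); // type=1, bytesPerChecksum=512
  }
}

Note that, as shown in the diff, newDataChecksum(byte[], int) reads the type from bytes[0] while reading the length from bytes[offset+1] through bytes[offset+4], so callers of this version appear to be expected to pass offset 0.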

    -C API libhdfs -Content-Type -text/html; -utf-8 -
    - -
    -Overview - -

    -libhdfs is a JNI based C API for Hadoop's Distributed File System (HDFS). -It provides C APIs to a subset of the HDFS APIs to manipulate HDFS files and -the filesystem. libhdfs is part of the Hadoop distribution and comes -pre-compiled in ${HADOOP_PREFIX}/libhdfs/libhdfs.so . -

    - -
    -
    -The APIs - -

    -The libhdfs APIs are a subset of: hadoop fs APIs. -

    -

    -The header file for libhdfs describes each API in detail and is available in ${HADOOP_PREFIX}/src/c++/libhdfs/hdfs.h -

    -
    -
    -A Sample Program - - -#include "hdfs.h" - -int main(int argc, char **argv) { - - hdfsFS fs = hdfsConnect("default", 0); - const char* writePath = "/tmp/testfile.txt"; - hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0); - if(!writeFile) { - fprintf(stderr, "Failed to open %s for writing!\n", writePath); - exit(-1); - } - char* buffer = "Hello, World!"; - tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1); - if (hdfsFlush(fs, writeFile)) { - fprintf(stderr, "Failed to 'flush' %s\n", writePath); - exit(-1); - } - hdfsCloseFile(fs, writeFile); -} - -
    - -
    -How To Link With The Library -

    -See the Makefile for hdfs_test.c in the libhdfs source directory (${HADOOP_PREFIX}/src/c++/libhdfs/Makefile) or something like:
    -gcc above_sample.c -I${HADOOP_PREFIX}/src/c++/libhdfs -L${HADOOP_PREFIX}/libhdfs -lhdfs -o above_sample -

    -
    -
    -Common Problems -

    -The most common problem is the CLASSPATH is not set properly when calling a program that uses libhdfs. -Make sure you set it to all the Hadoop jars needed to run Hadoop itself. Currently, there is no way to -programmatically generate the classpath, but a good bet is to include all the jar files in ${HADOOP_PREFIX} -and ${HADOOP_PREFIX}/lib as well as the right configuration directory containing hdfs-site.xml -

    -
    -
    -Thread Safe -

    libdhfs is thread safe.

    -
      -
    • Concurrency and Hadoop FS "handles" -
      The Hadoop FS implementation includes a FS handle cache which caches based on the URI of the -namenode along with the user connecting. So, all calls to hdfsConnect will return the same handle but -calls to hdfsConnectAsUser with different users will return different handles. But, since HDFS client -handles are completely thread safe, this has no bearing on concurrency. -
    • -
    • Concurrency and libhdfs/JNI -
      The libhdfs calls to JNI should always be creating thread local storage, so (in theory), libhdfs -should be as thread safe as the underlying calls to the Hadoop FS. -
    • -
    -
    - - + + + + + + +
    +C API libhdfs +Content-Type +text/html; +utf-8 +
    + +
    +Overview + +

    +libhdfs is a JNI based C API for Hadoop's Distributed File System (HDFS). +It provides C APIs to a subset of the HDFS APIs to manipulate HDFS files and +the filesystem. libhdfs is part of the Hadoop distribution and comes +pre-compiled in ${HADOOP_PREFIX}/libhdfs/libhdfs.so . +

    + +
    +
    +The APIs + +

    +The libhdfs APIs are a subset of: hadoop fs APIs. +

    +

    +The header file for libhdfs describes each API in detail and is available in ${HADOOP_PREFIX}/src/c++/libhdfs/hdfs.h +

    +
    +
    +A Sample Program + + +#include "hdfs.h" + +int main(int argc, char **argv) { + + hdfsFS fs = hdfsConnect("default", 0); + const char* writePath = "/tmp/testfile.txt"; + hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0); + if(!writeFile) { + fprintf(stderr, "Failed to open %s for writing!\n", writePath); + exit(-1); + } + char* buffer = "Hello, World!"; + tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1); + if (hdfsFlush(fs, writeFile)) { + fprintf(stderr, "Failed to 'flush' %s\n", writePath); + exit(-1); + } + hdfsCloseFile(fs, writeFile); +} + +
    + +
    +How To Link With The Library +

    +See the Makefile for hdfs_test.c in the libhdfs source directory (${HADOOP_PREFIX}/src/c++/libhdfs/Makefile) or something like:
    +gcc above_sample.c -I${HADOOP_PREFIX}/src/c++/libhdfs -L${HADOOP_PREFIX}/libhdfs -lhdfs -o above_sample +

    +
    +
    +Common Problems +

    +The most common problem is that the CLASSPATH is not set properly when calling a program that uses libhdfs. +Make sure you set it to all the Hadoop jars needed to run Hadoop itself. Currently, there is no way to +programmatically generate the classpath, but a good bet is to include all the jar files in ${HADOOP_PREFIX} +and ${HADOOP_PREFIX}/lib as well as the right configuration directory containing hdfs-site.xml. +
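Since the classpath cannot be generated programmatically, callers typically glob the jar directories themselves. A rough, hypothetical sketch of that idea in Java (the directory layout and the use of the HADOOP_PREFIX environment variable are assumptions, not libhdfs API):

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/** Hypothetical helper that builds a CLASSPATH value from the Hadoop install directories. */
public class ClasspathSketch {
  public static void main(String[] args) {
    String prefix = System.getenv("HADOOP_PREFIX");  // assumed to point at the Hadoop install
    List<String> entries = new ArrayList<String>();
    entries.add(prefix + "/conf");                   // configuration directory with hdfs-site.xml (location is an assumption)
    for (String dir : new String[] { prefix, prefix + "/lib" }) {
      File[] files = new File(dir).listFiles();
      if (files == null) continue;
      for (File f : files) {
        if (f.getName().endsWith(".jar")) {
          entries.add(f.getAbsolutePath());          // add every jar under ${HADOOP_PREFIX} and ${HADOOP_PREFIX}/lib
        }
      }
    }
    // Join with ':' to form the value to export as CLASSPATH before launching the libhdfs-based program.
    StringBuilder cp = new StringBuilder();
    for (String e : entries) {
      if (cp.length() > 0) cp.append(':');
      cp.append(e);
    }
    System.out.println(cp);
  }
}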

    +
    +
    +Thread Safe +

    +libhdfs is thread safe.

    +
      +
    • Concurrency and Hadoop FS "handles" +
      The Hadoop FS implementation includes a FS handle cache which caches based on the URI of the +namenode along with the user connecting. So, all calls to hdfsConnect will return the same handle but +calls to hdfsConnectAsUser with different users will return different handles. But, since HDFS client +handles are completely thread safe, this has no bearing on concurrency. +
    • +
    • Concurrency and libhdfs/JNI +
      The libhdfs calls to JNI should always be creating thread local storage, so (in theory), libhdfs +should be as thread safe as the underlying calls to the Hadoop FS. +
    • +
    +
    + +
    diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 9f520ff3a45..34cabae644a 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -17,6 +17,9 @@ Release 2.0.3-alpha - Unreleased MAPREDUCE-4616. Improve javadoc for MultipleOutputs. (Tony Burton via acmurthy) + HADOOP-8911. CRLF characters in source and text files. + (Raja Aluri via suresh) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestClientProtocolProviderImpls.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestClientProtocolProviderImpls.java index f718e1f4998..74148aea710 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestClientProtocolProviderImpls.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestClientProtocolProviderImpls.java @@ -1,120 +1,120 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.mapreduce; - -import java.io.IOException; - -import junit.framework.TestCase; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapred.LocalJobRunner; -import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; -import org.junit.Test; - -public class TestClientProtocolProviderImpls extends TestCase { - - @Test - public void testClusterWithLocalClientProvider() throws Exception { - - Configuration conf = new Configuration(); - - try { - conf.set(MRConfig.FRAMEWORK_NAME, "incorrect"); - new Cluster(conf); - fail("Cluster should not be initialized with incorrect framework name"); - } catch (IOException e) { - - } - - try { - conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.LOCAL_FRAMEWORK_NAME); - conf.set(JTConfig.JT_IPC_ADDRESS, "127.0.0.1:0"); - - new Cluster(conf); - fail("Cluster with Local Framework name should use local JT address"); - } catch (IOException e) { - - } - - try { - conf.set(JTConfig.JT_IPC_ADDRESS, "local"); - Cluster cluster = new Cluster(conf); - assertTrue(cluster.getClient() instanceof LocalJobRunner); - cluster.close(); - } catch (IOException e) { - - } - } - - @Test - public void testClusterWithJTClientProvider() throws Exception { - - Configuration conf = new Configuration(); - try { - conf.set(MRConfig.FRAMEWORK_NAME, "incorrect"); - new Cluster(conf); - fail("Cluster should not be initialized with incorrect framework name"); - - } catch (IOException e) { - - } - - try { - conf.set(MRConfig.FRAMEWORK_NAME, "classic"); - conf.set(JTConfig.JT_IPC_ADDRESS, "local"); - new Cluster(conf); - fail("Cluster with classic Framework name shouldnot use local JT address"); - - } catch (IOException e) { - - } - - try { - conf = new Configuration(); - conf.set(MRConfig.FRAMEWORK_NAME, "classic"); - conf.set(JTConfig.JT_IPC_ADDRESS, "127.0.0.1:0"); - Cluster cluster = new Cluster(conf); - cluster.close(); - } catch (IOException e) { - - } - } - - @Test - public void testClusterException() { - - Configuration conf = new Configuration(); - conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME); - conf.set(JTConfig.JT_IPC_ADDRESS, "local"); - - // initializing a cluster with this conf should throw an error. - // However the exception thrown should not be specific to either - // the job tracker client provider or the local provider - boolean errorThrown = false; - try { - Cluster cluster = new Cluster(conf); - cluster.close(); - fail("Not expected - cluster init should have failed"); - } catch (IOException e) { - errorThrown = true; - assert(e.getMessage().contains("Cannot initialize Cluster. Please check")); - } - assert(errorThrown); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.mapreduce; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.LocalJobRunner; +import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; +import org.junit.Test; + +public class TestClientProtocolProviderImpls extends TestCase { + + @Test + public void testClusterWithLocalClientProvider() throws Exception { + + Configuration conf = new Configuration(); + + try { + conf.set(MRConfig.FRAMEWORK_NAME, "incorrect"); + new Cluster(conf); + fail("Cluster should not be initialized with incorrect framework name"); + } catch (IOException e) { + + } + + try { + conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.LOCAL_FRAMEWORK_NAME); + conf.set(JTConfig.JT_IPC_ADDRESS, "127.0.0.1:0"); + + new Cluster(conf); + fail("Cluster with Local Framework name should use local JT address"); + } catch (IOException e) { + + } + + try { + conf.set(JTConfig.JT_IPC_ADDRESS, "local"); + Cluster cluster = new Cluster(conf); + assertTrue(cluster.getClient() instanceof LocalJobRunner); + cluster.close(); + } catch (IOException e) { + + } + } + + @Test + public void testClusterWithJTClientProvider() throws Exception { + + Configuration conf = new Configuration(); + try { + conf.set(MRConfig.FRAMEWORK_NAME, "incorrect"); + new Cluster(conf); + fail("Cluster should not be initialized with incorrect framework name"); + + } catch (IOException e) { + + } + + try { + conf.set(MRConfig.FRAMEWORK_NAME, "classic"); + conf.set(JTConfig.JT_IPC_ADDRESS, "local"); + new Cluster(conf); + fail("Cluster with classic Framework name shouldnot use local JT address"); + + } catch (IOException e) { + + } + + try { + conf = new Configuration(); + conf.set(MRConfig.FRAMEWORK_NAME, "classic"); + conf.set(JTConfig.JT_IPC_ADDRESS, "127.0.0.1:0"); + Cluster cluster = new Cluster(conf); + cluster.close(); + } catch (IOException e) { + + } + } + + @Test + public void testClusterException() { + + Configuration conf = new Configuration(); + conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME); + conf.set(JTConfig.JT_IPC_ADDRESS, "local"); + + // initializing a cluster with this conf should throw an error. + // However the exception thrown should not be specific to either + // the job tracker client provider or the local provider + boolean errorThrown = false; + try { + Cluster cluster = new Cluster(conf); + cluster.close(); + fail("Not expected - cluster init should have failed"); + } catch (IOException e) { + errorThrown = true; + assert(e.getMessage().contains("Cannot initialize Cluster. Please check")); + } + assert(errorThrown); + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestYarnClientProtocolProvider.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestYarnClientProtocolProvider.java index 1bbffb8fde1..49c5dc88a64 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestYarnClientProtocolProvider.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestYarnClientProtocolProvider.java @@ -1,129 +1,129 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
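The two tests above reduce to a single rule: the ClientProtocol implementation behind a Cluster is selected by MRConfig.FRAMEWORK_NAME, together with the JobTracker address for the classic framework. A minimal sketch of the local case, using only configuration keys and types that appear in the test (illustrative only, not a new API):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig;

public class LocalClusterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // The "local" framework plus a "local" JT address selects the LocalJobRunner-backed provider.
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.LOCAL_FRAMEWORK_NAME);
    conf.set(JTConfig.JT_IPC_ADDRESS, "local");
    Cluster cluster = new Cluster(conf);  // would throw IOException if no provider accepted this configuration
    System.out.println("Cluster initialized with the local framework");
    cluster.close();
  }
}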
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.mapreduce; - -import static org.mockito.Matchers.any; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.io.IOException; -import java.nio.ByteBuffer; - -import junit.framework.TestCase; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.LocalJobRunner; -import org.apache.hadoop.mapred.ResourceMgrDelegate; -import org.apache.hadoop.mapred.YARNRunner; -import org.apache.hadoop.mapreduce.protocol.ClientProtocol; -import org.apache.hadoop.security.token.Token; -import org.apache.hadoop.yarn.api.ClientRMProtocol; -import org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenRequest; -import org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenResponse; -import org.apache.hadoop.yarn.api.records.DelegationToken; -import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.factories.RecordFactory; -import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; -import org.junit.Test; - -public class TestYarnClientProtocolProvider extends TestCase { - - private static final RecordFactory recordFactory = RecordFactoryProvider. 
- getRecordFactory(null); - - @Test - public void testClusterWithYarnClientProtocolProvider() throws Exception { - - Configuration conf = new Configuration(false); - Cluster cluster = null; - - try { - cluster = new Cluster(conf); - } catch (Exception e) { - throw new Exception( - "Failed to initialize a local runner w/o a cluster framework key", e); - } - - try { - assertTrue("client is not a LocalJobRunner", - cluster.getClient() instanceof LocalJobRunner); - } finally { - if (cluster != null) { - cluster.close(); - } - } - - try { - conf = new Configuration(); - conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME); - cluster = new Cluster(conf); - ClientProtocol client = cluster.getClient(); - assertTrue("client is a YARNRunner", client instanceof YARNRunner); - } catch (IOException e) { - - } finally { - if (cluster != null) { - cluster.close(); - } - } - } - - - @Test - public void testClusterGetDelegationToken() throws Exception { - - Configuration conf = new Configuration(false); - Cluster cluster = null; - try { - conf = new Configuration(); - conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME); - cluster = new Cluster(conf); - YARNRunner yrunner = (YARNRunner) cluster.getClient(); - GetDelegationTokenResponse getDTResponse = - recordFactory.newRecordInstance(GetDelegationTokenResponse.class); - DelegationToken rmDTToken = recordFactory.newRecordInstance( - DelegationToken.class); - rmDTToken.setIdentifier(ByteBuffer.wrap(new byte[2])); - rmDTToken.setKind("Testclusterkind"); - rmDTToken.setPassword(ByteBuffer.wrap("testcluster".getBytes())); - rmDTToken.setService("0.0.0.0:8032"); - getDTResponse.setRMDelegationToken(rmDTToken); - final ClientRMProtocol cRMProtocol = mock(ClientRMProtocol.class); - when(cRMProtocol.getDelegationToken(any( - GetDelegationTokenRequest.class))).thenReturn(getDTResponse); - ResourceMgrDelegate rmgrDelegate = new ResourceMgrDelegate( - new YarnConfiguration(conf)) { - @Override - public synchronized void start() { - this.rmClient = cRMProtocol; - } - }; - yrunner.setResourceMgrDelegate(rmgrDelegate); - Token t = cluster.getDelegationToken(new Text(" ")); - assertTrue("Token kind is instead " + t.getKind().toString(), - "Testclusterkind".equals(t.getKind().toString())); - } finally { - if (cluster != null) { - cluster.close(); - } - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.mapreduce; + +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.LocalJobRunner; +import org.apache.hadoop.mapred.ResourceMgrDelegate; +import org.apache.hadoop.mapred.YARNRunner; +import org.apache.hadoop.mapreduce.protocol.ClientProtocol; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.yarn.api.ClientRMProtocol; +import org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenRequest; +import org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenResponse; +import org.apache.hadoop.yarn.api.records.DelegationToken; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.factories.RecordFactory; +import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; +import org.junit.Test; + +public class TestYarnClientProtocolProvider extends TestCase { + + private static final RecordFactory recordFactory = RecordFactoryProvider. + getRecordFactory(null); + + @Test + public void testClusterWithYarnClientProtocolProvider() throws Exception { + + Configuration conf = new Configuration(false); + Cluster cluster = null; + + try { + cluster = new Cluster(conf); + } catch (Exception e) { + throw new Exception( + "Failed to initialize a local runner w/o a cluster framework key", e); + } + + try { + assertTrue("client is not a LocalJobRunner", + cluster.getClient() instanceof LocalJobRunner); + } finally { + if (cluster != null) { + cluster.close(); + } + } + + try { + conf = new Configuration(); + conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME); + cluster = new Cluster(conf); + ClientProtocol client = cluster.getClient(); + assertTrue("client is a YARNRunner", client instanceof YARNRunner); + } catch (IOException e) { + + } finally { + if (cluster != null) { + cluster.close(); + } + } + } + + + @Test + public void testClusterGetDelegationToken() throws Exception { + + Configuration conf = new Configuration(false); + Cluster cluster = null; + try { + conf = new Configuration(); + conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME); + cluster = new Cluster(conf); + YARNRunner yrunner = (YARNRunner) cluster.getClient(); + GetDelegationTokenResponse getDTResponse = + recordFactory.newRecordInstance(GetDelegationTokenResponse.class); + DelegationToken rmDTToken = recordFactory.newRecordInstance( + DelegationToken.class); + rmDTToken.setIdentifier(ByteBuffer.wrap(new byte[2])); + rmDTToken.setKind("Testclusterkind"); + rmDTToken.setPassword(ByteBuffer.wrap("testcluster".getBytes())); + rmDTToken.setService("0.0.0.0:8032"); + getDTResponse.setRMDelegationToken(rmDTToken); + final ClientRMProtocol cRMProtocol = mock(ClientRMProtocol.class); + when(cRMProtocol.getDelegationToken(any( + GetDelegationTokenRequest.class))).thenReturn(getDTResponse); + ResourceMgrDelegate rmgrDelegate = new ResourceMgrDelegate( + new YarnConfiguration(conf)) { + @Override + public synchronized void start() { + this.rmClient = cRMProtocol; + } + }; + yrunner.setResourceMgrDelegate(rmgrDelegate); + Token t = cluster.getDelegationToken(new Text(" ")); + assertTrue("Token kind is instead " + t.getKind().toString(), + "Testclusterkind".equals(t.getKind().toString())); + } finally { + if (cluster != null) { + cluster.close(); 
+ } + } + } + +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMean.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMean.java index bc2d658b231..b1f7a67a53c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMean.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMean.java @@ -1,196 +1,196 @@ -package org.apache.hadoop.examples; - -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.StringTokenizer; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - -public class WordMean extends Configured implements Tool { - - private double mean = 0; - - private final static Text COUNT = new Text("count"); - private final static Text LENGTH = new Text("length"); - private final static LongWritable ONE = new LongWritable(1); - - /** - * Maps words from line of text into 2 key-value pairs; one key-value pair for - * counting the word, another for counting its length. - */ - public static class WordMeanMapper extends - Mapper { - - private LongWritable wordLen = new LongWritable(); - - /** - * Emits 2 key-value pairs for counting the word and its length. Outputs are - * (Text, LongWritable). - * - * @param value - * This will be a line of text coming in from our input file. - */ - public void map(Object key, Text value, Context context) - throws IOException, InterruptedException { - StringTokenizer itr = new StringTokenizer(value.toString()); - while (itr.hasMoreTokens()) { - String string = itr.nextToken(); - this.wordLen.set(string.length()); - context.write(LENGTH, this.wordLen); - context.write(COUNT, ONE); - } - } - } - - /** - * Performs integer summation of all the values for each key. - */ - public static class WordMeanReducer extends - Reducer { - - private LongWritable sum = new LongWritable(); - - /** - * Sums all the individual values within the iterator and writes them to the - * same key. - * - * @param key - * This will be one of 2 constants: LENGTH_STR or COUNT_STR. 
- * @param values - * This will be an iterator of all the values associated with that - * key. - */ - public void reduce(Text key, Iterable values, Context context) - throws IOException, InterruptedException { - - int theSum = 0; - for (LongWritable val : values) { - theSum += val.get(); - } - sum.set(theSum); - context.write(key, sum); - } - } - - /** - * Reads the output file and parses the summation of lengths, and the word - * count, to perform a quick calculation of the mean. - * - * @param path - * The path to find the output file in. Set in main to the output - * directory. - * @throws IOException - * If it cannot access the output directory, we throw an exception. - */ - private double readAndCalcMean(Path path, Configuration conf) - throws IOException { - FileSystem fs = FileSystem.get(conf); - Path file = new Path(path, "part-r-00000"); - - if (!fs.exists(file)) - throw new IOException("Output not found!"); - - BufferedReader br = null; - - // average = total sum / number of elements; - try { - br = new BufferedReader(new InputStreamReader(fs.open(file))); - - long count = 0; - long length = 0; - - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - - // grab type - String type = st.nextToken(); - - // differentiate - if (type.equals(COUNT.toString())) { - String countLit = st.nextToken(); - count = Long.parseLong(countLit); - } else if (type.equals(LENGTH.toString())) { - String lengthLit = st.nextToken(); - length = Long.parseLong(lengthLit); - } - } - - double theMean = (((double) length) / ((double) count)); - System.out.println("The mean is: " + theMean); - return theMean; - } finally { - br.close(); - } - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new WordMean(), args); - } - - @Override - public int run(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: wordmean "); - return 0; - } - - Configuration conf = getConf(); - - @SuppressWarnings("deprecation") - Job job = new Job(conf, "word mean"); - job.setJarByClass(WordMean.class); - job.setMapperClass(WordMeanMapper.class); - job.setCombinerClass(WordMeanReducer.class); - job.setReducerClass(WordMeanReducer.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(LongWritable.class); - FileInputFormat.addInputPath(job, new Path(args[0])); - Path outputpath = new Path(args[1]); - FileOutputFormat.setOutputPath(job, outputpath); - boolean result = job.waitForCompletion(true); - mean = readAndCalcMean(outputpath, conf); - - return (result ? 0 : 1); - } - - /** - * Only valuable after run() called. - * - * @return Returns the mean value. - */ - public double getMean() { - return mean; - } +package org.apache.hadoop.examples; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +public class WordMean extends Configured implements Tool { + + private double mean = 0; + + private final static Text COUNT = new Text("count"); + private final static Text LENGTH = new Text("length"); + private final static LongWritable ONE = new LongWritable(1); + + /** + * Maps words from line of text into 2 key-value pairs; one key-value pair for + * counting the word, another for counting its length. + */ + public static class WordMeanMapper extends + Mapper { + + private LongWritable wordLen = new LongWritable(); + + /** + * Emits 2 key-value pairs for counting the word and its length. Outputs are + * (Text, LongWritable). + * + * @param value + * This will be a line of text coming in from our input file. + */ + public void map(Object key, Text value, Context context) + throws IOException, InterruptedException { + StringTokenizer itr = new StringTokenizer(value.toString()); + while (itr.hasMoreTokens()) { + String string = itr.nextToken(); + this.wordLen.set(string.length()); + context.write(LENGTH, this.wordLen); + context.write(COUNT, ONE); + } + } + } + + /** + * Performs integer summation of all the values for each key. + */ + public static class WordMeanReducer extends + Reducer { + + private LongWritable sum = new LongWritable(); + + /** + * Sums all the individual values within the iterator and writes them to the + * same key. + * + * @param key + * This will be one of 2 constants: LENGTH_STR or COUNT_STR. + * @param values + * This will be an iterator of all the values associated with that + * key. + */ + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { + + int theSum = 0; + for (LongWritable val : values) { + theSum += val.get(); + } + sum.set(theSum); + context.write(key, sum); + } + } + + /** + * Reads the output file and parses the summation of lengths, and the word + * count, to perform a quick calculation of the mean. + * + * @param path + * The path to find the output file in. Set in main to the output + * directory. + * @throws IOException + * If it cannot access the output directory, we throw an exception. 
+ */ + private double readAndCalcMean(Path path, Configuration conf) + throws IOException { + FileSystem fs = FileSystem.get(conf); + Path file = new Path(path, "part-r-00000"); + + if (!fs.exists(file)) + throw new IOException("Output not found!"); + + BufferedReader br = null; + + // average = total sum / number of elements; + try { + br = new BufferedReader(new InputStreamReader(fs.open(file))); + + long count = 0; + long length = 0; + + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + + // grab type + String type = st.nextToken(); + + // differentiate + if (type.equals(COUNT.toString())) { + String countLit = st.nextToken(); + count = Long.parseLong(countLit); + } else if (type.equals(LENGTH.toString())) { + String lengthLit = st.nextToken(); + length = Long.parseLong(lengthLit); + } + } + + double theMean = (((double) length) / ((double) count)); + System.out.println("The mean is: " + theMean); + return theMean; + } finally { + br.close(); + } + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new WordMean(), args); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length != 2) { + System.err.println("Usage: wordmean "); + return 0; + } + + Configuration conf = getConf(); + + @SuppressWarnings("deprecation") + Job job = new Job(conf, "word mean"); + job.setJarByClass(WordMean.class); + job.setMapperClass(WordMeanMapper.class); + job.setCombinerClass(WordMeanReducer.class); + job.setReducerClass(WordMeanReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(LongWritable.class); + FileInputFormat.addInputPath(job, new Path(args[0])); + Path outputpath = new Path(args[1]); + FileOutputFormat.setOutputPath(job, outputpath); + boolean result = job.waitForCompletion(true); + mean = readAndCalcMean(outputpath, conf); + + return (result ? 0 : 1); + } + + /** + * Only valuable after run() called. + * + * @return Returns the mean value. + */ + public double getMean() { + return mean; + } } \ No newline at end of file diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMedian.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMedian.java index 406d19ed4f8..d22208680f4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMedian.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordMedian.java @@ -1,208 +1,208 @@ -package org.apache.hadoop.examples; - -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
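readAndCalcMean above only needs the two aggregate lines written by the reducer. For example (the numbers are made up), if part-r-00000 contains "count 1000" and "length 5230", the mean word length is 5230 / 1000 = 5.23. The same parsing, reduced to a standalone sketch:

import java.util.StringTokenizer;

/** Standalone illustration of the mean calculation in WordMean#readAndCalcMean (sample values are made up). */
public class WordMeanSketch {
  public static void main(String[] args) {
    String[] reducerOutput = { "count\t1000", "length\t5230" };  // what part-r-00000 might contain
    long count = 0, length = 0;
    for (String line : reducerOutput) {
      StringTokenizer st = new StringTokenizer(line);
      String type = st.nextToken();
      long value = Long.parseLong(st.nextToken());
      if (type.equals("count")) {
        count = value;
      } else if (type.equals("length")) {
        length = value;
      }
    }
    System.out.println("The mean is: " + ((double) length / count));  // 5.23
  }
}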
- */ - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.StringTokenizer; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.TaskCounter; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - -public class WordMedian extends Configured implements Tool { - - private double median = 0; - private final static IntWritable ONE = new IntWritable(1); - - /** - * Maps words from line of text into a key-value pair; the length of the word - * as the key, and 1 as the value. - */ - public static class WordMedianMapper extends - Mapper { - - private IntWritable length = new IntWritable(); - - /** - * Emits a key-value pair for counting the word. Outputs are (IntWritable, - * IntWritable). - * - * @param value - * This will be a line of text coming in from our input file. - */ - public void map(Object key, Text value, Context context) - throws IOException, InterruptedException { - StringTokenizer itr = new StringTokenizer(value.toString()); - while (itr.hasMoreTokens()) { - String string = itr.nextToken(); - length.set(string.length()); - context.write(length, ONE); - } - } - } - - /** - * Performs integer summation of all the values for each key. - */ - public static class WordMedianReducer extends - Reducer { - - private IntWritable val = new IntWritable(); - - /** - * Sums all the individual values within the iterator and writes them to the - * same key. - * - * @param key - * This will be a length of a word that was read. - * @param values - * This will be an iterator of all the values associated with that - * key. - */ - public void reduce(IntWritable key, Iterable values, - Context context) throws IOException, InterruptedException { - - int sum = 0; - for (IntWritable value : values) { - sum += value.get(); - } - val.set(sum); - context.write(key, val); - } - } - - /** - * This is a standard program to read and find a median value based on a file - * of word counts such as: 1 456, 2 132, 3 56... Where the first values are - * the word lengths and the following values are the number of times that - * words of that length appear. - * - * @param path - * The path to read the HDFS file from (part-r-00000...00001...etc). - * @param medianIndex1 - * The first length value to look for. - * @param medianIndex2 - * The second length value to look for (will be the same as the first - * if there are an even number of words total). - * @throws IOException - * If file cannot be found, we throw an exception. 
- * */ - private double readAndFindMedian(String path, int medianIndex1, - int medianIndex2, Configuration conf) throws IOException { - FileSystem fs = FileSystem.get(conf); - Path file = new Path(path, "part-r-00000"); - - if (!fs.exists(file)) - throw new IOException("Output not found!"); - - BufferedReader br = null; - - try { - br = new BufferedReader(new InputStreamReader(fs.open(file))); - int num = 0; - - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - - // grab length - String currLen = st.nextToken(); - - // grab count - String lengthFreq = st.nextToken(); - - int prevNum = num; - num += Integer.parseInt(lengthFreq); - - if (medianIndex2 >= prevNum && medianIndex1 <= num) { - System.out.println("The median is: " + currLen); - br.close(); - return Double.parseDouble(currLen); - } else if (medianIndex2 >= prevNum && medianIndex1 < num) { - String nextCurrLen = st.nextToken(); - double theMedian = (Integer.parseInt(currLen) + Integer - .parseInt(nextCurrLen)) / 2.0; - System.out.println("The median is: " + theMedian); - br.close(); - return theMedian; - } - } - } finally { - br.close(); - } - // error, no median found - return -1; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new WordMedian(), args); - } - - @Override - public int run(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: wordmedian "); - return 0; - } - - setConf(new Configuration()); - Configuration conf = getConf(); - - @SuppressWarnings("deprecation") - Job job = new Job(conf, "word median"); - job.setJarByClass(WordMedian.class); - job.setMapperClass(WordMedianMapper.class); - job.setCombinerClass(WordMedianReducer.class); - job.setReducerClass(WordMedianReducer.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(IntWritable.class); - FileInputFormat.addInputPath(job, new Path(args[0])); - FileOutputFormat.setOutputPath(job, new Path(args[1])); - boolean result = job.waitForCompletion(true); - - // Wait for JOB 1 -- get middle value to check for Median - - long totalWords = job.getCounters() - .getGroup(TaskCounter.class.getCanonicalName()) - .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue(); - int medianIndex1 = (int) Math.ceil((totalWords / 2.0)); - int medianIndex2 = (int) Math.floor((totalWords / 2.0)); - - median = readAndFindMedian(args[1], medianIndex1, medianIndex2, conf); - - return (result ? 0 : 1); - } - - public double getMedian() { - return median; - } -} +package org.apache.hadoop.examples; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.TaskCounter; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +public class WordMedian extends Configured implements Tool { + + private double median = 0; + private final static IntWritable ONE = new IntWritable(1); + + /** + * Maps words from line of text into a key-value pair; the length of the word + * as the key, and 1 as the value. + */ + public static class WordMedianMapper extends + Mapper { + + private IntWritable length = new IntWritable(); + + /** + * Emits a key-value pair for counting the word. Outputs are (IntWritable, + * IntWritable). + * + * @param value + * This will be a line of text coming in from our input file. + */ + public void map(Object key, Text value, Context context) + throws IOException, InterruptedException { + StringTokenizer itr = new StringTokenizer(value.toString()); + while (itr.hasMoreTokens()) { + String string = itr.nextToken(); + length.set(string.length()); + context.write(length, ONE); + } + } + } + + /** + * Performs integer summation of all the values for each key. + */ + public static class WordMedianReducer extends + Reducer { + + private IntWritable val = new IntWritable(); + + /** + * Sums all the individual values within the iterator and writes them to the + * same key. + * + * @param key + * This will be a length of a word that was read. + * @param values + * This will be an iterator of all the values associated with that + * key. + */ + public void reduce(IntWritable key, Iterable values, + Context context) throws IOException, InterruptedException { + + int sum = 0; + for (IntWritable value : values) { + sum += value.get(); + } + val.set(sum); + context.write(key, val); + } + } + + /** + * This is a standard program to read and find a median value based on a file + * of word counts such as: 1 456, 2 132, 3 56... Where the first values are + * the word lengths and the following values are the number of times that + * words of that length appear. + * + * @param path + * The path to read the HDFS file from (part-r-00000...00001...etc). + * @param medianIndex1 + * The first length value to look for. + * @param medianIndex2 + * The second length value to look for (will be the same as the first + * if there are an even number of words total). + * @throws IOException + * If file cannot be found, we throw an exception. 
+ * */ + private double readAndFindMedian(String path, int medianIndex1, + int medianIndex2, Configuration conf) throws IOException { + FileSystem fs = FileSystem.get(conf); + Path file = new Path(path, "part-r-00000"); + + if (!fs.exists(file)) + throw new IOException("Output not found!"); + + BufferedReader br = null; + + try { + br = new BufferedReader(new InputStreamReader(fs.open(file))); + int num = 0; + + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + + // grab length + String currLen = st.nextToken(); + + // grab count + String lengthFreq = st.nextToken(); + + int prevNum = num; + num += Integer.parseInt(lengthFreq); + + if (medianIndex2 >= prevNum && medianIndex1 <= num) { + System.out.println("The median is: " + currLen); + br.close(); + return Double.parseDouble(currLen); + } else if (medianIndex2 >= prevNum && medianIndex1 < num) { + String nextCurrLen = st.nextToken(); + double theMedian = (Integer.parseInt(currLen) + Integer + .parseInt(nextCurrLen)) / 2.0; + System.out.println("The median is: " + theMedian); + br.close(); + return theMedian; + } + } + } finally { + br.close(); + } + // error, no median found + return -1; + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new WordMedian(), args); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length != 2) { + System.err.println("Usage: wordmedian "); + return 0; + } + + setConf(new Configuration()); + Configuration conf = getConf(); + + @SuppressWarnings("deprecation") + Job job = new Job(conf, "word median"); + job.setJarByClass(WordMedian.class); + job.setMapperClass(WordMedianMapper.class); + job.setCombinerClass(WordMedianReducer.class); + job.setReducerClass(WordMedianReducer.class); + job.setOutputKeyClass(IntWritable.class); + job.setOutputValueClass(IntWritable.class); + FileInputFormat.addInputPath(job, new Path(args[0])); + FileOutputFormat.setOutputPath(job, new Path(args[1])); + boolean result = job.waitForCompletion(true); + + // Wait for JOB 1 -- get middle value to check for Median + + long totalWords = job.getCounters() + .getGroup(TaskCounter.class.getCanonicalName()) + .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue(); + int medianIndex1 = (int) Math.ceil((totalWords / 2.0)); + int medianIndex2 = (int) Math.floor((totalWords / 2.0)); + + median = readAndFindMedian(args[1], medianIndex1, medianIndex2, conf); + + return (result ? 0 : 1); + } + + public double getMedian() { + return median; + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java index 6a122df6527..d45935ebc03 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java @@ -1,210 +1,210 @@ -package org.apache.hadoop.examples; - -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
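The median logic above walks the length-to-frequency histogram produced by the reducer until the running total covers the middle position. A standalone sketch of the same idea, omitting the even-count averaging branch the full version handles (the histogram values are made up):

/** Standalone sketch of finding a median word length from a length -> frequency histogram (values are made up). */
public class WordMedianSketch {
  public static void main(String[] args) {
    int[] lengths = { 1, 2, 3, 4, 5 };
    long[] freqs  = { 4, 10, 20, 8, 3 };  // 45 words in total
    long totalWords = 0;
    for (long f : freqs) totalWords += f;

    long medianIndex1 = (long) Math.ceil(totalWords / 2.0);   // 23
    long medianIndex2 = (long) Math.floor(totalWords / 2.0);  // 22

    long num = 0;
    for (int i = 0; i < lengths.length; i++) {
      long prevNum = num;
      num += freqs[i];
      // The middle positions fall inside this bucket, so its length is the median.
      if (medianIndex2 >= prevNum && medianIndex1 <= num) {
        System.out.println("The median is: " + lengths[i]);   // 3 for this histogram
        return;
      }
    }
  }
}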
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.StringTokenizer; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - -public class WordStandardDeviation extends Configured implements Tool { - - private double stddev = 0; - - private final static Text LENGTH = new Text("length"); - private final static Text SQUARE = new Text("square"); - private final static Text COUNT = new Text("count"); - private final static LongWritable ONE = new LongWritable(1); - - /** - * Maps words from line of text into 3 key-value pairs; one key-value pair for - * counting the word, one for counting its length, and one for counting the - * square of its length. - */ - public static class WordStandardDeviationMapper extends - Mapper { - - private LongWritable wordLen = new LongWritable(); - private LongWritable wordLenSq = new LongWritable(); - - /** - * Emits 3 key-value pairs for counting the word, its length, and the - * squares of its length. Outputs are (Text, LongWritable). - * - * @param value - * This will be a line of text coming in from our input file. - */ - public void map(Object key, Text value, Context context) - throws IOException, InterruptedException { - StringTokenizer itr = new StringTokenizer(value.toString()); - while (itr.hasMoreTokens()) { - String string = itr.nextToken(); - - this.wordLen.set(string.length()); - - // the square of an integer is an integer... - this.wordLenSq.set((long) Math.pow(string.length(), 2.0)); - - context.write(LENGTH, this.wordLen); - context.write(SQUARE, this.wordLenSq); - context.write(COUNT, ONE); - } - } - } - - /** - * Performs integer summation of all the values for each key. - */ - public static class WordStandardDeviationReducer extends - Reducer { - - private LongWritable val = new LongWritable(); - - /** - * Sums all the individual values within the iterator and writes them to the - * same key. - * - * @param key - * This will be one of 2 constants: LENGTH_STR, COUNT_STR, or - * SQUARE_STR. - * @param values - * This will be an iterator of all the values associated with that - * key. 
- */ - public void reduce(Text key, Iterable values, Context context) - throws IOException, InterruptedException { - - int sum = 0; - for (LongWritable value : values) { - sum += value.get(); - } - val.set(sum); - context.write(key, val); - } - } - - /** - * Reads the output file and parses the summation of lengths, the word count, - * and the lengths squared, to perform a quick calculation of the standard - * deviation. - * - * @param path - * The path to find the output file in. Set in main to the output - * directory. - * @throws IOException - * If it cannot access the output directory, we throw an exception. - */ - private double readAndCalcStdDev(Path path, Configuration conf) - throws IOException { - FileSystem fs = FileSystem.get(conf); - Path file = new Path(path, "part-r-00000"); - - if (!fs.exists(file)) - throw new IOException("Output not found!"); - - double stddev = 0; - BufferedReader br = null; - try { - br = new BufferedReader(new InputStreamReader(fs.open(file))); - long count = 0; - long length = 0; - long square = 0; - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - - // grab type - String type = st.nextToken(); - - // differentiate - if (type.equals(COUNT.toString())) { - String countLit = st.nextToken(); - count = Long.parseLong(countLit); - } else if (type.equals(LENGTH.toString())) { - String lengthLit = st.nextToken(); - length = Long.parseLong(lengthLit); - } else if (type.equals(SQUARE.toString())) { - String squareLit = st.nextToken(); - square = Long.parseLong(squareLit); - } - } - // average = total sum / number of elements; - double mean = (((double) length) / ((double) count)); - // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2)) - mean = Math.pow(mean, 2.0); - double term = (((double) square / ((double) count))); - stddev = Math.sqrt((term - mean)); - System.out.println("The standard deviation is: " + stddev); - } finally { - br.close(); - } - return stddev; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new WordStandardDeviation(), - args); - } - - @Override - public int run(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: wordstddev "); - return 0; - } - - Configuration conf = getConf(); - - @SuppressWarnings("deprecation") - Job job = new Job(conf, "word stddev"); - job.setJarByClass(WordStandardDeviation.class); - job.setMapperClass(WordStandardDeviationMapper.class); - job.setCombinerClass(WordStandardDeviationReducer.class); - job.setReducerClass(WordStandardDeviationReducer.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(LongWritable.class); - FileInputFormat.addInputPath(job, new Path(args[0])); - Path outputpath = new Path(args[1]); - FileOutputFormat.setOutputPath(job, outputpath); - boolean result = job.waitForCompletion(true); - - // read output and calculate standard deviation - stddev = readAndCalcStdDev(outputpath, conf); - - return (result ? 0 : 1); - } - - public double getStandardDeviation() { - return stddev; - } +package org.apache.hadoop.examples; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +public class WordStandardDeviation extends Configured implements Tool { + + private double stddev = 0; + + private final static Text LENGTH = new Text("length"); + private final static Text SQUARE = new Text("square"); + private final static Text COUNT = new Text("count"); + private final static LongWritable ONE = new LongWritable(1); + + /** + * Maps words from line of text into 3 key-value pairs; one key-value pair for + * counting the word, one for counting its length, and one for counting the + * square of its length. + */ + public static class WordStandardDeviationMapper extends + Mapper { + + private LongWritable wordLen = new LongWritable(); + private LongWritable wordLenSq = new LongWritable(); + + /** + * Emits 3 key-value pairs for counting the word, its length, and the + * squares of its length. Outputs are (Text, LongWritable). + * + * @param value + * This will be a line of text coming in from our input file. + */ + public void map(Object key, Text value, Context context) + throws IOException, InterruptedException { + StringTokenizer itr = new StringTokenizer(value.toString()); + while (itr.hasMoreTokens()) { + String string = itr.nextToken(); + + this.wordLen.set(string.length()); + + // the square of an integer is an integer... + this.wordLenSq.set((long) Math.pow(string.length(), 2.0)); + + context.write(LENGTH, this.wordLen); + context.write(SQUARE, this.wordLenSq); + context.write(COUNT, ONE); + } + } + } + + /** + * Performs integer summation of all the values for each key. + */ + public static class WordStandardDeviationReducer extends + Reducer { + + private LongWritable val = new LongWritable(); + + /** + * Sums all the individual values within the iterator and writes them to the + * same key. + * + * @param key + * This will be one of 2 constants: LENGTH_STR, COUNT_STR, or + * SQUARE_STR. + * @param values + * This will be an iterator of all the values associated with that + * key. + */ + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { + + int sum = 0; + for (LongWritable value : values) { + sum += value.get(); + } + val.set(sum); + context.write(key, val); + } + } + + /** + * Reads the output file and parses the summation of lengths, the word count, + * and the lengths squared, to perform a quick calculation of the standard + * deviation. 
+ * + * @param path + * The path to find the output file in. Set in main to the output + * directory. + * @throws IOException + * If it cannot access the output directory, we throw an exception. + */ + private double readAndCalcStdDev(Path path, Configuration conf) + throws IOException { + FileSystem fs = FileSystem.get(conf); + Path file = new Path(path, "part-r-00000"); + + if (!fs.exists(file)) + throw new IOException("Output not found!"); + + double stddev = 0; + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(fs.open(file))); + long count = 0; + long length = 0; + long square = 0; + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + + // grab type + String type = st.nextToken(); + + // differentiate + if (type.equals(COUNT.toString())) { + String countLit = st.nextToken(); + count = Long.parseLong(countLit); + } else if (type.equals(LENGTH.toString())) { + String lengthLit = st.nextToken(); + length = Long.parseLong(lengthLit); + } else if (type.equals(SQUARE.toString())) { + String squareLit = st.nextToken(); + square = Long.parseLong(squareLit); + } + } + // average = total sum / number of elements; + double mean = (((double) length) / ((double) count)); + // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2)) + mean = Math.pow(mean, 2.0); + double term = (((double) square / ((double) count))); + stddev = Math.sqrt((term - mean)); + System.out.println("The standard deviation is: " + stddev); + } finally { + br.close(); + } + return stddev; + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new WordStandardDeviation(), + args); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length != 2) { + System.err.println("Usage: wordstddev "); + return 0; + } + + Configuration conf = getConf(); + + @SuppressWarnings("deprecation") + Job job = new Job(conf, "word stddev"); + job.setJarByClass(WordStandardDeviation.class); + job.setMapperClass(WordStandardDeviationMapper.class); + job.setCombinerClass(WordStandardDeviationReducer.class); + job.setReducerClass(WordStandardDeviationReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(LongWritable.class); + FileInputFormat.addInputPath(job, new Path(args[0])); + Path outputpath = new Path(args[1]); + FileOutputFormat.setOutputPath(job, outputpath); + boolean result = job.waitForCompletion(true); + + // read output and calculate standard deviation + stddev = readAndCalcStdDev(outputpath, conf); + + return (result ? 
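readAndCalcStdDev above recovers three aggregates from the reducer output, the word count, the sum of lengths, and the sum of squared lengths, and then applies the population identity stddev = sqrt(sum(len^2)/count - (sum(len)/count)^2), exactly as its inline comments state. A minimal standalone sketch of that arithmetic; the names and sample numbers are mine, not part of this patch:

/** Standalone sketch (not part of this patch): the population standard
 *  deviation identity used by readAndCalcStdDev above. */
public class StdDevFromSums {

  static double stddev(long count, long lengthSum, long lengthSquareSum) {
    double mean = (double) lengthSum / count;                 // E[len]
    double meanOfSquares = (double) lengthSquareSum / count;  // E[len^2]
    return Math.sqrt(meanOfSquares - mean * mean);            // sqrt(E[len^2] - E[len]^2)
  }

  public static void main(String[] args) {
    // words of length 2, 4 and 6: mean 4, population variance 8/3
    System.out.println(stddev(3, 2 + 4 + 6, 4 + 16 + 36)); // ~1.633
  }
}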
0 : 1); + } + + public double getStandardDeviation() { + return stddev; + } } \ No newline at end of file diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java index 3a2ec5ec112..54575165746 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java @@ -1,272 +1,272 @@ -package org.apache.hadoop.examples; - -import static org.junit.Assert.assertEquals; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.StringTokenizer; -import java.util.TreeMap; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.junit.Before; -import org.junit.Test; - -public class TestWordStats { - - private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math"; - private final static String MEAN_OUTPUT = "build/data/mean_output"; - private final static String MEDIAN_OUTPUT = "build/data/median_output"; - private final static String STDDEV_OUTPUT = "build/data/stddev_output"; - - /** - * Modified internal test class that is designed to read all the files in the - * input directory, and find the standard deviation between all of the word - * lengths. - */ - public static class WordStdDevReader { - private long wordsRead = 0; - private long wordLengthsRead = 0; - private long wordLengthsReadSquared = 0; - - public WordStdDevReader() { - } - - public double read(String path) throws IOException { - FileSystem fs = FileSystem.get(new Configuration()); - FileStatus[] files = fs.listStatus(new Path(path)); - - for (FileStatus fileStat : files) { - if (!fileStat.isFile()) - continue; - - BufferedReader br = null; - - try { - br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); - - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - String word; - while (st.hasMoreTokens()) { - word = st.nextToken(); - this.wordsRead++; - this.wordLengthsRead += word.length(); - this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0); - } - } - - } catch (IOException e) { - System.out.println("Output could not be read!"); - throw e; - } finally { - br.close(); - } - } - - double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); - mean = Math.pow(mean, 2.0); - double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead))); - double stddev = Math.sqrt((term - mean)); - return stddev; - } - - } - - /** - * Modified internal test class that is designed to read all the files in the - * input directory, and find the median length of all the words. 
- */ - public static class WordMedianReader { - private long wordsRead = 0; - private TreeMap map = new TreeMap(); - - public WordMedianReader() { - } - - public double read(String path) throws IOException { - FileSystem fs = FileSystem.get(new Configuration()); - FileStatus[] files = fs.listStatus(new Path(path)); - - int num = 0; - - for (FileStatus fileStat : files) { - if (!fileStat.isFile()) - continue; - - BufferedReader br = null; - - try { - br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); - - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - String word; - while (st.hasMoreTokens()) { - word = st.nextToken(); - this.wordsRead++; - if (this.map.get(word.length()) == null) { - this.map.put(word.length(), 1); - } else { - int count = this.map.get(word.length()); - this.map.put(word.length(), count + 1); - } - } - } - } catch (IOException e) { - System.out.println("Output could not be read!"); - throw e; - } finally { - br.close(); - } - } - - int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0)); - int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0)); - - for (Integer key : this.map.navigableKeySet()) { - int prevNum = num; - num += this.map.get(key); - - if (medianIndex2 >= prevNum && medianIndex1 <= num) { - return key; - } else if (medianIndex2 >= prevNum && medianIndex1 < num) { - Integer nextCurrLen = this.map.navigableKeySet().iterator().next(); - double median = (key + nextCurrLen) / 2.0; - return median; - } - } - return -1; - } - - } - - /** - * Modified internal test class that is designed to read all the files in the - * input directory, and find the mean length of all the words. - */ - public static class WordMeanReader { - private long wordsRead = 0; - private long wordLengthsRead = 0; - - public WordMeanReader() { - } - - public double read(String path) throws IOException { - FileSystem fs = FileSystem.get(new Configuration()); - FileStatus[] files = fs.listStatus(new Path(path)); - - for (FileStatus fileStat : files) { - if (!fileStat.isFile()) - continue; - - BufferedReader br = null; - - try { - br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); - - String line; - while ((line = br.readLine()) != null) { - StringTokenizer st = new StringTokenizer(line); - String word; - while (st.hasMoreTokens()) { - word = st.nextToken(); - this.wordsRead++; - this.wordLengthsRead += word.length(); - } - } - } catch (IOException e) { - System.out.println("Output could not be read!"); - throw e; - } finally { - br.close(); - } - } - - double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); - return mean; - } - - } - - /** - * Internal class designed to delete the output directory. Meant solely for - * use before and after the test is run; this is so next iterations of the - * test do not encounter a "file already exists" error. - * - * @param dir - * The directory to delete. - * @return Returns whether the deletion was successful or not. 
- */ - public static boolean deleteDir(File dir) { - if (dir.isDirectory()) { - String[] children = dir.list(); - for (int i = 0; i < children.length; i++) { - boolean success = deleteDir(new File(dir, children[i])); - if (!success) { - System.out.println("Could not delete directory after test!"); - return false; - } - } - } - - // The directory is now empty so delete it - return dir.delete(); - } - - @Before public void setup() throws Exception { - deleteDir(new File(MEAN_OUTPUT)); - deleteDir(new File(MEDIAN_OUTPUT)); - deleteDir(new File(STDDEV_OUTPUT)); - } - - @Test public void testGetTheMean() throws Exception { - String args[] = new String[2]; - args[0] = INPUT; - args[1] = MEAN_OUTPUT; - - WordMean wm = new WordMean(); - ToolRunner.run(new Configuration(), wm, args); - double mean = wm.getMean(); - - // outputs MUST match - WordMeanReader wr = new WordMeanReader(); - assertEquals(mean, wr.read(INPUT), 0.0); - } - - @Test public void testGetTheMedian() throws Exception { - String args[] = new String[2]; - args[0] = INPUT; - args[1] = MEDIAN_OUTPUT; - - WordMedian wm = new WordMedian(); - ToolRunner.run(new Configuration(), wm, args); - double median = wm.getMedian(); - - // outputs MUST match - WordMedianReader wr = new WordMedianReader(); - assertEquals(median, wr.read(INPUT), 0.0); - } - - @Test public void testGetTheStandardDeviation() throws Exception { - String args[] = new String[2]; - args[0] = INPUT; - args[1] = STDDEV_OUTPUT; - - WordStandardDeviation wsd = new WordStandardDeviation(); - ToolRunner.run(new Configuration(), wsd, args); - double stddev = wsd.getStandardDeviation(); - - // outputs MUST match - WordStdDevReader wr = new WordStdDevReader(); - assertEquals(stddev, wr.read(INPUT), 0.0); - } - -} +package org.apache.hadoop.examples; + +import static org.junit.Assert.assertEquals; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.StringTokenizer; +import java.util.TreeMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.ToolRunner; +import org.junit.Before; +import org.junit.Test; + +public class TestWordStats { + + private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math"; + private final static String MEAN_OUTPUT = "build/data/mean_output"; + private final static String MEDIAN_OUTPUT = "build/data/median_output"; + private final static String STDDEV_OUTPUT = "build/data/stddev_output"; + + /** + * Modified internal test class that is designed to read all the files in the + * input directory, and find the standard deviation between all of the word + * lengths. 
+ */ + public static class WordStdDevReader { + private long wordsRead = 0; + private long wordLengthsRead = 0; + private long wordLengthsReadSquared = 0; + + public WordStdDevReader() { + } + + public double read(String path) throws IOException { + FileSystem fs = FileSystem.get(new Configuration()); + FileStatus[] files = fs.listStatus(new Path(path)); + + for (FileStatus fileStat : files) { + if (!fileStat.isFile()) + continue; + + BufferedReader br = null; + + try { + br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); + + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + String word; + while (st.hasMoreTokens()) { + word = st.nextToken(); + this.wordsRead++; + this.wordLengthsRead += word.length(); + this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0); + } + } + + } catch (IOException e) { + System.out.println("Output could not be read!"); + throw e; + } finally { + br.close(); + } + } + + double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); + mean = Math.pow(mean, 2.0); + double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead))); + double stddev = Math.sqrt((term - mean)); + return stddev; + } + + } + + /** + * Modified internal test class that is designed to read all the files in the + * input directory, and find the median length of all the words. + */ + public static class WordMedianReader { + private long wordsRead = 0; + private TreeMap map = new TreeMap(); + + public WordMedianReader() { + } + + public double read(String path) throws IOException { + FileSystem fs = FileSystem.get(new Configuration()); + FileStatus[] files = fs.listStatus(new Path(path)); + + int num = 0; + + for (FileStatus fileStat : files) { + if (!fileStat.isFile()) + continue; + + BufferedReader br = null; + + try { + br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); + + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + String word; + while (st.hasMoreTokens()) { + word = st.nextToken(); + this.wordsRead++; + if (this.map.get(word.length()) == null) { + this.map.put(word.length(), 1); + } else { + int count = this.map.get(word.length()); + this.map.put(word.length(), count + 1); + } + } + } + } catch (IOException e) { + System.out.println("Output could not be read!"); + throw e; + } finally { + br.close(); + } + } + + int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0)); + int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0)); + + for (Integer key : this.map.navigableKeySet()) { + int prevNum = num; + num += this.map.get(key); + + if (medianIndex2 >= prevNum && medianIndex1 <= num) { + return key; + } else if (medianIndex2 >= prevNum && medianIndex1 < num) { + Integer nextCurrLen = this.map.navigableKeySet().iterator().next(); + double median = (key + nextCurrLen) / 2.0; + return median; + } + } + return -1; + } + + } + + /** + * Modified internal test class that is designed to read all the files in the + * input directory, and find the mean length of all the words. 
+ */ + public static class WordMeanReader { + private long wordsRead = 0; + private long wordLengthsRead = 0; + + public WordMeanReader() { + } + + public double read(String path) throws IOException { + FileSystem fs = FileSystem.get(new Configuration()); + FileStatus[] files = fs.listStatus(new Path(path)); + + for (FileStatus fileStat : files) { + if (!fileStat.isFile()) + continue; + + BufferedReader br = null; + + try { + br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath()))); + + String line; + while ((line = br.readLine()) != null) { + StringTokenizer st = new StringTokenizer(line); + String word; + while (st.hasMoreTokens()) { + word = st.nextToken(); + this.wordsRead++; + this.wordLengthsRead += word.length(); + } + } + } catch (IOException e) { + System.out.println("Output could not be read!"); + throw e; + } finally { + br.close(); + } + } + + double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead)); + return mean; + } + + } + + /** + * Internal class designed to delete the output directory. Meant solely for + * use before and after the test is run; this is so next iterations of the + * test do not encounter a "file already exists" error. + * + * @param dir + * The directory to delete. + * @return Returns whether the deletion was successful or not. + */ + public static boolean deleteDir(File dir) { + if (dir.isDirectory()) { + String[] children = dir.list(); + for (int i = 0; i < children.length; i++) { + boolean success = deleteDir(new File(dir, children[i])); + if (!success) { + System.out.println("Could not delete directory after test!"); + return false; + } + } + } + + // The directory is now empty so delete it + return dir.delete(); + } + + @Before public void setup() throws Exception { + deleteDir(new File(MEAN_OUTPUT)); + deleteDir(new File(MEDIAN_OUTPUT)); + deleteDir(new File(STDDEV_OUTPUT)); + } + + @Test public void testGetTheMean() throws Exception { + String args[] = new String[2]; + args[0] = INPUT; + args[1] = MEAN_OUTPUT; + + WordMean wm = new WordMean(); + ToolRunner.run(new Configuration(), wm, args); + double mean = wm.getMean(); + + // outputs MUST match + WordMeanReader wr = new WordMeanReader(); + assertEquals(mean, wr.read(INPUT), 0.0); + } + + @Test public void testGetTheMedian() throws Exception { + String args[] = new String[2]; + args[0] = INPUT; + args[1] = MEDIAN_OUTPUT; + + WordMedian wm = new WordMedian(); + ToolRunner.run(new Configuration(), wm, args); + double median = wm.getMedian(); + + // outputs MUST match + WordMedianReader wr = new WordMedianReader(); + assertEquals(median, wr.read(INPUT), 0.0); + } + + @Test public void testGetTheStandardDeviation() throws Exception { + String args[] = new String[2]; + args[0] = INPUT; + args[1] = STDDEV_OUTPUT; + + WordStandardDeviation wsd = new WordStandardDeviation(); + ToolRunner.run(new Configuration(), wsd, args); + double stddev = wsd.getStandardDeviation(); + + // outputs MUST match + WordStdDevReader wr = new WordStdDevReader(); + assertEquals(stddev, wr.read(INPUT), 0.0); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/sample/data.txt b/hadoop-mapreduce-project/src/contrib/index/sample/data.txt index 92b47ebc484..4bc9a8f9d77 100755 --- a/hadoop-mapreduce-project/src/contrib/index/sample/data.txt +++ b/hadoop-mapreduce-project/src/contrib/index/sample/data.txt @@ -1,10 +1,10 @@ -0 ins apache dot org -1 ins apache -2 ins apache -3 ins apache -4 ins apache -5 ins apache -6 ins apache -7 ins apache -8 ins apache -9 ins apache +0 ins apache dot 
org +1 ins apache +2 ins apache +3 ins apache +4 ins apache +5 ins apache +6 ins apache +7 ins apache +8 ins apache +9 ins apache diff --git a/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt b/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt index 0b7feee6094..550a9967b21 100755 --- a/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt +++ b/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt @@ -1,10 +1,10 @@ -0 del -1 upd hadoop -2 del -3 upd hadoop -4 del -5 upd hadoop -6 del -7 upd hadoop -8 del -9 upd hadoop +0 del +1 upd hadoop +2 del +3 upd hadoop +4 del +5 upd hadoop +6 del +7 upd hadoop +8 del +9 upd hadoop diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java index 1cb13270ba4..f3e463a9aed 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java @@ -1,56 +1,56 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy; -import org.apache.hadoop.contrib.index.mapred.Shard; - -/** - * Choose a shard for each insert or delete based on document id hashing. Do - * NOT use this distribution policy when the number of shards changes. - */ -public class HashingDistributionPolicy implements IDistributionPolicy { - - private int numShards; - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[]) - */ - public void init(Shard[] shards) { - numShards = shards.length; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID) - */ - public int chooseShardForInsert(DocumentID key) { - int hashCode = key.hashCode(); - return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID) - */ - public int chooseShardForDelete(DocumentID key) { - int hashCode = key.hashCode(); - return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy; +import org.apache.hadoop.contrib.index.mapred.Shard; + +/** + * Choose a shard for each insert or delete based on document id hashing. Do + * NOT use this distribution policy when the number of shards changes. + */ +public class HashingDistributionPolicy implements IDistributionPolicy { + + private int numShards; + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[]) + */ + public void init(Shard[] shards) { + numShards = shards.length; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID) + */ + public int chooseShardForInsert(DocumentID key) { + int hashCode = key.hashCode(); + return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID) + */ + public int chooseShardForDelete(DocumentID key) { + int hashCode = key.hashCode(); + return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards; + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java index 3e215773261..79324b5b026 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java @@ -1,57 +1,57 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
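HashingDistributionPolicy above selects a shard as hashCode % numShards, flipping the sign first when the hash code is negative, which is also why its Javadoc warns against using it once the shard count changes. A compact standalone sketch of that selection; the Math.floorMod variant (Java 8+) is an aside of mine, not something this patch uses:

/** Standalone sketch (not part of this patch): shard selection from a key's
 *  hash code, in the style of HashingDistributionPolicy above. */
public class ShardChooser {

  static int chooseShard(Object key, int numShards) {
    int hashCode = key.hashCode();
    // Same folding as the policy above: negate negative hashes before the modulo.
    return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
  }

  // Alternative: floorMod always lands in [0, numShards), and also covers the
  // Integer.MIN_VALUE case, where -hashCode overflows and stays negative.
  static int chooseShardFloorMod(Object key, int numShards) {
    return Math.floorMod(key.hashCode(), numShards);
  }

  public static void main(String[] args) {
    System.out.println(chooseShard("doc-42", 5));
    System.out.println(chooseShardFloorMod("doc-42", 5));
  }
}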
- */ - -package org.apache.hadoop.contrib.index.example; - -import java.io.IOException; - -import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; - -/** - * Identity local analysis maps inputs directly into outputs. - */ -public class IdentityLocalAnalysis implements - ILocalAnalysis { - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) - */ - public void map(DocumentID key, DocumentAndOp value, - OutputCollector output, Reporter reporter) - throws IOException { - output.collect(key, value); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Closeable#close() - */ - public void close() throws IOException { - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import java.io.IOException; + +import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +/** + * Identity local analysis maps inputs directly into outputs. 
+ */ +public class IdentityLocalAnalysis implements + ILocalAnalysis { + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) + */ + public void map(DocumentID key, DocumentAndOp value, + OutputCollector output, Reporter reporter) + throws IOException { + output.collect(key, value); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Closeable#close() + */ + public void close() throws IOException { + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java index 1e501d58e07..a09064bb2a9 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java @@ -1,46 +1,46 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import java.io.IOException; - -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; - -/** - * An InputFormat for LineDoc for plain text files where each line is a doc. - */ -public class LineDocInputFormat extends - FileInputFormat { - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.FileInputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit, org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter) - */ - public RecordReader getRecordReader( - InputSplit split, JobConf job, Reporter reporter) throws IOException { - reporter.setStatus(split.toString()); - return new LineDocRecordReader(job, (FileSplit) split); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import java.io.IOException; + +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +/** + * An InputFormat for LineDoc for plain text files where each line is a doc. + */ +public class LineDocInputFormat extends + FileInputFormat { + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.FileInputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit, org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter) + */ + public RecordReader getRecordReader( + InputSplit split, JobConf job, Reporter reporter) throws IOException { + reporter.setStatus(split.toString()); + return new LineDocRecordReader(job, (FileSplit) split); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java index 4fdc226d39a..c5a5a5a6d54 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java @@ -1,80 +1,80 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import java.io.IOException; - -import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.Term; - -/** - * Convert LineDocTextAndOp to DocumentAndOp as required by ILocalAnalysis. 
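LineDocInputFormat above only overrides getRecordReader, handing each FileSplit to a LineDocRecordReader, so every line of a plain text input surfaces as one (DocumentID, LineDocTextAndOp) record. A hypothetical sketch of wiring it into a job through the old org.apache.hadoop.mapred API that this contrib module targets; the job name and the wrapper class are mine, not taken from this patch:

import org.apache.hadoop.contrib.index.example.LineDocInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

/** Hypothetical job wiring (not part of this patch). */
public class LineDocJobSetup {

  public static JobConf configure(Path input) {
    JobConf conf = new JobConf(LineDocJobSetup.class);
    conf.setJobName("line-doc example");
    // Each input line becomes one (DocumentID, LineDocTextAndOp) record.
    conf.setInputFormat(LineDocInputFormat.class);
    FileInputFormat.setInputPaths(conf, input);
    return conf;
  }
}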
- */ -public class LineDocLocalAnalysis implements - ILocalAnalysis { - - private static String docidFieldName = "id"; - private static String contentFieldName = "content"; - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) - */ - public void map(DocumentID key, LineDocTextAndOp value, - OutputCollector output, Reporter reporter) - throws IOException { - - DocumentAndOp.Op op = value.getOp(); - Document doc = null; - Term term = null; - - if (op == DocumentAndOp.Op.INSERT || op == DocumentAndOp.Op.UPDATE) { - doc = new Document(); - doc.add(new Field(docidFieldName, key.getText().toString(), - Field.Store.YES, Field.Index.UN_TOKENIZED)); - doc.add(new Field(contentFieldName, value.getText().toString(), - Field.Store.NO, Field.Index.TOKENIZED)); - } - - if (op == DocumentAndOp.Op.DELETE || op == DocumentAndOp.Op.UPDATE) { - term = new Term(docidFieldName, key.getText().toString()); - } - - output.collect(key, new DocumentAndOp(op, doc, term)); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Closeable#close() - */ - public void close() throws IOException { - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import java.io.IOException; + +import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.Term; + +/** + * Convert LineDocTextAndOp to DocumentAndOp as required by ILocalAnalysis. 
+ */ +public class LineDocLocalAnalysis implements + ILocalAnalysis { + + private static String docidFieldName = "id"; + private static String contentFieldName = "content"; + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) + */ + public void map(DocumentID key, LineDocTextAndOp value, + OutputCollector output, Reporter reporter) + throws IOException { + + DocumentAndOp.Op op = value.getOp(); + Document doc = null; + Term term = null; + + if (op == DocumentAndOp.Op.INSERT || op == DocumentAndOp.Op.UPDATE) { + doc = new Document(); + doc.add(new Field(docidFieldName, key.getText().toString(), + Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field(contentFieldName, value.getText().toString(), + Field.Store.NO, Field.Index.TOKENIZED)); + } + + if (op == DocumentAndOp.Op.DELETE || op == DocumentAndOp.Op.UPDATE) { + term = new Term(docidFieldName, key.getText().toString()); + } + + output.collect(key, new DocumentAndOp(op, doc, term)); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Closeable#close() + */ + public void close() throws IOException { + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java index a1e38c3873b..bc6e6ba48d8 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java @@ -1,231 +1,231 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.RecordReader; - -/** - * A simple RecordReader for LineDoc for plain text files where each line is a - * doc. 
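LineDocLocalAnalysis above builds a Lucene Document for inserts and updates (id stored untokenized, content tokenized and unstored) and a delete Term on the id field for deletes and updates, then wraps whichever of the two apply in a DocumentAndOp. A hypothetical side-by-side of the three shapes it emits, using only the constructor and field layout visible above; the helper names are mine, not part of this patch:

import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;

/** Hypothetical illustration (not part of this patch) of the DocumentAndOp
 *  shapes produced by LineDocLocalAnalysis above. */
public class DocumentAndOpShapes {

  private static Document docFor(String id, String content) {
    Document doc = new Document();
    doc.add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED));
    doc.add(new Field("content", content, Field.Store.NO, Field.Index.TOKENIZED));
    return doc;
  }

  // INSERT carries only the new document, no delete term.
  static DocumentAndOp insertOf(String id, String content) {
    return new DocumentAndOp(DocumentAndOp.Op.INSERT, docFor(id, content), null);
  }

  // DELETE carries only the term identifying the old document.
  static DocumentAndOp deleteOf(String id) {
    return new DocumentAndOp(DocumentAndOp.Op.DELETE, null, new Term("id", id));
  }

  // UPDATE carries both: delete by term, then add the new document.
  static DocumentAndOp updateOf(String id, String content) {
    return new DocumentAndOp(DocumentAndOp.Op.UPDATE, docFor(id, content),
        new Term("id", id));
  }
}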
Each line is as follows: documentIDopcontent, - * where op can be "i", "ins" or "insert" for insert, "d", "del" or "delete" - * for delete, or "u", "upd" or "update" for update. - */ -public class LineDocRecordReader implements - RecordReader { - private static final char SPACE = ' '; - private static final char EOL = '\n'; - - private long start; - private long pos; - private long end; - private BufferedInputStream in; - private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256); - - /** - * Provide a bridge to get the bytes from the ByteArrayOutputStream without - * creating a new byte array. - */ - private static class TextStuffer extends OutputStream { - public Text target; - - public void write(int b) { - throw new UnsupportedOperationException("write(byte) not supported"); - } - - public void write(byte[] data, int offset, int len) throws IOException { - target.set(data, offset, len); - } - } - - private TextStuffer bridge = new TextStuffer(); - - /** - * Constructor - * @param job - * @param split - * @throws IOException - */ - public LineDocRecordReader(Configuration job, FileSplit split) - throws IOException { - long start = split.getStart(); - long end = start + split.getLength(); - final Path file = split.getPath(); - - // open the file and seek to the start of the split - FileSystem fs = file.getFileSystem(job); - FSDataInputStream fileIn = fs.open(split.getPath()); - InputStream in = fileIn; - boolean skipFirstLine = false; - if (start != 0) { - skipFirstLine = true; // wait till BufferedInputStream to skip - --start; - fileIn.seek(start); - } - - this.in = new BufferedInputStream(in); - if (skipFirstLine) { // skip first line and re-establish "start". - start += LineDocRecordReader.readData(this.in, null, EOL); - } - this.start = start; - this.pos = start; - this.end = end; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#close() - */ - public void close() throws IOException { - in.close(); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#createKey() - */ - public DocumentID createKey() { - return new DocumentID(); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#createValue() - */ - public LineDocTextAndOp createValue() { - return new LineDocTextAndOp(); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#getPos() - */ - public long getPos() throws IOException { - return pos; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#getProgress() - */ - public float getProgress() throws IOException { - if (start == end) { - return 0.0f; - } else { - return Math.min(1.0f, (pos - start) / (float) (end - start)); - } - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object) - */ - public synchronized boolean next(DocumentID key, LineDocTextAndOp value) - throws IOException { - if (pos >= end) { - return false; - } - - // key is document id, which are bytes until first space - if (!readInto(key.getText(), SPACE)) { - return false; - } - - // read operation: i/d/u, or ins/del/upd, or insert/delete/update - Text opText = new Text(); - if (!readInto(opText, SPACE)) { - return false; - } - String opStr = opText.toString(); - DocumentAndOp.Op op; - if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) { - op = DocumentAndOp.Op.INSERT; - } else if (opStr.equals("d") || opStr.equals("del") - || opStr.equals("delete")) { - op = DocumentAndOp.Op.DELETE; - } else if (opStr.equals("u") || 
opStr.equals("upd") - || opStr.equals("update")) { - op = DocumentAndOp.Op.UPDATE; - } else { - // default is insert - op = DocumentAndOp.Op.INSERT; - } - value.setOp(op); - - if (op == DocumentAndOp.Op.DELETE) { - return true; - } else { - // read rest of the line - return readInto(value.getText(), EOL); - } - } - - private boolean readInto(Text text, char delimiter) throws IOException { - buffer.reset(); - long bytesRead = readData(in, buffer, delimiter); - if (bytesRead == 0) { - return false; - } - pos += bytesRead; - bridge.target = text; - buffer.writeTo(bridge); - return true; - } - - private static long readData(InputStream in, OutputStream out, char delimiter) - throws IOException { - long bytes = 0; - while (true) { - - int b = in.read(); - if (b == -1) { - break; - } - bytes += 1; - - byte c = (byte) b; - if (c == EOL || c == delimiter) { - break; - } - - if (c == '\r') { - in.mark(1); - byte nextC = (byte) in.read(); - if (nextC != EOL || c == delimiter) { - in.reset(); - } else { - bytes += 1; - } - break; - } - - if (out != null) { - out.write(c); - } - } - return bytes; - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.RecordReader; + +/** + * A simple RecordReader for LineDoc for plain text files where each line is a + * doc. Each line is as follows: documentIDopcontent, + * where op can be "i", "ins" or "insert" for insert, "d", "del" or "delete" + * for delete, or "u", "upd" or "update" for update. + */ +public class LineDocRecordReader implements + RecordReader { + private static final char SPACE = ' '; + private static final char EOL = '\n'; + + private long start; + private long pos; + private long end; + private BufferedInputStream in; + private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256); + + /** + * Provide a bridge to get the bytes from the ByteArrayOutputStream without + * creating a new byte array. 
+ */ + private static class TextStuffer extends OutputStream { + public Text target; + + public void write(int b) { + throw new UnsupportedOperationException("write(byte) not supported"); + } + + public void write(byte[] data, int offset, int len) throws IOException { + target.set(data, offset, len); + } + } + + private TextStuffer bridge = new TextStuffer(); + + /** + * Constructor + * @param job + * @param split + * @throws IOException + */ + public LineDocRecordReader(Configuration job, FileSplit split) + throws IOException { + long start = split.getStart(); + long end = start + split.getLength(); + final Path file = split.getPath(); + + // open the file and seek to the start of the split + FileSystem fs = file.getFileSystem(job); + FSDataInputStream fileIn = fs.open(split.getPath()); + InputStream in = fileIn; + boolean skipFirstLine = false; + if (start != 0) { + skipFirstLine = true; // wait till BufferedInputStream to skip + --start; + fileIn.seek(start); + } + + this.in = new BufferedInputStream(in); + if (skipFirstLine) { // skip first line and re-establish "start". + start += LineDocRecordReader.readData(this.in, null, EOL); + } + this.start = start; + this.pos = start; + this.end = end; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#close() + */ + public void close() throws IOException { + in.close(); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#createKey() + */ + public DocumentID createKey() { + return new DocumentID(); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#createValue() + */ + public LineDocTextAndOp createValue() { + return new LineDocTextAndOp(); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#getPos() + */ + public long getPos() throws IOException { + return pos; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#getProgress() + */ + public float getProgress() throws IOException { + if (start == end) { + return 0.0f; + } else { + return Math.min(1.0f, (pos - start) / (float) (end - start)); + } + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object) + */ + public synchronized boolean next(DocumentID key, LineDocTextAndOp value) + throws IOException { + if (pos >= end) { + return false; + } + + // key is document id, which are bytes until first space + if (!readInto(key.getText(), SPACE)) { + return false; + } + + // read operation: i/d/u, or ins/del/upd, or insert/delete/update + Text opText = new Text(); + if (!readInto(opText, SPACE)) { + return false; + } + String opStr = opText.toString(); + DocumentAndOp.Op op; + if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) { + op = DocumentAndOp.Op.INSERT; + } else if (opStr.equals("d") || opStr.equals("del") + || opStr.equals("delete")) { + op = DocumentAndOp.Op.DELETE; + } else if (opStr.equals("u") || opStr.equals("upd") + || opStr.equals("update")) { + op = DocumentAndOp.Op.UPDATE; + } else { + // default is insert + op = DocumentAndOp.Op.INSERT; + } + value.setOp(op); + + if (op == DocumentAndOp.Op.DELETE) { + return true; + } else { + // read rest of the line + return readInto(value.getText(), EOL); + } + } + + private boolean readInto(Text text, char delimiter) throws IOException { + buffer.reset(); + long bytesRead = readData(in, buffer, delimiter); + if (bytesRead == 0) { + return false; + } + pos += bytesRead; + bridge.target = text; + buffer.writeTo(bridge); + return true; + } + + private static long 
readData(InputStream in, OutputStream out, char delimiter) + throws IOException { + long bytes = 0; + while (true) { + + int b = in.read(); + if (b == -1) { + break; + } + bytes += 1; + + byte c = (byte) b; + if (c == EOL || c == delimiter) { + break; + } + + if (c == '\r') { + in.mark(1); + byte nextC = (byte) in.read(); + if (nextC != EOL || c == delimiter) { + in.reset(); + } else { + bytes += 1; + } + break; + } + + if (out != null) { + out.write(c); + } + } + return bytes; + } +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java index c9dcea30a8f..be707517f40 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java @@ -1,92 +1,92 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; - -/** - * This class represents an operation. The operation can be an insert, a delete - * or an update. If the operation is an insert or an update, a (new) document, - * which is in the form of text, is specified. - */ -public class LineDocTextAndOp implements Writable { - private DocumentAndOp.Op op; - private Text doc; - - /** - * Constructor - */ - public LineDocTextAndOp() { - doc = new Text(); - } - - /** - * Set the type of the operation. - * @param op the type of the operation - */ - public void setOp(DocumentAndOp.Op op) { - this.op = op; - } - - /** - * Get the type of the operation. - * @return the type of the operation - */ - public DocumentAndOp.Op getOp() { - return op; - } - - /** - * Get the text that represents a document. 
- * @return the text that represents a document - */ - public Text getText() { - return doc; - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - public String toString() { - return this.getClass().getName() + "[op=" + op + ", text=" + doc + "]"; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - public void write(DataOutput out) throws IOException { - throw new IOException(this.getClass().getName() - + ".write should never be called"); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - public void readFields(DataInput in) throws IOException { - throw new IOException(this.getClass().getName() - + ".readFields should never be called"); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.contrib.index.mapred.DocumentAndOp; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * This class represents an operation. The operation can be an insert, a delete + * or an update. If the operation is an insert or an update, a (new) document, + * which is in the form of text, is specified. + */ +public class LineDocTextAndOp implements Writable { + private DocumentAndOp.Op op; + private Text doc; + + /** + * Constructor + */ + public LineDocTextAndOp() { + doc = new Text(); + } + + /** + * Set the type of the operation. + * @param op the type of the operation + */ + public void setOp(DocumentAndOp.Op op) { + this.op = op; + } + + /** + * Get the type of the operation. + * @return the type of the operation + */ + public DocumentAndOp.Op getOp() { + return op; + } + + /** + * Get the text that represents a document. 
+ * @return the text that represents a document + */ + public Text getText() { + return doc; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + return this.getClass().getName() + "[op=" + op + ", text=" + doc + "]"; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) + */ + public void write(DataOutput out) throws IOException { + throw new IOException(this.getClass().getName() + + ".write should never be called"); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) + */ + public void readFields(DataInput in) throws IOException { + throw new IOException(this.getClass().getName() + + ".readFields should never be called"); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java index d1dd03cd1e3..8d69025dfdd 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java @@ -1,58 +1,58 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.example; - -import org.apache.hadoop.contrib.index.mapred.DocumentID; -import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy; -import org.apache.hadoop.contrib.index.mapred.Shard; - -/** - * Choose a shard for each insert in a round-robin fashion. Choose all the - * shards for each delete because we don't know where it is stored. 
- */ -public class RoundRobinDistributionPolicy implements IDistributionPolicy { - - private int numShards; - private int rr; // round-robin implementation - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[]) - */ - public void init(Shard[] shards) { - numShards = shards.length; - rr = 0; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID) - */ - public int chooseShardForInsert(DocumentID key) { - int chosen = rr; - rr = (rr + 1) % numShards; - return chosen; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID) - */ - public int chooseShardForDelete(DocumentID key) { - // -1 represents all the shards - return -1; - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.example; + +import org.apache.hadoop.contrib.index.mapred.DocumentID; +import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy; +import org.apache.hadoop.contrib.index.mapred.Shard; + +/** + * Choose a shard for each insert in a round-robin fashion. Choose all the + * shards for each delete because we don't know where it is stored. 
+ */ +public class RoundRobinDistributionPolicy implements IDistributionPolicy { + + private int numShards; + private int rr; // round-robin implementation + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[]) + */ + public void init(Shard[] shards) { + numShards = shards.length; + rr = 0; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID) + */ + public int chooseShardForInsert(DocumentID key) { + int chosen = rr; + rr = (rr + 1) % numShards; + return chosen; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID) + */ + public int chooseShardForDelete(DocumentID key) { + // -1 represents all the shards + return -1; + } +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java index d98752006af..286e95de9ee 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java @@ -1,55 +1,55 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.lucene.index.IndexFileNameFilter; - -/** - * A wrapper class to convert an IndexFileNameFilter which implements - * java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter. - */ -class LuceneIndexFileNameFilter implements PathFilter { - - private static final LuceneIndexFileNameFilter singleton = - new LuceneIndexFileNameFilter(); - - /** - * Get a static instance. - * @return the static instance - */ - public static LuceneIndexFileNameFilter getFilter() { - return singleton; - } - - private final IndexFileNameFilter luceneFilter; - - private LuceneIndexFileNameFilter() { - luceneFilter = IndexFileNameFilter.getFilter(); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path) - */ - public boolean accept(Path path) { - return luceneFilter.accept(null, path.getName()); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.lucene.index.IndexFileNameFilter; + +/** + * A wrapper class to convert an IndexFileNameFilter which implements + * java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter. + */ +class LuceneIndexFileNameFilter implements PathFilter { + + private static final LuceneIndexFileNameFilter singleton = + new LuceneIndexFileNameFilter(); + + /** + * Get a static instance. + * @return the static instance + */ + public static LuceneIndexFileNameFilter getFilter() { + return singleton; + } + + private final IndexFileNameFilter luceneFilter; + + private LuceneIndexFileNameFilter() { + luceneFilter = IndexFileNameFilter.getFilter(); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path) + */ + public boolean accept(Path path) { + return luceneFilter.accept(null, path.getName()); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java index 8a54feaeb61..eff980824e8 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java @@ -1,112 +1,112 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.IOException; - -import org.apache.lucene.store.Directory; - -/** - * This class copies some methods from Lucene's SegmentInfos since that class - * is not public. 
- */ -public final class LuceneUtil { - - static final class IndexFileNames { - /** Name of the index segment file */ - static final String SEGMENTS = "segments"; - - /** Name of the generation reference file name */ - static final String SEGMENTS_GEN = "segments.gen"; - } - - /** - * Check if the file is a segments_N file - * @param name - * @return true if the file is a segments_N file - */ - public static boolean isSegmentsFile(String name) { - return name.startsWith(IndexFileNames.SEGMENTS) - && !name.equals(IndexFileNames.SEGMENTS_GEN); - } - - /** - * Check if the file is the segments.gen file - * @param name - * @return true if the file is the segments.gen file - */ - public static boolean isSegmentsGenFile(String name) { - return name.equals(IndexFileNames.SEGMENTS_GEN); - } - - /** - * Get the generation (N) of the current segments_N file in the directory. - * - * @param directory -- directory to search for the latest segments_N file - */ - public static long getCurrentSegmentGeneration(Directory directory) - throws IOException { - String[] files = directory.list(); - if (files == null) - throw new IOException("cannot read directory " + directory - + ": list() returned null"); - return getCurrentSegmentGeneration(files); - } - - /** - * Get the generation (N) of the current segments_N file from a list of - * files. - * - * @param files -- array of file names to check - */ - public static long getCurrentSegmentGeneration(String[] files) { - if (files == null) { - return -1; - } - long max = -1; - for (int i = 0; i < files.length; i++) { - String file = files[i]; - if (file.startsWith(IndexFileNames.SEGMENTS) - && !file.equals(IndexFileNames.SEGMENTS_GEN)) { - long gen = generationFromSegmentsFileName(file); - if (gen > max) { - max = gen; - } - } - } - return max; - } - - /** - * Parse the generation off the segments file name and return it. - */ - public static long generationFromSegmentsFileName(String fileName) { - if (fileName.equals(IndexFileNames.SEGMENTS)) { - return 0; - } else if (fileName.startsWith(IndexFileNames.SEGMENTS)) { - return Long.parseLong( - fileName.substring(1 + IndexFileNames.SEGMENTS.length()), - Character.MAX_RADIX); - } else { - throw new IllegalArgumentException("fileName \"" + fileName - + "\" is not a segments file"); - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.IOException; + +import org.apache.lucene.store.Directory; + +/** + * This class copies some methods from Lucene's SegmentInfos since that class + * is not public. 
+ */ +public final class LuceneUtil { + + static final class IndexFileNames { + /** Name of the index segment file */ + static final String SEGMENTS = "segments"; + + /** Name of the generation reference file name */ + static final String SEGMENTS_GEN = "segments.gen"; + } + + /** + * Check if the file is a segments_N file + * @param name + * @return true if the file is a segments_N file + */ + public static boolean isSegmentsFile(String name) { + return name.startsWith(IndexFileNames.SEGMENTS) + && !name.equals(IndexFileNames.SEGMENTS_GEN); + } + + /** + * Check if the file is the segments.gen file + * @param name + * @return true if the file is the segments.gen file + */ + public static boolean isSegmentsGenFile(String name) { + return name.equals(IndexFileNames.SEGMENTS_GEN); + } + + /** + * Get the generation (N) of the current segments_N file in the directory. + * + * @param directory -- directory to search for the latest segments_N file + */ + public static long getCurrentSegmentGeneration(Directory directory) + throws IOException { + String[] files = directory.list(); + if (files == null) + throw new IOException("cannot read directory " + directory + + ": list() returned null"); + return getCurrentSegmentGeneration(files); + } + + /** + * Get the generation (N) of the current segments_N file from a list of + * files. + * + * @param files -- array of file names to check + */ + public static long getCurrentSegmentGeneration(String[] files) { + if (files == null) { + return -1; + } + long max = -1; + for (int i = 0; i < files.length; i++) { + String file = files[i]; + if (file.startsWith(IndexFileNames.SEGMENTS) + && !file.equals(IndexFileNames.SEGMENTS_GEN)) { + long gen = generationFromSegmentsFileName(file); + if (gen > max) { + max = gen; + } + } + } + return max; + } + + /** + * Parse the generation off the segments file name and return it. + */ + public static long generationFromSegmentsFileName(String fileName) { + if (fileName.equals(IndexFileNames.SEGMENTS)) { + return 0; + } else if (fileName.startsWith(IndexFileNames.SEGMENTS)) { + return Long.parseLong( + fileName.substring(1 + IndexFileNames.SEGMENTS.length()), + Character.MAX_RADIX); + } else { + throw new IllegalArgumentException("fileName \"" + fileName + + "\" is not a segments file"); + } + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java index 85d5f3d2c26..01ef01e999f 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java @@ -1,49 +1,49 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.IOException; -import java.util.List; - -import org.apache.lucene.index.IndexCommitPoint; -import org.apache.lucene.index.IndexDeletionPolicy; - -/** - * For mixed directory. Use KeepAllDeletionPolicy for the read-only directory - * (keep all from init) and use KeepOnlyLastCommitDeletionPolicy for the - * writable directory (initially empty, keep latest after init). - */ -class MixedDeletionPolicy implements IndexDeletionPolicy { - - private int keepAllFromInit = 0; - - public void onInit(List commits) throws IOException { - keepAllFromInit = commits.size(); - } - - public void onCommit(List commits) throws IOException { - int size = commits.size(); - assert (size > keepAllFromInit); - // keep all from init and the latest, delete the rest - for (int i = keepAllFromInit; i < size - 1; i++) { - ((IndexCommitPoint) commits.get(i)).delete(); - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexCommitPoint; +import org.apache.lucene.index.IndexDeletionPolicy; + +/** + * For mixed directory. Use KeepAllDeletionPolicy for the read-only directory + * (keep all from init) and use KeepOnlyLastCommitDeletionPolicy for the + * writable directory (initially empty, keep latest after init). 
+ */ +class MixedDeletionPolicy implements IndexDeletionPolicy { + + private int keepAllFromInit = 0; + + public void onInit(List commits) throws IOException { + keepAllFromInit = commits.size(); + } + + public void onCommit(List commits) throws IOException { + int size = commits.size(); + assert (size > keepAllFromInit); + // keep all from init and the latest, delete the rest + for (int i = keepAllFromInit; i < size - 1; i++) { + ((IndexCommitPoint) commits.get(i)).delete(); + } + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java index a96ca600cdd..037e168b4bd 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java @@ -1,185 +1,185 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.NoLockFactory; - -/** - * The initial version of an index is stored in a read-only FileSystem dir - * (FileSystemDirectory). Index files created by newer versions are written to - * a writable local FS dir (Lucene's FSDirectory). We should use the general - * FileSystemDirectory for the writable dir as well. But have to use Lucene's - * FSDirectory because currently Lucene does randome write and - * FileSystemDirectory only supports sequential write. - * - * Note: We may delete files from the read-only FileSystem dir because there - * can be some segment files from an uncommitted checkpoint. For the same - * reason, we may create files in the writable dir which already exist in the - * read-only dir and logically they overwrite the ones in the read-only dir. - */ -class MixedDirectory extends Directory { - - private final Directory readDir; // FileSystemDirectory - private final Directory writeDir; // Lucene's FSDirectory - - // take advantage of the fact that Lucene's FSDirectory.fileExists is faster - - public MixedDirectory(FileSystem readFs, Path readPath, FileSystem writeFs, - Path writePath, Configuration conf) throws IOException { - - try { - readDir = new FileSystemDirectory(readFs, readPath, false, conf); - // check writeFS is a local FS? 
- writeDir = FSDirectory.getDirectory(writePath.toString()); - - } catch (IOException e) { - try { - close(); - } catch (IOException e1) { - // ignore this one, throw the original one - } - throw e; - } - - lockFactory = new NoLockFactory(); - } - - // for debugging - MixedDirectory(Directory readDir, Directory writeDir) throws IOException { - this.readDir = readDir; - this.writeDir = writeDir; - - lockFactory = new NoLockFactory(); - } - - @Override - public String[] list() throws IOException { - String[] readFiles = readDir.list(); - String[] writeFiles = writeDir.list(); - - if (readFiles == null || readFiles.length == 0) { - return writeFiles; - } else if (writeFiles == null || writeFiles.length == 0) { - return readFiles; - } else { - String[] result = new String[readFiles.length + writeFiles.length]; - System.arraycopy(readFiles, 0, result, 0, readFiles.length); - System.arraycopy(writeFiles, 0, result, readFiles.length, - writeFiles.length); - return result; - } - } - - @Override - public void deleteFile(String name) throws IOException { - if (writeDir.fileExists(name)) { - writeDir.deleteFile(name); - } - if (readDir.fileExists(name)) { - readDir.deleteFile(name); - } - } - - @Override - public boolean fileExists(String name) throws IOException { - return writeDir.fileExists(name) || readDir.fileExists(name); - } - - @Override - public long fileLength(String name) throws IOException { - if (writeDir.fileExists(name)) { - return writeDir.fileLength(name); - } else { - return readDir.fileLength(name); - } - } - - @Override - public long fileModified(String name) throws IOException { - if (writeDir.fileExists(name)) { - return writeDir.fileModified(name); - } else { - return readDir.fileModified(name); - } - } - - @Override - public void renameFile(String from, String to) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public void touchFile(String name) throws IOException { - if (writeDir.fileExists(name)) { - writeDir.touchFile(name); - } else { - readDir.touchFile(name); - } - } - - @Override - public IndexOutput createOutput(String name) throws IOException { - return writeDir.createOutput(name); - } - - @Override - public IndexInput openInput(String name) throws IOException { - if (writeDir.fileExists(name)) { - return writeDir.openInput(name); - } else { - return readDir.openInput(name); - } - } - - @Override - public IndexInput openInput(String name, int bufferSize) throws IOException { - if (writeDir.fileExists(name)) { - return writeDir.openInput(name, bufferSize); - } else { - return readDir.openInput(name, bufferSize); - } - } - - @Override - public void close() throws IOException { - try { - if (readDir != null) { - readDir.close(); - } - } finally { - if (writeDir != null) { - writeDir.close(); - } - } - } - - public String toString() { - return this.getClass().getName() + "@" + readDir + "&" + writeDir; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.NoLockFactory; + +/** + * The initial version of an index is stored in a read-only FileSystem dir + * (FileSystemDirectory). Index files created by newer versions are written to + * a writable local FS dir (Lucene's FSDirectory). We should use the general + * FileSystemDirectory for the writable dir as well. But have to use Lucene's + * FSDirectory because currently Lucene does randome write and + * FileSystemDirectory only supports sequential write. + * + * Note: We may delete files from the read-only FileSystem dir because there + * can be some segment files from an uncommitted checkpoint. For the same + * reason, we may create files in the writable dir which already exist in the + * read-only dir and logically they overwrite the ones in the read-only dir. + */ +class MixedDirectory extends Directory { + + private final Directory readDir; // FileSystemDirectory + private final Directory writeDir; // Lucene's FSDirectory + + // take advantage of the fact that Lucene's FSDirectory.fileExists is faster + + public MixedDirectory(FileSystem readFs, Path readPath, FileSystem writeFs, + Path writePath, Configuration conf) throws IOException { + + try { + readDir = new FileSystemDirectory(readFs, readPath, false, conf); + // check writeFS is a local FS? 
+ writeDir = FSDirectory.getDirectory(writePath.toString()); + + } catch (IOException e) { + try { + close(); + } catch (IOException e1) { + // ignore this one, throw the original one + } + throw e; + } + + lockFactory = new NoLockFactory(); + } + + // for debugging + MixedDirectory(Directory readDir, Directory writeDir) throws IOException { + this.readDir = readDir; + this.writeDir = writeDir; + + lockFactory = new NoLockFactory(); + } + + @Override + public String[] list() throws IOException { + String[] readFiles = readDir.list(); + String[] writeFiles = writeDir.list(); + + if (readFiles == null || readFiles.length == 0) { + return writeFiles; + } else if (writeFiles == null || writeFiles.length == 0) { + return readFiles; + } else { + String[] result = new String[readFiles.length + writeFiles.length]; + System.arraycopy(readFiles, 0, result, 0, readFiles.length); + System.arraycopy(writeFiles, 0, result, readFiles.length, + writeFiles.length); + return result; + } + } + + @Override + public void deleteFile(String name) throws IOException { + if (writeDir.fileExists(name)) { + writeDir.deleteFile(name); + } + if (readDir.fileExists(name)) { + readDir.deleteFile(name); + } + } + + @Override + public boolean fileExists(String name) throws IOException { + return writeDir.fileExists(name) || readDir.fileExists(name); + } + + @Override + public long fileLength(String name) throws IOException { + if (writeDir.fileExists(name)) { + return writeDir.fileLength(name); + } else { + return readDir.fileLength(name); + } + } + + @Override + public long fileModified(String name) throws IOException { + if (writeDir.fileExists(name)) { + return writeDir.fileModified(name); + } else { + return readDir.fileModified(name); + } + } + + @Override + public void renameFile(String from, String to) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void touchFile(String name) throws IOException { + if (writeDir.fileExists(name)) { + writeDir.touchFile(name); + } else { + readDir.touchFile(name); + } + } + + @Override + public IndexOutput createOutput(String name) throws IOException { + return writeDir.createOutput(name); + } + + @Override + public IndexInput openInput(String name) throws IOException { + if (writeDir.fileExists(name)) { + return writeDir.openInput(name); + } else { + return readDir.openInput(name); + } + } + + @Override + public IndexInput openInput(String name, int bufferSize) throws IOException { + if (writeDir.fileExists(name)) { + return writeDir.openInput(name, bufferSize); + } else { + return readDir.openInput(name, bufferSize); + } + } + + @Override + public void close() throws IOException { + try { + if (readDir != null) { + readDir.close(); + } + } finally { + if (writeDir != null) { + writeDir.close(); + } + } + } + + public String toString() { + return this.getClass().getName() + "@" + readDir + "&" + writeDir; + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java index b4243f521f5..29aca3bc49b 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java @@ -1,119 +1,119 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Text; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.RAMDirectory; - -/** - * A utility class which writes an index in a ram dir into a DataOutput and - * read from a DataInput an index into a ram dir. - */ -public class RAMDirectoryUtil { - private static final int BUFFER_SIZE = 1024; // RAMOutputStream.BUFFER_SIZE; - - /** - * Write a number of files from a ram directory to a data output. - * @param out the data output - * @param dir the ram directory - * @param names the names of the files to write - * @throws IOException - */ - public static void writeRAMFiles(DataOutput out, RAMDirectory dir, - String[] names) throws IOException { - out.writeInt(names.length); - - for (int i = 0; i < names.length; i++) { - Text.writeString(out, names[i]); - long length = dir.fileLength(names[i]); - out.writeLong(length); - - if (length > 0) { - // can we avoid the extra copy? - IndexInput input = null; - try { - input = dir.openInput(names[i], BUFFER_SIZE); - - int position = 0; - byte[] buffer = new byte[BUFFER_SIZE]; - - while (position < length) { - int len = - position + BUFFER_SIZE <= length ? BUFFER_SIZE - : (int) (length - position); - input.readBytes(buffer, 0, len); - out.write(buffer, 0, len); - position += len; - } - } finally { - if (input != null) { - input.close(); - } - } - } - } - } - - /** - * Read a number of files from a data input to a ram directory. - * @param in the data input - * @param dir the ram directory - * @throws IOException - */ - public static void readRAMFiles(DataInput in, RAMDirectory dir) - throws IOException { - int numFiles = in.readInt(); - - for (int i = 0; i < numFiles; i++) { - String name = Text.readString(in); - long length = in.readLong(); - - if (length > 0) { - // can we avoid the extra copy? - IndexOutput output = null; - try { - output = dir.createOutput(name); - - int position = 0; - byte[] buffer = new byte[BUFFER_SIZE]; - - while (position < length) { - int len = - position + BUFFER_SIZE <= length ? BUFFER_SIZE - : (int) (length - position); - in.readFully(buffer, 0, len); - output.writeBytes(buffer, 0, len); - position += len; - } - } finally { - if (output != null) { - output.close(); - } - } - } - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; + +/** + * A utility class which writes an index in a ram dir into a DataOutput and + * read from a DataInput an index into a ram dir. + */ +public class RAMDirectoryUtil { + private static final int BUFFER_SIZE = 1024; // RAMOutputStream.BUFFER_SIZE; + + /** + * Write a number of files from a ram directory to a data output. + * @param out the data output + * @param dir the ram directory + * @param names the names of the files to write + * @throws IOException + */ + public static void writeRAMFiles(DataOutput out, RAMDirectory dir, + String[] names) throws IOException { + out.writeInt(names.length); + + for (int i = 0; i < names.length; i++) { + Text.writeString(out, names[i]); + long length = dir.fileLength(names[i]); + out.writeLong(length); + + if (length > 0) { + // can we avoid the extra copy? + IndexInput input = null; + try { + input = dir.openInput(names[i], BUFFER_SIZE); + + int position = 0; + byte[] buffer = new byte[BUFFER_SIZE]; + + while (position < length) { + int len = + position + BUFFER_SIZE <= length ? BUFFER_SIZE + : (int) (length - position); + input.readBytes(buffer, 0, len); + out.write(buffer, 0, len); + position += len; + } + } finally { + if (input != null) { + input.close(); + } + } + } + } + } + + /** + * Read a number of files from a data input to a ram directory. + * @param in the data input + * @param dir the ram directory + * @throws IOException + */ + public static void readRAMFiles(DataInput in, RAMDirectory dir) + throws IOException { + int numFiles = in.readInt(); + + for (int i = 0; i < numFiles; i++) { + String name = Text.readString(in); + long length = in.readLong(); + + if (length > 0) { + // can we avoid the extra copy? + IndexOutput output = null; + try { + output = dir.createOutput(name); + + int position = 0; + byte[] buffer = new byte[BUFFER_SIZE]; + + while (position < length) { + int len = + position + BUFFER_SIZE <= length ? BUFFER_SIZE + : (int) (length - position); + in.readFully(buffer, 0, len); + output.writeBytes(buffer, 0, len); + position += len; + } + } finally { + if (output != null) { + output.close(); + } + } + } + } + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java index b2abd3cd11b..b4b09160bc1 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java @@ -1,233 +1,233 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration; -import org.apache.hadoop.contrib.index.mapred.IntermediateForm; -import org.apache.hadoop.contrib.index.mapred.Shard; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.store.Directory; - -/** - * The initial version of an index is stored in the perm dir. Index files - * created by newer versions are written to a temp dir on the local FS. After - * successfully creating the new version in the temp dir, the shard writer - * moves the new files to the perm dir and deletes the temp dir in close(). - */ -public class ShardWriter { - static final Log LOG = LogFactory.getLog(ShardWriter.class); - - private final FileSystem fs; - private final FileSystem localFs; - private final Path perm; - private final Path temp; - private final Directory dir; - private final IndexWriter writer; - private int maxNumSegments; - private long numForms = 0; - - /** - * Constructor - * @param fs - * @param shard - * @param tempDir - * @param iconf - * @throws IOException - */ - public ShardWriter(FileSystem fs, Shard shard, String tempDir, - IndexUpdateConfiguration iconf) throws IOException { - LOG.info("Construct a shard writer"); - - this.fs = fs; - localFs = FileSystem.getLocal(iconf.getConfiguration()); - perm = new Path(shard.getDirectory()); - temp = new Path(tempDir); - - long initGeneration = shard.getGeneration(); - if (!fs.exists(perm)) { - assert (initGeneration < 0); - fs.mkdirs(perm); - } else { - restoreGeneration(fs, perm, initGeneration); - } - dir = - new MixedDirectory(fs, perm, localFs, fs.startLocalOutput(perm, temp), - iconf.getConfiguration()); - - // analyzer is null because we only use addIndexes, not addDocument - writer = - new IndexWriter(dir, false, null, - initGeneration < 0 ? new KeepOnlyLastCommitDeletionPolicy() - : new MixedDeletionPolicy()); - setParameters(iconf); - } - - /** - * Process an intermediate form by carrying out, on the Lucene instance of - * the shard, the deletes and the inserts (a ram index) in the form. 
- * @param form the intermediate form containing deletes and a ram index - * @throws IOException - */ - public void process(IntermediateForm form) throws IOException { - // first delete - Iterator iter = form.deleteTermIterator(); - while (iter.hasNext()) { - writer.deleteDocuments(iter.next()); - } - // then insert - writer.addIndexesNoOptimize(new Directory[] { form.getDirectory() }); - numForms++; - } - - /** - * Close the shard writer. Optimize the Lucene instance of the shard before - * closing if necessary, and copy the files created in the temp directory - * to the permanent directory after closing. - * @throws IOException - */ - public void close() throws IOException { - LOG.info("Closing the shard writer, processed " + numForms + " forms"); - try { - try { - if (maxNumSegments > 0) { - writer.optimize(maxNumSegments); - LOG.info("Optimized the shard into at most " + maxNumSegments - + " segments"); - } - } finally { - writer.close(); - LOG.info("Closed Lucene index writer"); - } - - moveFromTempToPerm(); - LOG.info("Moved new index files to " + perm); - - } finally { - dir.close(); - LOG.info("Closed the shard writer"); - } - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - public String toString() { - return this.getClass().getName() + "@" + perm + "&" + temp; - } - - private void setParameters(IndexUpdateConfiguration iconf) { - int maxFieldLength = iconf.getIndexMaxFieldLength(); - if (maxFieldLength > 0) { - writer.setMaxFieldLength(maxFieldLength); - } - writer.setUseCompoundFile(iconf.getIndexUseCompoundFile()); - maxNumSegments = iconf.getIndexMaxNumSegments(); - - if (maxFieldLength > 0) { - LOG.info("sea.max.field.length = " + writer.getMaxFieldLength()); - } - LOG.info("sea.use.compound.file = " + writer.getUseCompoundFile()); - LOG.info("sea.max.num.segments = " + maxNumSegments); - } - - // in case a previous reduce task fails, restore the generation to - // the original starting point by deleting the segments.gen file - // and the segments_N files whose generations are greater than the - // starting generation; rest of the unwanted files will be deleted - // once the unwanted segments_N files are deleted - private void restoreGeneration(FileSystem fs, Path perm, long startGen) - throws IOException { - - FileStatus[] fileStatus = fs.listStatus(perm, new PathFilter() { - public boolean accept(Path path) { - return LuceneUtil.isSegmentsFile(path.getName()); - } - }); - - // remove the segments_N files whose generation are greater than - // the starting generation - for (int i = 0; i < fileStatus.length; i++) { - Path path = fileStatus[i].getPath(); - if (startGen < LuceneUtil.generationFromSegmentsFileName(path.getName())) { - fs.delete(path, true); - } - } - - // always remove segments.gen in case last failed try removed segments_N - // but not segments.gen, and segments.gen will be overwritten anyway. 
- Path segmentsGenFile = new Path(LuceneUtil.IndexFileNames.SEGMENTS_GEN); - if (fs.exists(segmentsGenFile)) { - fs.delete(segmentsGenFile, true); - } - } - - // move the files created in the temp dir into the perm dir - // and then delete the temp dir from the local FS - private void moveFromTempToPerm() throws IOException { - try { - FileStatus[] fileStatus = - localFs.listStatus(temp, LuceneIndexFileNameFilter.getFilter()); - Path segmentsPath = null; - Path segmentsGenPath = null; - - // move the files created in temp dir except segments_N and segments.gen - for (int i = 0; i < fileStatus.length; i++) { - Path path = fileStatus[i].getPath(); - String name = path.getName(); - - if (LuceneUtil.isSegmentsGenFile(name)) { - assert (segmentsGenPath == null); - segmentsGenPath = path; - } else if (LuceneUtil.isSegmentsFile(name)) { - assert (segmentsPath == null); - segmentsPath = path; - } else { - fs.completeLocalOutput(new Path(perm, name), path); - } - } - - // move the segments_N file - if (segmentsPath != null) { - fs.completeLocalOutput(new Path(perm, segmentsPath.getName()), - segmentsPath); - } - - // move the segments.gen file - if (segmentsGenPath != null) { - fs.completeLocalOutput(new Path(perm, segmentsGenPath.getName()), - segmentsGenPath); - } - } finally { - // finally delete the temp dir (files should have been deleted) - localFs.delete(temp, true); - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration; +import org.apache.hadoop.contrib.index.mapred.IntermediateForm; +import org.apache.hadoop.contrib.index.mapred.Shard; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; + +/** + * The initial version of an index is stored in the perm dir. Index files + * created by newer versions are written to a temp dir on the local FS. After + * successfully creating the new version in the temp dir, the shard writer + * moves the new files to the perm dir and deletes the temp dir in close(). 
+ */ +public class ShardWriter { + static final Log LOG = LogFactory.getLog(ShardWriter.class); + + private final FileSystem fs; + private final FileSystem localFs; + private final Path perm; + private final Path temp; + private final Directory dir; + private final IndexWriter writer; + private int maxNumSegments; + private long numForms = 0; + + /** + * Constructor + * @param fs + * @param shard + * @param tempDir + * @param iconf + * @throws IOException + */ + public ShardWriter(FileSystem fs, Shard shard, String tempDir, + IndexUpdateConfiguration iconf) throws IOException { + LOG.info("Construct a shard writer"); + + this.fs = fs; + localFs = FileSystem.getLocal(iconf.getConfiguration()); + perm = new Path(shard.getDirectory()); + temp = new Path(tempDir); + + long initGeneration = shard.getGeneration(); + if (!fs.exists(perm)) { + assert (initGeneration < 0); + fs.mkdirs(perm); + } else { + restoreGeneration(fs, perm, initGeneration); + } + dir = + new MixedDirectory(fs, perm, localFs, fs.startLocalOutput(perm, temp), + iconf.getConfiguration()); + + // analyzer is null because we only use addIndexes, not addDocument + writer = + new IndexWriter(dir, false, null, + initGeneration < 0 ? new KeepOnlyLastCommitDeletionPolicy() + : new MixedDeletionPolicy()); + setParameters(iconf); + } + + /** + * Process an intermediate form by carrying out, on the Lucene instance of + * the shard, the deletes and the inserts (a ram index) in the form. + * @param form the intermediate form containing deletes and a ram index + * @throws IOException + */ + public void process(IntermediateForm form) throws IOException { + // first delete + Iterator iter = form.deleteTermIterator(); + while (iter.hasNext()) { + writer.deleteDocuments(iter.next()); + } + // then insert + writer.addIndexesNoOptimize(new Directory[] { form.getDirectory() }); + numForms++; + } + + /** + * Close the shard writer. Optimize the Lucene instance of the shard before + * closing if necessary, and copy the files created in the temp directory + * to the permanent directory after closing. 
+ * @throws IOException + */ + public void close() throws IOException { + LOG.info("Closing the shard writer, processed " + numForms + " forms"); + try { + try { + if (maxNumSegments > 0) { + writer.optimize(maxNumSegments); + LOG.info("Optimized the shard into at most " + maxNumSegments + + " segments"); + } + } finally { + writer.close(); + LOG.info("Closed Lucene index writer"); + } + + moveFromTempToPerm(); + LOG.info("Moved new index files to " + perm); + + } finally { + dir.close(); + LOG.info("Closed the shard writer"); + } + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + return this.getClass().getName() + "@" + perm + "&" + temp; + } + + private void setParameters(IndexUpdateConfiguration iconf) { + int maxFieldLength = iconf.getIndexMaxFieldLength(); + if (maxFieldLength > 0) { + writer.setMaxFieldLength(maxFieldLength); + } + writer.setUseCompoundFile(iconf.getIndexUseCompoundFile()); + maxNumSegments = iconf.getIndexMaxNumSegments(); + + if (maxFieldLength > 0) { + LOG.info("sea.max.field.length = " + writer.getMaxFieldLength()); + } + LOG.info("sea.use.compound.file = " + writer.getUseCompoundFile()); + LOG.info("sea.max.num.segments = " + maxNumSegments); + } + + // in case a previous reduce task fails, restore the generation to + // the original starting point by deleting the segments.gen file + // and the segments_N files whose generations are greater than the + // starting generation; rest of the unwanted files will be deleted + // once the unwanted segments_N files are deleted + private void restoreGeneration(FileSystem fs, Path perm, long startGen) + throws IOException { + + FileStatus[] fileStatus = fs.listStatus(perm, new PathFilter() { + public boolean accept(Path path) { + return LuceneUtil.isSegmentsFile(path.getName()); + } + }); + + // remove the segments_N files whose generation are greater than + // the starting generation + for (int i = 0; i < fileStatus.length; i++) { + Path path = fileStatus[i].getPath(); + if (startGen < LuceneUtil.generationFromSegmentsFileName(path.getName())) { + fs.delete(path, true); + } + } + + // always remove segments.gen in case last failed try removed segments_N + // but not segments.gen, and segments.gen will be overwritten anyway. 
+ Path segmentsGenFile = new Path(LuceneUtil.IndexFileNames.SEGMENTS_GEN); + if (fs.exists(segmentsGenFile)) { + fs.delete(segmentsGenFile, true); + } + } + + // move the files created in the temp dir into the perm dir + // and then delete the temp dir from the local FS + private void moveFromTempToPerm() throws IOException { + try { + FileStatus[] fileStatus = + localFs.listStatus(temp, LuceneIndexFileNameFilter.getFilter()); + Path segmentsPath = null; + Path segmentsGenPath = null; + + // move the files created in temp dir except segments_N and segments.gen + for (int i = 0; i < fileStatus.length; i++) { + Path path = fileStatus[i].getPath(); + String name = path.getName(); + + if (LuceneUtil.isSegmentsGenFile(name)) { + assert (segmentsGenPath == null); + segmentsGenPath = path; + } else if (LuceneUtil.isSegmentsFile(name)) { + assert (segmentsPath == null); + segmentsPath = path; + } else { + fs.completeLocalOutput(new Path(perm, name), path); + } + } + + // move the segments_N file + if (segmentsPath != null) { + fs.completeLocalOutput(new Path(perm, segmentsPath.getName()), + segmentsPath); + } + + // move the segments.gen file + if (segmentsGenPath != null) { + fs.completeLocalOutput(new Path(perm, segmentsGenPath.getName()), + segmentsGenPath); + } + } finally { + // finally delete the temp dir (files should have been deleted) + localFs.delete(temp, true); + } + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java index 1d2ca384624..778f18bcc9a 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java @@ -1,276 +1,276 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.main; - -import java.io.IOException; -import java.text.NumberFormat; -import java.util.Arrays; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration; -import org.apache.hadoop.contrib.index.mapred.IIndexUpdater; -import org.apache.hadoop.contrib.index.mapred.Shard; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.main; + +import java.io.IOException; +import java.text.NumberFormat; +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration; +import org.apache.hadoop.contrib.index.mapred.IIndexUpdater; +import org.apache.hadoop.contrib.index.mapred.Shard; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileOutputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.util.ReflectionUtils; - -/** - * A distributed "index" is partitioned into "shards". Each shard corresponds - * to a Lucene instance. This class contains the main() method which uses a - * Map/Reduce job to analyze documents and update Lucene instances in parallel. - * - * The main() method in UpdateIndex requires the following information for - * updating the shards: - * - Input formatter. This specifies how to format the input documents. - * - Analysis. This defines the analyzer to use on the input. The analyzer - * determines whether a document is being inserted, updated, or deleted. - * For inserts or updates, the analyzer also converts each input document - * into a Lucene document. - * - Input paths. This provides the location(s) of updated documents, - * e.g., HDFS files or directories, or HBase tables. - * - Shard paths, or index path with the number of shards. Either specify - * the path for each shard, or specify an index path and the shards are - * the sub-directories of the index directory. - * - Output path. When the update to a shard is done, a message is put here. - * - Number of map tasks. - * - * All of the information can be specified in a configuration file. All but - * the first two can also be specified as command line options. Check out - * conf/index-config.xml.template for other configurable parameters. - * - * Note: Because of the parallel nature of Map/Reduce, the behaviour of - * multiple inserts, deletes or updates to the same document is undefined. 
- */ -public class UpdateIndex { - public static final Log LOG = LogFactory.getLog(UpdateIndex.class); - - private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); - static { - NUMBER_FORMAT.setMinimumIntegerDigits(5); - NUMBER_FORMAT.setGroupingUsed(false); - } - - private static long now() { - return System.currentTimeMillis(); - } - - private static void printUsage(String cmd) { - System.err.println("Usage: java " + UpdateIndex.class.getName() + "\n" - + " -inputPaths \n" - + " -outputPath \n" - + " -shards \n" - + " -indexPath \n" - + " -numShards \n" - + " -numMapTasks \n" - + " -conf \n" - + "Note: Do not use both -shards option and -indexPath option."); - } - - private static String getIndexPath(Configuration conf) { - return conf.get("sea.index.path"); - } - - private static int getNumShards(Configuration conf) { - return conf.getInt("sea.num.shards", 1); - } - - private static Shard[] createShards(String indexPath, int numShards, - Configuration conf) throws IOException { - - String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR; - long versionNumber = -1; - long generation = -1; - - FileSystem fs = FileSystem.get(conf); - Path path = new Path(indexPath); - - if (fs.exists(path)) { - FileStatus[] fileStatus = fs.listStatus(path); - String[] shardNames = new String[fileStatus.length]; - int count = 0; - for (int i = 0; i < fileStatus.length; i++) { - if (fileStatus[i].isDirectory()) { - shardNames[count] = fileStatus[i].getPath().getName(); - count++; - } - } - Arrays.sort(shardNames, 0, count); - - Shard[] shards = new Shard[count >= numShards ? count : numShards]; - for (int i = 0; i < count; i++) { - shards[i] = - new Shard(versionNumber, parent + shardNames[i], generation); - } - - int number = count; - for (int i = count; i < numShards; i++) { - String shardPath; - while (true) { - shardPath = parent + NUMBER_FORMAT.format(number++); - if (!fs.exists(new Path(shardPath))) { - break; - } - } - shards[i] = new Shard(versionNumber, shardPath, generation); - } - return shards; - } else { - Shard[] shards = new Shard[numShards]; - for (int i = 0; i < shards.length; i++) { - shards[i] = - new Shard(versionNumber, parent + NUMBER_FORMAT.format(i), - generation); - } - return shards; - } - } - - /** - * The main() method - * @param argv - */ - public static void main(String[] argv) { - if (argv.length == 0) { - printUsage(""); - System.exit(-1); - } - - String inputPathsString = null; - Path outputPath = null; - String shardsString = null; - String indexPath = null; - int numShards = -1; - int numMapTasks = -1; - Configuration conf = new Configuration(); - String confPath = null; - - // parse the command line - for (int i = 0; i < argv.length; i++) { // parse command line - if (argv[i].equals("-inputPaths")) { - inputPathsString = argv[++i]; - } else if (argv[i].equals("-outputPath")) { - outputPath = new Path(argv[++i]); - } else if (argv[i].equals("-shards")) { - shardsString = argv[++i]; - } else if (argv[i].equals("-indexPath")) { - indexPath = argv[++i]; - } else if (argv[i].equals("-numShards")) { - numShards = Integer.parseInt(argv[++i]); - } else if (argv[i].equals("-numMapTasks")) { - numMapTasks = Integer.parseInt(argv[++i]); - } else if (argv[i].equals("-conf")) { - // add as a local FS resource - confPath = argv[++i]; - conf.addResource(new Path(confPath)); - } else { - System.out.println("Unknown option " + argv[i] + " w/ value " - + argv[++i]); - } - } - LOG.info("inputPaths = " + inputPathsString); - LOG.info("outputPath = " + outputPath); 
- LOG.info("shards = " + shardsString); - LOG.info("indexPath = " + indexPath); - LOG.info("numShards = " + numShards); - LOG.info("numMapTasks= " + numMapTasks); - LOG.info("confPath = " + confPath); - - Path[] inputPaths = null; - Shard[] shards = null; - - JobConf jobConf = new JobConf(conf); - IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(jobConf); - - if (inputPathsString != null) { - jobConf.set(org.apache.hadoop.mapreduce.lib.input. - FileInputFormat.INPUT_DIR, inputPathsString); - } - inputPaths = FileInputFormat.getInputPaths(jobConf); - if (inputPaths.length == 0) { - inputPaths = null; - } - - if (outputPath == null) { +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.util.ReflectionUtils; + +/** + * A distributed "index" is partitioned into "shards". Each shard corresponds + * to a Lucene instance. This class contains the main() method which uses a + * Map/Reduce job to analyze documents and update Lucene instances in parallel. + * + * The main() method in UpdateIndex requires the following information for + * updating the shards: + * - Input formatter. This specifies how to format the input documents. + * - Analysis. This defines the analyzer to use on the input. The analyzer + * determines whether a document is being inserted, updated, or deleted. + * For inserts or updates, the analyzer also converts each input document + * into a Lucene document. + * - Input paths. This provides the location(s) of updated documents, + * e.g., HDFS files or directories, or HBase tables. + * - Shard paths, or index path with the number of shards. Either specify + * the path for each shard, or specify an index path and the shards are + * the sub-directories of the index directory. + * - Output path. When the update to a shard is done, a message is put here. + * - Number of map tasks. + * + * All of the information can be specified in a configuration file. All but + * the first two can also be specified as command line options. Check out + * conf/index-config.xml.template for other configurable parameters. + * + * Note: Because of the parallel nature of Map/Reduce, the behaviour of + * multiple inserts, deletes or updates to the same document is undefined. 
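To illustrate the options listed above, a hypothetical invocation could look like the following; the paths and counts are placeholders, and per the usage note the -shards and -indexPath options should not be combined:

    java org.apache.hadoop.contrib.index.main.UpdateIndex \
      -inputPaths  input/docs.txt \
      -outputPath  output \
      -indexPath   index \
      -numShards   2 \
      -numMapTasks 2 \
      -conf        conf/index-config.xml

conf/index-config.xml.template, mentioned above, documents the remaining configurable parameters.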
+ */ +public class UpdateIndex { + public static final Log LOG = LogFactory.getLog(UpdateIndex.class); + + private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); + static { + NUMBER_FORMAT.setMinimumIntegerDigits(5); + NUMBER_FORMAT.setGroupingUsed(false); + } + + private static long now() { + return System.currentTimeMillis(); + } + + private static void printUsage(String cmd) { + System.err.println("Usage: java " + UpdateIndex.class.getName() + "\n" + + " -inputPaths \n" + + " -outputPath \n" + + " -shards \n" + + " -indexPath \n" + + " -numShards \n" + + " -numMapTasks \n" + + " -conf \n" + + "Note: Do not use both -shards option and -indexPath option."); + } + + private static String getIndexPath(Configuration conf) { + return conf.get("sea.index.path"); + } + + private static int getNumShards(Configuration conf) { + return conf.getInt("sea.num.shards", 1); + } + + private static Shard[] createShards(String indexPath, int numShards, + Configuration conf) throws IOException { + + String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR; + long versionNumber = -1; + long generation = -1; + + FileSystem fs = FileSystem.get(conf); + Path path = new Path(indexPath); + + if (fs.exists(path)) { + FileStatus[] fileStatus = fs.listStatus(path); + String[] shardNames = new String[fileStatus.length]; + int count = 0; + for (int i = 0; i < fileStatus.length; i++) { + if (fileStatus[i].isDirectory()) { + shardNames[count] = fileStatus[i].getPath().getName(); + count++; + } + } + Arrays.sort(shardNames, 0, count); + + Shard[] shards = new Shard[count >= numShards ? count : numShards]; + for (int i = 0; i < count; i++) { + shards[i] = + new Shard(versionNumber, parent + shardNames[i], generation); + } + + int number = count; + for (int i = count; i < numShards; i++) { + String shardPath; + while (true) { + shardPath = parent + NUMBER_FORMAT.format(number++); + if (!fs.exists(new Path(shardPath))) { + break; + } + } + shards[i] = new Shard(versionNumber, shardPath, generation); + } + return shards; + } else { + Shard[] shards = new Shard[numShards]; + for (int i = 0; i < shards.length; i++) { + shards[i] = + new Shard(versionNumber, parent + NUMBER_FORMAT.format(i), + generation); + } + return shards; + } + } + + /** + * The main() method + * @param argv + */ + public static void main(String[] argv) { + if (argv.length == 0) { + printUsage(""); + System.exit(-1); + } + + String inputPathsString = null; + Path outputPath = null; + String shardsString = null; + String indexPath = null; + int numShards = -1; + int numMapTasks = -1; + Configuration conf = new Configuration(); + String confPath = null; + + // parse the command line + for (int i = 0; i < argv.length; i++) { // parse command line + if (argv[i].equals("-inputPaths")) { + inputPathsString = argv[++i]; + } else if (argv[i].equals("-outputPath")) { + outputPath = new Path(argv[++i]); + } else if (argv[i].equals("-shards")) { + shardsString = argv[++i]; + } else if (argv[i].equals("-indexPath")) { + indexPath = argv[++i]; + } else if (argv[i].equals("-numShards")) { + numShards = Integer.parseInt(argv[++i]); + } else if (argv[i].equals("-numMapTasks")) { + numMapTasks = Integer.parseInt(argv[++i]); + } else if (argv[i].equals("-conf")) { + // add as a local FS resource + confPath = argv[++i]; + conf.addResource(new Path(confPath)); + } else { + System.out.println("Unknown option " + argv[i] + " w/ value " + + argv[++i]); + } + } + LOG.info("inputPaths = " + inputPathsString); + LOG.info("outputPath = " + outputPath); 
+ LOG.info("shards = " + shardsString); + LOG.info("indexPath = " + indexPath); + LOG.info("numShards = " + numShards); + LOG.info("numMapTasks= " + numMapTasks); + LOG.info("confPath = " + confPath); + + Path[] inputPaths = null; + Shard[] shards = null; + + JobConf jobConf = new JobConf(conf); + IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(jobConf); + + if (inputPathsString != null) { + jobConf.set(org.apache.hadoop.mapreduce.lib.input. + FileInputFormat.INPUT_DIR, inputPathsString); + } + inputPaths = FileInputFormat.getInputPaths(jobConf); + if (inputPaths.length == 0) { + inputPaths = null; + } + + if (outputPath == null) { outputPath = FileOutputFormat.getOutputPath(jobConf); - } - - if (inputPaths == null || outputPath == null) { - System.err.println("InputPaths and outputPath must be specified."); - printUsage(""); - System.exit(-1); - } - - if (shardsString != null) { - iconf.setIndexShards(shardsString); - } - shards = Shard.getIndexShards(iconf); - if (shards != null && shards.length == 0) { - shards = null; - } - - if (indexPath == null) { - indexPath = getIndexPath(conf); - } - if (numShards <= 0) { - numShards = getNumShards(conf); - } - - if (shards == null && indexPath == null) { - System.err.println("Either shards or indexPath must be specified."); - printUsage(""); - System.exit(-1); - } - - if (numMapTasks <= 0) { - numMapTasks = jobConf.getNumMapTasks(); - } - - try { - // create shards and set their directories if necessary - if (shards == null) { - shards = createShards(indexPath, numShards, conf); - } - - long startTime = now(); - try { - IIndexUpdater updater = - (IIndexUpdater) ReflectionUtils.newInstance( - iconf.getIndexUpdaterClass(), conf); - LOG.info("sea.index.updater = " - + iconf.getIndexUpdaterClass().getName()); - - updater.run(conf, inputPaths, outputPath, numMapTasks, shards); - LOG.info("Index update job is done"); - - } finally { - long elapsedTime = now() - startTime; - LOG.info("Elapsed time is " + (elapsedTime / 1000) + "s"); - System.out.println("Elapsed time is " + (elapsedTime / 1000) + "s"); - } - } catch (Exception e) { - e.printStackTrace(System.err); - } - } -} + } + + if (inputPaths == null || outputPath == null) { + System.err.println("InputPaths and outputPath must be specified."); + printUsage(""); + System.exit(-1); + } + + if (shardsString != null) { + iconf.setIndexShards(shardsString); + } + shards = Shard.getIndexShards(iconf); + if (shards != null && shards.length == 0) { + shards = null; + } + + if (indexPath == null) { + indexPath = getIndexPath(conf); + } + if (numShards <= 0) { + numShards = getNumShards(conf); + } + + if (shards == null && indexPath == null) { + System.err.println("Either shards or indexPath must be specified."); + printUsage(""); + System.exit(-1); + } + + if (numMapTasks <= 0) { + numMapTasks = jobConf.getNumMapTasks(); + } + + try { + // create shards and set their directories if necessary + if (shards == null) { + shards = createShards(indexPath, numShards, conf); + } + + long startTime = now(); + try { + IIndexUpdater updater = + (IIndexUpdater) ReflectionUtils.newInstance( + iconf.getIndexUpdaterClass(), conf); + LOG.info("sea.index.updater = " + + iconf.getIndexUpdaterClass().getName()); + + updater.run(conf, inputPaths, outputPath, numMapTasks, shards); + LOG.info("Index update job is done"); + + } finally { + long elapsedTime = now() - startTime; + LOG.info("Elapsed time is " + (elapsedTime / 1000) + "s"); + System.out.println("Elapsed time is " + (elapsedTime / 1000) + "s"); + } + } 
catch (Exception e) { + e.printStackTrace(System.err); + } + } +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java index cf7eb6e0a8e..f07008446f7 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java @@ -1,208 +1,208 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.Term; - -/** - * This class represents an indexing operation. The operation can be an insert, - * a delete or an update. If the operation is an insert or an update, a (new) - * document must be specified. If the operation is a delete or an update, a - * delete term must be specified. - */ -public class DocumentAndOp implements Writable { - - /** - * This class represents the type of an operation - an insert, a delete or - * an update. - */ - public static final class Op { - public static final Op INSERT = new Op("INSERT"); - public static final Op DELETE = new Op("DELETE"); - public static final Op UPDATE = new Op("UPDATE"); - - private String name; - - private Op(String name) { - this.name = name; - } - - public String toString() { - return name; - } - } - - private Op op; - private Document doc; - private Term term; - - /** - * Constructor for no operation. - */ - public DocumentAndOp() { - } - - /** - * Constructor for an insert operation. - * @param op - * @param doc - */ - public DocumentAndOp(Op op, Document doc) { - assert (op == Op.INSERT); - this.op = op; - this.doc = doc; - this.term = null; - } - - /** - * Constructor for a delete operation. - * @param op - * @param term - */ - public DocumentAndOp(Op op, Term term) { - assert (op == Op.DELETE); - this.op = op; - this.doc = null; - this.term = term; - } - - /** - * Constructor for an insert, a delete or an update operation. - * @param op - * @param doc - * @param term - */ - public DocumentAndOp(Op op, Document doc, Term term) { - if (op == Op.INSERT) { - assert (doc != null); - assert (term == null); - } else if (op == Op.DELETE) { - assert (doc == null); - assert (term != null); - } else { - assert (op == Op.UPDATE); - assert (doc != null); - assert (term != null); - } - this.op = op; - this.doc = doc; - this.term = term; - } - - /** - * Set the instance to be an insert operation. 
- * @param doc - */ - public void setInsert(Document doc) { - this.op = Op.INSERT; - this.doc = doc; - this.term = null; - } - - /** - * Set the instance to be a delete operation. - * @param term - */ - public void setDelete(Term term) { - this.op = Op.DELETE; - this.doc = null; - this.term = term; - } - - /** - * Set the instance to be an update operation. - * @param doc - * @param term - */ - public void setUpdate(Document doc, Term term) { - this.op = Op.UPDATE; - this.doc = doc; - this.term = term; - } - - /** - * Get the type of operation. - * @return the type of the operation. - */ - public Op getOp() { - return op; - } - - /** - * Get the document. - * @return the document - */ - public Document getDocument() { - return doc; - } - - /** - * Get the term. - * @return the term - */ - public Term getTerm() { - return term; - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append(this.getClass().getName()); - buffer.append("[op="); - buffer.append(op); - buffer.append(", doc="); - if (doc != null) { - buffer.append(doc); - } else { - buffer.append("null"); - } - buffer.append(", term="); - if (term != null) { - buffer.append(term); - } else { - buffer.append("null"); - } - buffer.append("]"); - return buffer.toString(); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - public void write(DataOutput out) throws IOException { - throw new IOException(this.getClass().getName() - + ".write should never be called"); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - public void readFields(DataInput in) throws IOException { - throw new IOException(this.getClass().getName() - + ".readFields should never be called"); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; + +/** + * This class represents an indexing operation. The operation can be an insert, + * a delete or an update. If the operation is an insert or an update, a (new) + * document must be specified. If the operation is a delete or an update, a + * delete term must be specified. + */ +public class DocumentAndOp implements Writable { + + /** + * This class represents the type of an operation - an insert, a delete or + * an update. 
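A brief sketch of how local analysis code builds these operations, where doc is a Lucene Document and term identifies the affected document (both hypothetical here):

    DocumentAndOp insert = new DocumentAndOp(DocumentAndOp.Op.INSERT, doc);
    DocumentAndOp delete = new DocumentAndOp(DocumentAndOp.Op.DELETE, term);
    DocumentAndOp update = new DocumentAndOp(DocumentAndOp.Op.UPDATE, doc, term);
    // or reuse a single instance via the setters
    DocumentAndOp op = new DocumentAndOp();
    op.setUpdate(doc, term);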
+ */ + public static final class Op { + public static final Op INSERT = new Op("INSERT"); + public static final Op DELETE = new Op("DELETE"); + public static final Op UPDATE = new Op("UPDATE"); + + private String name; + + private Op(String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + + private Op op; + private Document doc; + private Term term; + + /** + * Constructor for no operation. + */ + public DocumentAndOp() { + } + + /** + * Constructor for an insert operation. + * @param op + * @param doc + */ + public DocumentAndOp(Op op, Document doc) { + assert (op == Op.INSERT); + this.op = op; + this.doc = doc; + this.term = null; + } + + /** + * Constructor for a delete operation. + * @param op + * @param term + */ + public DocumentAndOp(Op op, Term term) { + assert (op == Op.DELETE); + this.op = op; + this.doc = null; + this.term = term; + } + + /** + * Constructor for an insert, a delete or an update operation. + * @param op + * @param doc + * @param term + */ + public DocumentAndOp(Op op, Document doc, Term term) { + if (op == Op.INSERT) { + assert (doc != null); + assert (term == null); + } else if (op == Op.DELETE) { + assert (doc == null); + assert (term != null); + } else { + assert (op == Op.UPDATE); + assert (doc != null); + assert (term != null); + } + this.op = op; + this.doc = doc; + this.term = term; + } + + /** + * Set the instance to be an insert operation. + * @param doc + */ + public void setInsert(Document doc) { + this.op = Op.INSERT; + this.doc = doc; + this.term = null; + } + + /** + * Set the instance to be a delete operation. + * @param term + */ + public void setDelete(Term term) { + this.op = Op.DELETE; + this.doc = null; + this.term = term; + } + + /** + * Set the instance to be an update operation. + * @param doc + * @param term + */ + public void setUpdate(Document doc, Term term) { + this.op = Op.UPDATE; + this.doc = doc; + this.term = term; + } + + /** + * Get the type of operation. + * @return the type of the operation. + */ + public Op getOp() { + return op; + } + + /** + * Get the document. + * @return the document + */ + public Document getDocument() { + return doc; + } + + /** + * Get the term. 
+ * @return the term + */ + public Term getTerm() { + return term; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append(this.getClass().getName()); + buffer.append("[op="); + buffer.append(op); + buffer.append(", doc="); + if (doc != null) { + buffer.append(doc); + } else { + buffer.append("null"); + } + buffer.append(", term="); + if (term != null) { + buffer.append(term); + } else { + buffer.append("null"); + } + buffer.append("]"); + return buffer.toString(); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) + */ + public void write(DataOutput out) throws IOException { + throw new IOException(this.getClass().getName() + + ".write should never be called"); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) + */ + public void readFields(DataInput in) throws IOException { + throw new IOException(this.getClass().getName() + + ".readFields should never be called"); + } +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java index c62afa8fca4..d5d0d9e0dd2 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java @@ -1,89 +1,89 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; - -/** - * The class represents a document id, which is of type text. - */ -public class DocumentID implements WritableComparable { - private final Text docID; - - /** - * Constructor. - */ - public DocumentID() { - docID = new Text(); - } - - /** - * The text of the document id. 
- * @return the text - */ - public Text getText() { - return docID; - } - - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - public int compareTo(Object obj) { - if (this == obj) { - return 0; - } else { - return docID.compareTo(((DocumentID) obj).docID); - } - } - - /* (non-Javadoc) - * @see java.lang.Object#hashCode() - */ - public int hashCode() { - return docID.hashCode(); - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - public String toString() { - return this.getClass().getName() + "[" + docID + "]"; - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - public void write(DataOutput out) throws IOException { - throw new IOException(this.getClass().getName() - + ".write should never be called"); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - public void readFields(DataInput in) throws IOException { - throw new IOException(this.getClass().getName() - + ".readFields should never be called"); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; + +/** + * The class represents a document id, which is of type text. + */ +public class DocumentID implements WritableComparable { + private final Text docID; + + /** + * Constructor. + */ + public DocumentID() { + docID = new Text(); + } + + /** + * The text of the document id. 
+ * @return the text + */ + public Text getText() { + return docID; + } + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + public int compareTo(Object obj) { + if (this == obj) { + return 0; + } else { + return docID.compareTo(((DocumentID) obj).docID); + } + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + public int hashCode() { + return docID.hashCode(); + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + return this.getClass().getName() + "[" + docID + "]"; + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) + */ + public void write(DataOutput out) throws IOException { + throw new IOException(this.getClass().getName() + + ".write should never be called"); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) + */ + public void readFields(DataInput in) throws IOException { + throw new IOException(this.getClass().getName() + + ".readFields should never be called"); + } +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java index ad01cd5ef09..f454ad61397 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java @@ -1,50 +1,50 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -/** - * A distribution policy decides, given a document with a document id, which - * one shard the request should be sent to if the request is an insert, and - * which shard(s) the request should be sent to if the request is a delete. - */ -public interface IDistributionPolicy { - - /** - * Initialization. It must be called before any chooseShard() is called. - * @param shards - */ - void init(Shard[] shards); - - /** - * Choose a shard to send an insert request. - * @param key - * @return the index of the chosen shard - */ - int chooseShardForInsert(DocumentID key); - - /** - * Choose a shard or all shards to send a delete request. E.g. a round-robin - * distribution policy would send a delete request to all the shards. - * -1 represents all the shards. - * @param key - * @return the index of the chosen shard, -1 if all the shards are chosen - */ - int chooseShardForDelete(DocumentID key); - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +/** + * A distribution policy decides, given a document with a document id, which + * one shard the request should be sent to if the request is an insert, and + * which shard(s) the request should be sent to if the request is a delete. + */ +public interface IDistributionPolicy { + + /** + * Initialization. It must be called before any chooseShard() is called. + * @param shards + */ + void init(Shard[] shards); + + /** + * Choose a shard to send an insert request. + * @param key + * @return the index of the chosen shard + */ + int chooseShardForInsert(DocumentID key); + + /** + * Choose a shard or all shards to send a delete request. E.g. a round-robin + * distribution policy would send a delete request to all the shards. + * -1 represents all the shards. + * @param key + * @return the index of the chosen shard, -1 if all the shards are chosen + */ + int chooseShardForDelete(DocumentID key); + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java index e9812cce6d6..9feb9a2db9a 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java @@ -1,46 +1,46 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -/** - * A class implements an index updater interface should create a Map/Reduce job - * configuration and run the Map/Reduce job to analyze documents and update - * Lucene instances in parallel. - */ -public interface IIndexUpdater { - - /** - * Create a Map/Reduce job configuration and run the Map/Reduce job to - * analyze documents and update Lucene instances in parallel. 
- * @param conf - * @param inputPaths - * @param outputPath - * @param numMapTasks - * @param shards - * @throws IOException - */ - void run(Configuration conf, Path[] inputPaths, Path outputPath, - int numMapTasks, Shard[] shards) throws IOException; - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +/** + * A class implements an index updater interface should create a Map/Reduce job + * configuration and run the Map/Reduce job to analyze documents and update + * Lucene instances in parallel. + */ +public interface IIndexUpdater { + + /** + * Create a Map/Reduce job configuration and run the Map/Reduce job to + * analyze documents and update Lucene instances in parallel. + * @param conf + * @param inputPaths + * @param outputPath + * @param numMapTasks + * @param shards + * @throws IOException + */ + void run(Configuration conf, Path[] inputPaths, Path outputPath, + int numMapTasks, Shard[] shards) throws IOException; + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java index e8fcc900c65..32d59c57664 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java @@ -1,32 +1,32 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.Mapper; - -/** - * Application specific local analysis. The output type must be (DocumentID, - * DocumentAndOp). 
- */ -public interface ILocalAnalysis - extends Mapper { - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.Mapper; + +/** + * Application specific local analysis. The output type must be (DocumentID, + * DocumentAndOp). + */ +public interface ILocalAnalysis + extends Mapper { + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java index 09c58d85b5a..bb84ba8253f 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java @@ -1,111 +1,111 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.MapReduceBase; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reducer; -import org.apache.hadoop.mapred.Reporter; - -/** - * This combiner combines multiple intermediate forms into one intermediate - * form. More specifically, the input intermediate forms are a single-document - * ram index and/or a single delete term. An output intermediate form contains - * a multi-document ram index and/or multiple delete terms. 
- */ -public class IndexUpdateCombiner extends MapReduceBase implements - Reducer { - static final Log LOG = LogFactory.getLog(IndexUpdateCombiner.class); - - IndexUpdateConfiguration iconf; - long maxSizeInBytes; - long nearMaxSizeInBytes; - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.Reducer#reduce(java.lang.Object, java.util.Iterator, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) - */ - public void reduce(Shard key, Iterator values, - OutputCollector output, Reporter reporter) - throws IOException { - - String message = key.toString(); - IntermediateForm form = null; - - while (values.hasNext()) { - IntermediateForm singleDocForm = values.next(); - long formSize = form == null ? 0 : form.totalSizeInBytes(); - long singleDocFormSize = singleDocForm.totalSizeInBytes(); - - if (form != null && formSize + singleDocFormSize > maxSizeInBytes) { - closeForm(form, message); - output.collect(key, form); - form = null; - } - - if (form == null && singleDocFormSize >= nearMaxSizeInBytes) { - output.collect(key, singleDocForm); - } else { - if (form == null) { - form = createForm(message); - } - form.process(singleDocForm); - } - } - - if (form != null) { - closeForm(form, message); - output.collect(key, form); - } - } - - private IntermediateForm createForm(String message) throws IOException { - LOG.info("Construct a form writer for " + message); - IntermediateForm form = new IntermediateForm(); - form.configure(iconf); - return form; - } - - private void closeForm(IntermediateForm form, String message) - throws IOException { - form.closeWriter(); - LOG.info("Closed the form writer for " + message + ", form = " + form); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - iconf = new IndexUpdateConfiguration(job); - maxSizeInBytes = iconf.getMaxRAMSizeInBytes(); - nearMaxSizeInBytes = maxSizeInBytes - (maxSizeInBytes >>> 3); // 7/8 of max - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#close() - */ - public void close() throws IOException { - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +/** + * This combiner combines multiple intermediate forms into one intermediate + * form. 
More specifically, the input intermediate forms are a single-document + * ram index and/or a single delete term. An output intermediate form contains + * a multi-document ram index and/or multiple delete terms. + */ +public class IndexUpdateCombiner extends MapReduceBase implements + Reducer { + static final Log LOG = LogFactory.getLog(IndexUpdateCombiner.class); + + IndexUpdateConfiguration iconf; + long maxSizeInBytes; + long nearMaxSizeInBytes; + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Reducer#reduce(java.lang.Object, java.util.Iterator, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) + */ + public void reduce(Shard key, Iterator values, + OutputCollector output, Reporter reporter) + throws IOException { + + String message = key.toString(); + IntermediateForm form = null; + + while (values.hasNext()) { + IntermediateForm singleDocForm = values.next(); + long formSize = form == null ? 0 : form.totalSizeInBytes(); + long singleDocFormSize = singleDocForm.totalSizeInBytes(); + + if (form != null && formSize + singleDocFormSize > maxSizeInBytes) { + closeForm(form, message); + output.collect(key, form); + form = null; + } + + if (form == null && singleDocFormSize >= nearMaxSizeInBytes) { + output.collect(key, singleDocForm); + } else { + if (form == null) { + form = createForm(message); + } + form.process(singleDocForm); + } + } + + if (form != null) { + closeForm(form, message); + output.collect(key, form); + } + } + + private IntermediateForm createForm(String message) throws IOException { + LOG.info("Construct a form writer for " + message); + IntermediateForm form = new IntermediateForm(); + form.configure(iconf); + return form; + } + + private void closeForm(IntermediateForm form, String message) + throws IOException { + form.closeWriter(); + LOG.info("Closed the form writer for " + message + ", form = " + form); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + iconf = new IndexUpdateConfiguration(job); + maxSizeInBytes = iconf.getMaxRAMSizeInBytes(); + nearMaxSizeInBytes = maxSizeInBytes - (maxSizeInBytes >>> 3); // 7/8 of max + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#close() + */ + public void close() throws IOException { + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java index 8de0ff5adaf..16ad84440fa 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java @@ -1,256 +1,256 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.contrib.index.example.HashingDistributionPolicy; -import org.apache.hadoop.contrib.index.example.LineDocInputFormat; -import org.apache.hadoop.contrib.index.example.LineDocLocalAnalysis; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapreduce.MRConfig; -import org.apache.hadoop.mapreduce.MRJobConfig; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; - -/** - * This class provides the getters and the setters to a number of parameters. - * Most of the parameters are related to the index update and the rest are - * from the existing Map/Reduce parameters. - */ -public class IndexUpdateConfiguration { - final Configuration conf; - - /** - * Constructor - * @param conf - */ - public IndexUpdateConfiguration(Configuration conf) { - this.conf = conf; - } - - /** - * Get the underlying configuration object. - * @return the configuration - */ - public Configuration getConfiguration() { - return conf; - } - - // - // existing map/reduce properties - // - // public int getIOFileBufferSize() { - // return getInt("io.file.buffer.size", 4096); - // } - - /** - * Get the IO sort space in MB. - * @return the IO sort space in MB - */ - public int getIOSortMB() { - return conf.getInt(MRJobConfig.IO_SORT_MB, 100); - } - - /** - * Set the IO sort space in MB. - * @param mb the IO sort space in MB - */ - public void setIOSortMB(int mb) { - conf.setInt(MRJobConfig.IO_SORT_MB, mb); - } - - /** - * Get the Map/Reduce temp directory. - * @return the Map/Reduce temp directory - */ - public String getMapredTempDir() { - return conf.get(MRConfig.TEMP_DIR); - } - - // - // properties for index update - // - /** - * Get the distribution policy class. - * @return the distribution policy class - */ - public Class getDistributionPolicyClass() { - return conf.getClass("sea.distribution.policy", - HashingDistributionPolicy.class, IDistributionPolicy.class); - } - - /** - * Set the distribution policy class. - * @param theClass the distribution policy class - */ - public void setDistributionPolicyClass( - Class theClass) { - conf.setClass("sea.distribution.policy", theClass, - IDistributionPolicy.class); - } - - /** - * Get the analyzer class. - * @return the analyzer class - */ - public Class getDocumentAnalyzerClass() { - return conf.getClass("sea.document.analyzer", StandardAnalyzer.class, - Analyzer.class); - } - - /** - * Set the analyzer class. - * @param theClass the analyzer class - */ - public void setDocumentAnalyzerClass(Class theClass) { - conf.setClass("sea.document.analyzer", theClass, Analyzer.class); - } - - /** - * Get the index input format class. - * @return the index input format class - */ - public Class getIndexInputFormatClass() { - return conf.getClass("sea.input.format", LineDocInputFormat.class, - InputFormat.class); - } - - /** - * Set the index input format class. 
- * @param theClass the index input format class - */ - public void setIndexInputFormatClass(Class theClass) { - conf.setClass("sea.input.format", theClass, InputFormat.class); - } - - /** - * Get the index updater class. - * @return the index updater class - */ - public Class getIndexUpdaterClass() { - return conf.getClass("sea.index.updater", IndexUpdater.class, - IIndexUpdater.class); - } - - /** - * Set the index updater class. - * @param theClass the index updater class - */ - public void setIndexUpdaterClass(Class theClass) { - conf.setClass("sea.index.updater", theClass, IIndexUpdater.class); - } - - /** - * Get the local analysis class. - * @return the local analysis class - */ - public Class getLocalAnalysisClass() { - return conf.getClass("sea.local.analysis", LineDocLocalAnalysis.class, - ILocalAnalysis.class); - } - - /** - * Set the local analysis class. - * @param theClass the local analysis class - */ - public void setLocalAnalysisClass(Class theClass) { - conf.setClass("sea.local.analysis", theClass, ILocalAnalysis.class); - } - - /** - * Get the string representation of a number of shards. - * @return the string representation of a number of shards - */ - public String getIndexShards() { - return conf.get("sea.index.shards"); - } - - /** - * Set the string representation of a number of shards. - * @param shards the string representation of a number of shards - */ - public void setIndexShards(String shards) { - conf.set("sea.index.shards", shards); - } - - /** - * Get the max field length for a Lucene instance. - * @return the max field length for a Lucene instance - */ - public int getIndexMaxFieldLength() { - return conf.getInt("sea.max.field.length", -1); - } - - /** - * Set the max field length for a Lucene instance. - * @param maxFieldLength the max field length for a Lucene instance - */ - public void setIndexMaxFieldLength(int maxFieldLength) { - conf.setInt("sea.max.field.length", maxFieldLength); - } - - /** - * Get the max number of segments for a Lucene instance. - * @return the max number of segments for a Lucene instance - */ - public int getIndexMaxNumSegments() { - return conf.getInt("sea.max.num.segments", -1); - } - - /** - * Set the max number of segments for a Lucene instance. - * @param maxNumSegments the max number of segments for a Lucene instance - */ - public void setIndexMaxNumSegments(int maxNumSegments) { - conf.setInt("sea.max.num.segments", maxNumSegments); - } - - /** - * Check whether to use the compound file format for a Lucene instance. - * @return true if using the compound file format for a Lucene instance - */ - public boolean getIndexUseCompoundFile() { - return conf.getBoolean("sea.use.compound.file", false); - } - - /** - * Set whether use the compound file format for a Lucene instance. - * @param useCompoundFile whether to use the compound file format - */ - public void setIndexUseCompoundFile(boolean useCompoundFile) { - conf.setBoolean("sea.use.compound.file", useCompoundFile); - } - - /** - * Get the max ram index size in bytes. The default is 50M. - * @return the max ram index size in bytes - */ - public long getMaxRAMSizeInBytes() { - return conf.getLong("sea.max.ramsize.bytes", 50L << 20); - } - - /** - * Set the max ram index size in bytes. - * @param b the max ram index size in bytes - */ - public void setMaxRAMSizeInBytes(long b) { - conf.setLong("sea.max.ramsize.bytes", b); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.contrib.index.example.HashingDistributionPolicy; +import org.apache.hadoop.contrib.index.example.LineDocInputFormat; +import org.apache.hadoop.contrib.index.example.LineDocLocalAnalysis; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapreduce.MRConfig; +import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +/** + * This class provides the getters and the setters to a number of parameters. + * Most of the parameters are related to the index update and the rest are + * from the existing Map/Reduce parameters. + */ +public class IndexUpdateConfiguration { + final Configuration conf; + + /** + * Constructor + * @param conf + */ + public IndexUpdateConfiguration(Configuration conf) { + this.conf = conf; + } + + /** + * Get the underlying configuration object. + * @return the configuration + */ + public Configuration getConfiguration() { + return conf; + } + + // + // existing map/reduce properties + // + // public int getIOFileBufferSize() { + // return getInt("io.file.buffer.size", 4096); + // } + + /** + * Get the IO sort space in MB. + * @return the IO sort space in MB + */ + public int getIOSortMB() { + return conf.getInt(MRJobConfig.IO_SORT_MB, 100); + } + + /** + * Set the IO sort space in MB. + * @param mb the IO sort space in MB + */ + public void setIOSortMB(int mb) { + conf.setInt(MRJobConfig.IO_SORT_MB, mb); + } + + /** + * Get the Map/Reduce temp directory. + * @return the Map/Reduce temp directory + */ + public String getMapredTempDir() { + return conf.get(MRConfig.TEMP_DIR); + } + + // + // properties for index update + // + /** + * Get the distribution policy class. + * @return the distribution policy class + */ + public Class getDistributionPolicyClass() { + return conf.getClass("sea.distribution.policy", + HashingDistributionPolicy.class, IDistributionPolicy.class); + } + + /** + * Set the distribution policy class. + * @param theClass the distribution policy class + */ + public void setDistributionPolicyClass( + Class theClass) { + conf.setClass("sea.distribution.policy", theClass, + IDistributionPolicy.class); + } + + /** + * Get the analyzer class. + * @return the analyzer class + */ + public Class getDocumentAnalyzerClass() { + return conf.getClass("sea.document.analyzer", StandardAnalyzer.class, + Analyzer.class); + } + + /** + * Set the analyzer class. + * @param theClass the analyzer class + */ + public void setDocumentAnalyzerClass(Class theClass) { + conf.setClass("sea.document.analyzer", theClass, Analyzer.class); + } + + /** + * Get the index input format class. 
+ * @return the index input format class + */ + public Class getIndexInputFormatClass() { + return conf.getClass("sea.input.format", LineDocInputFormat.class, + InputFormat.class); + } + + /** + * Set the index input format class. + * @param theClass the index input format class + */ + public void setIndexInputFormatClass(Class theClass) { + conf.setClass("sea.input.format", theClass, InputFormat.class); + } + + /** + * Get the index updater class. + * @return the index updater class + */ + public Class getIndexUpdaterClass() { + return conf.getClass("sea.index.updater", IndexUpdater.class, + IIndexUpdater.class); + } + + /** + * Set the index updater class. + * @param theClass the index updater class + */ + public void setIndexUpdaterClass(Class theClass) { + conf.setClass("sea.index.updater", theClass, IIndexUpdater.class); + } + + /** + * Get the local analysis class. + * @return the local analysis class + */ + public Class getLocalAnalysisClass() { + return conf.getClass("sea.local.analysis", LineDocLocalAnalysis.class, + ILocalAnalysis.class); + } + + /** + * Set the local analysis class. + * @param theClass the local analysis class + */ + public void setLocalAnalysisClass(Class theClass) { + conf.setClass("sea.local.analysis", theClass, ILocalAnalysis.class); + } + + /** + * Get the string representation of a number of shards. + * @return the string representation of a number of shards + */ + public String getIndexShards() { + return conf.get("sea.index.shards"); + } + + /** + * Set the string representation of a number of shards. + * @param shards the string representation of a number of shards + */ + public void setIndexShards(String shards) { + conf.set("sea.index.shards", shards); + } + + /** + * Get the max field length for a Lucene instance. + * @return the max field length for a Lucene instance + */ + public int getIndexMaxFieldLength() { + return conf.getInt("sea.max.field.length", -1); + } + + /** + * Set the max field length for a Lucene instance. + * @param maxFieldLength the max field length for a Lucene instance + */ + public void setIndexMaxFieldLength(int maxFieldLength) { + conf.setInt("sea.max.field.length", maxFieldLength); + } + + /** + * Get the max number of segments for a Lucene instance. + * @return the max number of segments for a Lucene instance + */ + public int getIndexMaxNumSegments() { + return conf.getInt("sea.max.num.segments", -1); + } + + /** + * Set the max number of segments for a Lucene instance. + * @param maxNumSegments the max number of segments for a Lucene instance + */ + public void setIndexMaxNumSegments(int maxNumSegments) { + conf.setInt("sea.max.num.segments", maxNumSegments); + } + + /** + * Check whether to use the compound file format for a Lucene instance. + * @return true if using the compound file format for a Lucene instance + */ + public boolean getIndexUseCompoundFile() { + return conf.getBoolean("sea.use.compound.file", false); + } + + /** + * Set whether use the compound file format for a Lucene instance. + * @param useCompoundFile whether to use the compound file format + */ + public void setIndexUseCompoundFile(boolean useCompoundFile) { + conf.setBoolean("sea.use.compound.file", useCompoundFile); + } + + /** + * Get the max ram index size in bytes. The default is 50M. + * @return the max ram index size in bytes + */ + public long getMaxRAMSizeInBytes() { + return conf.getLong("sea.max.ramsize.bytes", 50L << 20); + } + + /** + * Set the max ram index size in bytes. 
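For context on the "sea.*" getters and setters in this hunk, a minimal usage sketch follows. It relies only on the methods and property keys visible in the diff; the JobConf wiring around it is illustrative, not prescribed by the patch:

    // A sketch of configuring an index update job through the wrapper.
    import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration;
    import org.apache.hadoop.mapred.JobConf;

    public class ConfigureIndexUpdate {
      public static void main(String[] args) {
        JobConf job = new JobConf();
        IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(job);

        iconf.setMaxRAMSizeInBytes(64L << 20);   // raise the 50 MB default to 64 MB
        iconf.setIndexMaxNumSegments(1);         // cap the number of Lucene segments per shard
        iconf.setIndexUseCompoundFile(false);

        // The wrapper stores everything in the underlying Configuration.
        System.out.println(iconf.getMaxRAMSizeInBytes());            // 67108864
        System.out.println(job.getLong("sea.max.ramsize.bytes", 0)); // same value via the raw key
      }
    }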
+ * @param b the max ram index size in bytes + */ + public void setMaxRAMSizeInBytes(long b) { + conf.setLong("sea.max.ramsize.bytes", b); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java index e6b39b5ddc6..5d4b4f68e68 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java @@ -1,199 +1,199 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.MapReduceBase; -import org.apache.hadoop.mapred.Mapper; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.util.ReflectionUtils; -import org.apache.lucene.analysis.Analyzer; - -/** - * This class applies local analysis on a key-value pair and then convert the - * result docid-operation pair to a shard-and-intermediate form pair. - */ -public class IndexUpdateMapper - extends MapReduceBase implements Mapper { - static final Log LOG = LogFactory.getLog(IndexUpdateMapper.class); - - /** - * Get the map output key class. - * @return the map output key class - */ - public static Class getMapOutputKeyClass() { - return Shard.class; - } - - /** - * Get the map output value class. - * @return the map output value class - */ - public static Class getMapOutputValueClass() { - return IntermediateForm.class; - } - - IndexUpdateConfiguration iconf; - private Analyzer analyzer; - private Shard[] shards; - private IDistributionPolicy distributionPolicy; - - private ILocalAnalysis localAnalysis; - private DocumentID tmpKey; - private DocumentAndOp tmpValue; - - private OutputCollector tmpCollector = - new OutputCollector() { - public void collect(DocumentID key, DocumentAndOp value) - throws IOException { - tmpKey = key; - tmpValue = value; - } - }; - - /** - * Map a key-value pair to a shard-and-intermediate form pair. Internally, - * the local analysis is first applied to map the key-value pair to a - * document id-and-operation pair, then the docid-and-operation pair is - * mapped to a shard-intermediate form pair. The intermediate form is of the - * form of a single-document ram index and/or a single delete term. 
- */ - public void map(K key, V value, - OutputCollector output, Reporter reporter) - throws IOException { - - synchronized (this) { - localAnalysis.map(key, value, tmpCollector, reporter); - - if (tmpKey != null && tmpValue != null) { - DocumentAndOp doc = tmpValue; - IntermediateForm form = new IntermediateForm(); - form.configure(iconf); - form.process(doc, analyzer); - form.closeWriter(); - - if (doc.getOp() == DocumentAndOp.Op.INSERT) { - int chosenShard = distributionPolicy.chooseShardForInsert(tmpKey); - if (chosenShard >= 0) { - // insert into one shard - output.collect(shards[chosenShard], form); - } else { - throw new IOException("Chosen shard for insert must be >= 0"); - } - - } else if (doc.getOp() == DocumentAndOp.Op.DELETE) { - int chosenShard = distributionPolicy.chooseShardForDelete(tmpKey); - if (chosenShard >= 0) { - // delete from one shard - output.collect(shards[chosenShard], form); - } else { - // broadcast delete to all shards - for (int i = 0; i < shards.length; i++) { - output.collect(shards[i], form); - } - } - - } else { // UPDATE - int insertToShard = distributionPolicy.chooseShardForInsert(tmpKey); - int deleteFromShard = - distributionPolicy.chooseShardForDelete(tmpKey); - - if (insertToShard >= 0) { - if (insertToShard == deleteFromShard) { - // update into one shard - output.collect(shards[insertToShard], form); - } else { - // prepare a deletion form - IntermediateForm deletionForm = new IntermediateForm(); - deletionForm.configure(iconf); - deletionForm.process(new DocumentAndOp(DocumentAndOp.Op.DELETE, - doc.getTerm()), analyzer); - deletionForm.closeWriter(); - - if (deleteFromShard >= 0) { - // delete from one shard - output.collect(shards[deleteFromShard], deletionForm); - } else { - // broadcast delete to all shards - for (int i = 0; i < shards.length; i++) { - output.collect(shards[i], deletionForm); - } - } - - // prepare an insertion form - IntermediateForm insertionForm = new IntermediateForm(); - insertionForm.configure(iconf); - insertionForm.process(new DocumentAndOp(DocumentAndOp.Op.INSERT, - doc.getDocument()), analyzer); - insertionForm.closeWriter(); - - // insert into one shard - output.collect(shards[insertToShard], insertionForm); - } - } else { - throw new IOException("Chosen shard for insert must be >= 0"); - } - } - } - } - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - iconf = new IndexUpdateConfiguration(job); - analyzer = - (Analyzer) ReflectionUtils.newInstance( - iconf.getDocumentAnalyzerClass(), job); - - localAnalysis = - (ILocalAnalysis) ReflectionUtils.newInstance( - iconf.getLocalAnalysisClass(), job); - localAnalysis.configure(job); - - shards = Shard.getIndexShards(iconf); - - distributionPolicy = - (IDistributionPolicy) ReflectionUtils.newInstance( - iconf.getDistributionPolicyClass(), job); - distributionPolicy.init(shards); - - LOG.info("sea.document.analyzer = " + analyzer.getClass().getName()); - LOG.info("sea.local.analysis = " + localAnalysis.getClass().getName()); - LOG.info(shards.length + " shards = " + iconf.getIndexShards()); - LOG.info("sea.distribution.policy = " - + distributionPolicy.getClass().getName()); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#close() - */ - public void close() throws IOException { - localAnalysis.close(); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.lucene.analysis.Analyzer; + +/** + * This class applies local analysis on a key-value pair and then convert the + * result docid-operation pair to a shard-and-intermediate form pair. + */ +public class IndexUpdateMapper + extends MapReduceBase implements Mapper { + static final Log LOG = LogFactory.getLog(IndexUpdateMapper.class); + + /** + * Get the map output key class. + * @return the map output key class + */ + public static Class getMapOutputKeyClass() { + return Shard.class; + } + + /** + * Get the map output value class. + * @return the map output value class + */ + public static Class getMapOutputValueClass() { + return IntermediateForm.class; + } + + IndexUpdateConfiguration iconf; + private Analyzer analyzer; + private Shard[] shards; + private IDistributionPolicy distributionPolicy; + + private ILocalAnalysis localAnalysis; + private DocumentID tmpKey; + private DocumentAndOp tmpValue; + + private OutputCollector tmpCollector = + new OutputCollector() { + public void collect(DocumentID key, DocumentAndOp value) + throws IOException { + tmpKey = key; + tmpValue = value; + } + }; + + /** + * Map a key-value pair to a shard-and-intermediate form pair. Internally, + * the local analysis is first applied to map the key-value pair to a + * document id-and-operation pair, then the docid-and-operation pair is + * mapped to a shard-intermediate form pair. The intermediate form is of the + * form of a single-document ram index and/or a single delete term. 
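The javadoc above summarizes IndexUpdateMapper.map(); the shard routing it implements can be condensed as follows. This is a detached restatement for readability, not part of the patch, with the Hadoop and Lucene types replaced by plain ints and an enum:

    // Condensed restatement of the routing rules in IndexUpdateMapper.map().
    import java.util.ArrayList;
    import java.util.List;

    public class ShardRoutingSketch {
      enum Op { INSERT, DELETE, UPDATE }

      /** Returns the shard indices that receive output for one document operation. */
      static List<Integer> route(Op op, int insertShard, int deleteShard, int numShards) {
        List<Integer> targets = new ArrayList<Integer>();
        switch (op) {
          case INSERT:
            if (insertShard < 0) throw new IllegalStateException("insert shard must be >= 0");
            targets.add(insertShard);
            break;
          case DELETE:
            if (deleteShard >= 0) {
              targets.add(deleteShard);              // targeted delete
            } else {
              for (int i = 0; i < numShards; i++) {  // broadcast delete to all shards
                targets.add(i);
              }
            }
            break;
          case UPDATE:
            if (insertShard < 0) throw new IllegalStateException("insert shard must be >= 0");
            if (insertShard == deleteShard) {
              targets.add(insertShard);              // combined update into one shard
            } else {
              // separate deletion (targeted or broadcast) followed by an insertion
              targets.addAll(route(Op.DELETE, insertShard, deleteShard, numShards));
              targets.add(insertShard);
            }
            break;
        }
        return targets;
      }

      public static void main(String[] args) {
        System.out.println(route(Op.UPDATE, 2, -1, 4)); // [0, 1, 2, 3, 2]
      }
    }

An UPDATE whose insert and delete shards coincide collapses to a single combined emit, which is why the mapper builds separate deletion and insertion forms only in the mismatched case.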
+ */ + public void map(K key, V value, + OutputCollector output, Reporter reporter) + throws IOException { + + synchronized (this) { + localAnalysis.map(key, value, tmpCollector, reporter); + + if (tmpKey != null && tmpValue != null) { + DocumentAndOp doc = tmpValue; + IntermediateForm form = new IntermediateForm(); + form.configure(iconf); + form.process(doc, analyzer); + form.closeWriter(); + + if (doc.getOp() == DocumentAndOp.Op.INSERT) { + int chosenShard = distributionPolicy.chooseShardForInsert(tmpKey); + if (chosenShard >= 0) { + // insert into one shard + output.collect(shards[chosenShard], form); + } else { + throw new IOException("Chosen shard for insert must be >= 0"); + } + + } else if (doc.getOp() == DocumentAndOp.Op.DELETE) { + int chosenShard = distributionPolicy.chooseShardForDelete(tmpKey); + if (chosenShard >= 0) { + // delete from one shard + output.collect(shards[chosenShard], form); + } else { + // broadcast delete to all shards + for (int i = 0; i < shards.length; i++) { + output.collect(shards[i], form); + } + } + + } else { // UPDATE + int insertToShard = distributionPolicy.chooseShardForInsert(tmpKey); + int deleteFromShard = + distributionPolicy.chooseShardForDelete(tmpKey); + + if (insertToShard >= 0) { + if (insertToShard == deleteFromShard) { + // update into one shard + output.collect(shards[insertToShard], form); + } else { + // prepare a deletion form + IntermediateForm deletionForm = new IntermediateForm(); + deletionForm.configure(iconf); + deletionForm.process(new DocumentAndOp(DocumentAndOp.Op.DELETE, + doc.getTerm()), analyzer); + deletionForm.closeWriter(); + + if (deleteFromShard >= 0) { + // delete from one shard + output.collect(shards[deleteFromShard], deletionForm); + } else { + // broadcast delete to all shards + for (int i = 0; i < shards.length; i++) { + output.collect(shards[i], deletionForm); + } + } + + // prepare an insertion form + IntermediateForm insertionForm = new IntermediateForm(); + insertionForm.configure(iconf); + insertionForm.process(new DocumentAndOp(DocumentAndOp.Op.INSERT, + doc.getDocument()), analyzer); + insertionForm.closeWriter(); + + // insert into one shard + output.collect(shards[insertToShard], insertionForm); + } + } else { + throw new IOException("Chosen shard for insert must be >= 0"); + } + } + } + } + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + iconf = new IndexUpdateConfiguration(job); + analyzer = + (Analyzer) ReflectionUtils.newInstance( + iconf.getDocumentAnalyzerClass(), job); + + localAnalysis = + (ILocalAnalysis) ReflectionUtils.newInstance( + iconf.getLocalAnalysisClass(), job); + localAnalysis.configure(job); + + shards = Shard.getIndexShards(iconf); + + distributionPolicy = + (IDistributionPolicy) ReflectionUtils.newInstance( + iconf.getDistributionPolicyClass(), job); + distributionPolicy.init(shards); + + LOG.info("sea.document.analyzer = " + analyzer.getClass().getName()); + LOG.info("sea.local.analysis = " + localAnalysis.getClass().getName()); + LOG.info(shards.length + " shards = " + iconf.getIndexShards()); + LOG.info("sea.distribution.policy = " + + distributionPolicy.getClass().getName()); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#close() + */ + public void close() throws IOException { + localAnalysis.close(); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java 
b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java index d5fb9df14dc..7ebc561dbf6 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java @@ -1,60 +1,60 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.util.HashMap; -import java.util.Map; - -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Partitioner; - -/** - * This partitioner class puts the values of the same key - in this case the - * same shard - in the same partition. - */ -public class IndexUpdatePartitioner implements - Partitioner { - - private Shard[] shards; - private Map map; - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.Partitioner#getPartition(java.lang.Object, java.lang.Object, int) - */ - public int getPartition(Shard key, IntermediateForm value, int numPartitions) { - int partition = map.get(key).intValue(); - if (partition < numPartitions) { - return partition; - } else { - return numPartitions - 1; - } - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - shards = Shard.getIndexShards(new IndexUpdateConfiguration(job)); - map = new HashMap(); - for (int i = 0; i < shards.length; i++) { - map.put(shards[i], i); - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Partitioner; + +/** + * This partitioner class puts the values of the same key - in this case the + * same shard - in the same partition. 
+ */ +public class IndexUpdatePartitioner implements + Partitioner { + + private Shard[] shards; + private Map map; + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Partitioner#getPartition(java.lang.Object, java.lang.Object, int) + */ + public int getPartition(Shard key, IntermediateForm value, int numPartitions) { + int partition = map.get(key).intValue(); + if (partition < numPartitions) { + return partition; + } else { + return numPartitions - 1; + } + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + shards = Shard.getIndexShards(new IndexUpdateConfiguration(job)); + map = new HashMap(); + for (int i = 0; i < shards.length; i++) { + map.put(shards[i], i); + } + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java index 140e8f98fc5..5b1ab824afd 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java @@ -1,143 +1,143 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.contrib.index.lucene.ShardWriter; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Closeable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.MapReduceBase; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reducer; -import org.apache.hadoop.mapred.Reporter; - -/** - * This reducer applies to a shard the changes for it. A "new version" of - * a shard is created at the end of a reduce. It is important to note that - * the new version of the shard is not derived from scratch. By leveraging - * Lucene's update algorithm, the new version of each Lucene instance will - * share as many files as possible as the previous version. - */ -public class IndexUpdateReducer extends MapReduceBase implements - Reducer { - static final Log LOG = LogFactory.getLog(IndexUpdateReducer.class); - static final Text DONE = new Text("done"); - - /** - * Get the reduce output key class. 
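The partitioner above looks up each shard's index in a map built during configure() and clamps any index beyond the reduce count to the last partition. The clamping rule in isolation (standalone Java, for illustration only):

    // Minimal restatement of the rule in IndexUpdatePartitioner.getPartition().
    public class PartitionClampDemo {
      static int getPartition(int shardIndex, int numPartitions) {
        return shardIndex < numPartitions ? shardIndex : numPartitions - 1;
      }

      public static void main(String[] args) {
        // With 4 shards but only 3 reduce partitions, shard 3 spills into partition 2.
        System.out.println(getPartition(0, 3)); // 0
        System.out.println(getPartition(3, 3)); // 2
      }
    }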
- * @return the reduce output key class - */ - public static Class getOutputKeyClass() { - return Shard.class; - } - - /** - * Get the reduce output value class. - * @return the reduce output value class - */ - public static Class getOutputValueClass() { - return Text.class; - } - - private IndexUpdateConfiguration iconf; - private String mapredTempDir; - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.Reducer#reduce(java.lang.Object, java.util.Iterator, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) - */ - public void reduce(Shard key, Iterator values, - OutputCollector output, Reporter reporter) - throws IOException { - - LOG.info("Construct a shard writer for " + key); - FileSystem fs = FileSystem.get(iconf.getConfiguration()); - String temp = - mapredTempDir + Path.SEPARATOR + "shard_" + System.currentTimeMillis(); - final ShardWriter writer = new ShardWriter(fs, key, temp, iconf); - - // update the shard - while (values.hasNext()) { - IntermediateForm form = values.next(); - writer.process(form); - reporter.progress(); - } - - // close the shard - final Reporter fReporter = reporter; - new Closeable() { - volatile boolean closed = false; - - public void close() throws IOException { - // spawn a thread to give progress heartbeats - Thread prog = new Thread() { - public void run() { - while (!closed) { - try { - fReporter.setStatus("closing"); - Thread.sleep(1000); - } catch (InterruptedException e) { - continue; - } catch (Throwable e) { - return; - } - } - } - }; - - try { - prog.start(); - - if (writer != null) { - writer.close(); - } - } finally { - closed = true; - } - } - }.close(); - LOG.info("Closed the shard writer for " + key + ", writer = " + writer); - - output.collect(key, DONE); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) - */ - public void configure(JobConf job) { - iconf = new IndexUpdateConfiguration(job); - mapredTempDir = iconf.getMapredTempDir(); - mapredTempDir = Shard.normalizePath(mapredTempDir); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.mapred.MapReduceBase#close() - */ - public void close() throws IOException { - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.contrib.index.lucene.ShardWriter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Closeable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +/** + * This reducer applies to a shard the changes for it. A "new version" of + * a shard is created at the end of a reduce. It is important to note that + * the new version of the shard is not derived from scratch. By leveraging + * Lucene's update algorithm, the new version of each Lucene instance will + * share as many files as possible as the previous version. + */ +public class IndexUpdateReducer extends MapReduceBase implements + Reducer { + static final Log LOG = LogFactory.getLog(IndexUpdateReducer.class); + static final Text DONE = new Text("done"); + + /** + * Get the reduce output key class. + * @return the reduce output key class + */ + public static Class getOutputKeyClass() { + return Shard.class; + } + + /** + * Get the reduce output value class. + * @return the reduce output value class + */ + public static Class getOutputValueClass() { + return Text.class; + } + + private IndexUpdateConfiguration iconf; + private String mapredTempDir; + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Reducer#reduce(java.lang.Object, java.util.Iterator, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) + */ + public void reduce(Shard key, Iterator values, + OutputCollector output, Reporter reporter) + throws IOException { + + LOG.info("Construct a shard writer for " + key); + FileSystem fs = FileSystem.get(iconf.getConfiguration()); + String temp = + mapredTempDir + Path.SEPARATOR + "shard_" + System.currentTimeMillis(); + final ShardWriter writer = new ShardWriter(fs, key, temp, iconf); + + // update the shard + while (values.hasNext()) { + IntermediateForm form = values.next(); + writer.process(form); + reporter.progress(); + } + + // close the shard + final Reporter fReporter = reporter; + new Closeable() { + volatile boolean closed = false; + + public void close() throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + public void run() { + while (!closed) { + try { + fReporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { + continue; + } catch (Throwable e) { + return; + } + } + } + }; + + try { + prog.start(); + + if (writer != null) { + writer.close(); + } + } finally { + closed = true; + } + } + }.close(); + LOG.info("Closed the shard writer for " + key + ", writer = " + writer); + + output.collect(key, DONE); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf) + */ + public void configure(JobConf job) { + iconf = new IndexUpdateConfiguration(job); + mapredTempDir = iconf.getMapredTempDir(); + mapredTempDir = Shard.normalizePath(mapredTempDir); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.MapReduceBase#close() + */ + public void close() throws IOException { + } + +} diff --git 
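Closing a ShardWriter can take a while, so the reducer above keeps the task alive by reporting status from a background thread until the close returns. A detached sketch of that pattern follows; the names are illustrative, an AtomicBoolean stands in for the reducer's volatile flag, and a Runnable stands in for the Hadoop Reporter:

    // The progress-heartbeat-while-closing pattern used by IndexUpdateReducer.
    import java.util.concurrent.atomic.AtomicBoolean;

    public class HeartbeatCloseSketch {
      interface SlowCloseable { void close() throws Exception; }

      static void closeWithHeartbeat(final SlowCloseable writer, final Runnable heartbeat)
          throws Exception {
        final AtomicBoolean closed = new AtomicBoolean(false);
        Thread prog = new Thread() {
          public void run() {
            while (!closed.get()) {
              try {
                heartbeat.run();       // e.g. reporter.setStatus("closing")
                Thread.sleep(1000);
              } catch (InterruptedException e) {
                // ignore and re-check the flag
              }
            }
          }
        };
        try {
          prog.start();
          writer.close();              // the potentially slow Lucene close
        } finally {
          closed.set(true);            // lets the heartbeat thread exit
          prog.interrupt();            // wake it from sleep so it exits promptly
        }
      }
    }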
a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java index 3ab14152c98..13450897b6a 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java @@ -1,252 +1,252 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Collection; -import java.util.Iterator; -import java.util.concurrent.ConcurrentLinkedQueue; - -import org.apache.hadoop.contrib.index.lucene.RAMDirectoryUtil; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; - -/** - * An intermediate form for one or more parsed Lucene documents and/or - * delete terms. It actually uses Lucene file format as the format for - * the intermediate form by using RAM dir files. - * - * Note: If process(*) is ever called, closeWriter() should be called. - * Otherwise, no need to call closeWriter(). - */ -public class IntermediateForm implements Writable { - - private IndexUpdateConfiguration iconf = null; - private final Collection deleteList; - private RAMDirectory dir; - private IndexWriter writer; - private int numDocs; - - /** - * Constructor - * @throws IOException - */ - public IntermediateForm() throws IOException { - deleteList = new ConcurrentLinkedQueue(); - dir = new RAMDirectory(); - writer = null; - numDocs = 0; - } - - /** - * Configure using an index update configuration. - * @param iconf the index update configuration - */ - public void configure(IndexUpdateConfiguration iconf) { - this.iconf = iconf; - } - - /** - * Get the ram directory of the intermediate form. - * @return the ram directory - */ - public Directory getDirectory() { - return dir; - } - - /** - * Get an iterator for the delete terms in the intermediate form. - * @return an iterator for the delete terms - */ - public Iterator deleteTermIterator() { - return deleteList.iterator(); - } - - /** - * This method is used by the index update mapper and process a document - * operation into the current intermediate form. 
- * @param doc input document operation - * @param analyzer the analyzer - * @throws IOException - */ - public void process(DocumentAndOp doc, Analyzer analyzer) throws IOException { - if (doc.getOp() == DocumentAndOp.Op.DELETE - || doc.getOp() == DocumentAndOp.Op.UPDATE) { - deleteList.add(doc.getTerm()); - - } - - if (doc.getOp() == DocumentAndOp.Op.INSERT - || doc.getOp() == DocumentAndOp.Op.UPDATE) { - - if (writer == null) { - // analyzer is null because we specify an analyzer with addDocument - writer = createWriter(); - } - - writer.addDocument(doc.getDocument(), analyzer); - numDocs++; - } - - } - - /** - * This method is used by the index update combiner and process an - * intermediate form into the current intermediate form. More specifically, - * the input intermediate forms are a single-document ram index and/or a - * single delete term. - * @param form the input intermediate form - * @throws IOException - */ - public void process(IntermediateForm form) throws IOException { - if (form.deleteList.size() > 0) { - deleteList.addAll(form.deleteList); - } - - if (form.dir.sizeInBytes() > 0) { - if (writer == null) { - writer = createWriter(); - } - - writer.addIndexesNoOptimize(new Directory[] { form.dir }); - numDocs++; - } - - } - - /** - * Close the Lucene index writer associated with the intermediate form, - * if created. Do not close the ram directory. In fact, there is no need - * to close a ram directory. - * @throws IOException - */ - public void closeWriter() throws IOException { - if (writer != null) { - writer.close(); - writer = null; - } - } - - /** - * The total size of files in the directory and ram used by the index writer. - * It does not include memory used by the delete list. - * @return the total size in bytes - */ - public long totalSizeInBytes() throws IOException { - long size = dir.sizeInBytes(); - if (writer != null) { - size += writer.ramSizeInBytes(); - } - return size; - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append(this.getClass().getSimpleName()); - buffer.append("[numDocs="); - buffer.append(numDocs); - buffer.append(", numDeletes="); - buffer.append(deleteList.size()); - if (deleteList.size() > 0) { - buffer.append("("); - Iterator iter = deleteTermIterator(); - while (iter.hasNext()) { - buffer.append(iter.next()); - buffer.append(" "); - } - buffer.append(")"); - } - buffer.append("]"); - return buffer.toString(); - } - - private IndexWriter createWriter() throws IOException { - IndexWriter writer = - new IndexWriter(dir, false, null, - new KeepOnlyLastCommitDeletionPolicy()); - writer.setUseCompoundFile(false); - - if (iconf != null) { - int maxFieldLength = iconf.getIndexMaxFieldLength(); - if (maxFieldLength > 0) { - writer.setMaxFieldLength(maxFieldLength); - } - } - - return writer; - } - - private void resetForm() throws IOException { - deleteList.clear(); - if (dir.sizeInBytes() > 0) { - // it's ok if we don't close a ram directory - dir.close(); - // an alternative is to delete all the files and reuse the ram directory - dir = new RAMDirectory(); - } - assert (writer == null); - numDocs = 0; - } - - // /////////////////////////////////// - // Writable - // /////////////////////////////////// - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) - */ - public void write(DataOutput out) throws IOException { - out.writeInt(deleteList.size()); - for (Term term : deleteList) { - Text.writeString(out, 
term.field()); - Text.writeString(out, term.text()); - } - - String[] files = dir.list(); - RAMDirectoryUtil.writeRAMFiles(out, dir, files); - } - - /* (non-Javadoc) - * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) - */ - public void readFields(DataInput in) throws IOException { - resetForm(); - - int numDeleteTerms = in.readInt(); - for (int i = 0; i < numDeleteTerms; i++) { - String field = Text.readString(in); - String text = Text.readString(in); - deleteList.add(new Term(field, text)); - } - - RAMDirectoryUtil.readRAMFiles(in, dir); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.concurrent.ConcurrentLinkedQueue; + +import org.apache.hadoop.contrib.index.lucene.RAMDirectoryUtil; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +/** + * An intermediate form for one or more parsed Lucene documents and/or + * delete terms. It actually uses Lucene file format as the format for + * the intermediate form by using RAM dir files. + * + * Note: If process(*) is ever called, closeWriter() should be called. + * Otherwise, no need to call closeWriter(). + */ +public class IntermediateForm implements Writable { + + private IndexUpdateConfiguration iconf = null; + private final Collection deleteList; + private RAMDirectory dir; + private IndexWriter writer; + private int numDocs; + + /** + * Constructor + * @throws IOException + */ + public IntermediateForm() throws IOException { + deleteList = new ConcurrentLinkedQueue(); + dir = new RAMDirectory(); + writer = null; + numDocs = 0; + } + + /** + * Configure using an index update configuration. + * @param iconf the index update configuration + */ + public void configure(IndexUpdateConfiguration iconf) { + this.iconf = iconf; + } + + /** + * Get the ram directory of the intermediate form. + * @return the ram directory + */ + public Directory getDirectory() { + return dir; + } + + /** + * Get an iterator for the delete terms in the intermediate form. + * @return an iterator for the delete terms + */ + public Iterator deleteTermIterator() { + return deleteList.iterator(); + } + + /** + * This method is used by the index update mapper and process a document + * operation into the current intermediate form. 
+ * @param doc input document operation + * @param analyzer the analyzer + * @throws IOException + */ + public void process(DocumentAndOp doc, Analyzer analyzer) throws IOException { + if (doc.getOp() == DocumentAndOp.Op.DELETE + || doc.getOp() == DocumentAndOp.Op.UPDATE) { + deleteList.add(doc.getTerm()); + + } + + if (doc.getOp() == DocumentAndOp.Op.INSERT + || doc.getOp() == DocumentAndOp.Op.UPDATE) { + + if (writer == null) { + // analyzer is null because we specify an analyzer with addDocument + writer = createWriter(); + } + + writer.addDocument(doc.getDocument(), analyzer); + numDocs++; + } + + } + + /** + * This method is used by the index update combiner and process an + * intermediate form into the current intermediate form. More specifically, + * the input intermediate forms are a single-document ram index and/or a + * single delete term. + * @param form the input intermediate form + * @throws IOException + */ + public void process(IntermediateForm form) throws IOException { + if (form.deleteList.size() > 0) { + deleteList.addAll(form.deleteList); + } + + if (form.dir.sizeInBytes() > 0) { + if (writer == null) { + writer = createWriter(); + } + + writer.addIndexesNoOptimize(new Directory[] { form.dir }); + numDocs++; + } + + } + + /** + * Close the Lucene index writer associated with the intermediate form, + * if created. Do not close the ram directory. In fact, there is no need + * to close a ram directory. + * @throws IOException + */ + public void closeWriter() throws IOException { + if (writer != null) { + writer.close(); + writer = null; + } + } + + /** + * The total size of files in the directory and ram used by the index writer. + * It does not include memory used by the delete list. + * @return the total size in bytes + */ + public long totalSizeInBytes() throws IOException { + long size = dir.sizeInBytes(); + if (writer != null) { + size += writer.ramSizeInBytes(); + } + return size; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append(this.getClass().getSimpleName()); + buffer.append("[numDocs="); + buffer.append(numDocs); + buffer.append(", numDeletes="); + buffer.append(deleteList.size()); + if (deleteList.size() > 0) { + buffer.append("("); + Iterator iter = deleteTermIterator(); + while (iter.hasNext()) { + buffer.append(iter.next()); + buffer.append(" "); + } + buffer.append(")"); + } + buffer.append("]"); + return buffer.toString(); + } + + private IndexWriter createWriter() throws IOException { + IndexWriter writer = + new IndexWriter(dir, false, null, + new KeepOnlyLastCommitDeletionPolicy()); + writer.setUseCompoundFile(false); + + if (iconf != null) { + int maxFieldLength = iconf.getIndexMaxFieldLength(); + if (maxFieldLength > 0) { + writer.setMaxFieldLength(maxFieldLength); + } + } + + return writer; + } + + private void resetForm() throws IOException { + deleteList.clear(); + if (dir.sizeInBytes() > 0) { + // it's ok if we don't close a ram directory + dir.close(); + // an alternative is to delete all the files and reuse the ram directory + dir = new RAMDirectory(); + } + assert (writer == null); + numDocs = 0; + } + + // /////////////////////////////////// + // Writable + // /////////////////////////////////// + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) + */ + public void write(DataOutput out) throws IOException { + out.writeInt(deleteList.size()); + for (Term term : deleteList) { + Text.writeString(out, 
term.field()); + Text.writeString(out, term.text()); + } + + String[] files = dir.list(); + RAMDirectoryUtil.writeRAMFiles(out, dir, files); + } + + /* (non-Javadoc) + * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) + */ + public void readFields(DataInput in) throws IOException { + resetForm(); + + int numDeleteTerms = in.readInt(); + for (int i = 0; i < numDeleteTerms; i++) { + String field = Text.readString(in); + String text = Text.readString(in); + deleteList.add(new Term(field, text)); + } + + RAMDirectoryUtil.readRAMFiles(in, dir); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java index bad7ae792bf..8f8061d43d3 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java @@ -1,105 +1,105 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
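IntermediateForm.write() and readFields() above exist so the framework can ship a form between the map and reduce sides as a Writable. The round-trip mechanism itself is shown below with org.apache.hadoop.io.Text standing in for an IntermediateForm, since building a real form needs a Lucene setup; this is a generic sketch, not part of the patch:

    // How a Writable travels between tasks: serialized with write(DataOutput)
    // and rebuilt with readFields(DataInput).
    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;

    public class WritableRoundTrip {
      static byte[] serialize(Writable w) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        w.write(out);
        out.flush();
        return bytes.toByteArray();
      }

      static void deserialize(Writable w, byte[] data) throws IOException {
        w.readFields(new DataInputStream(new ByteArrayInputStream(data)));
      }

      public static void main(String[] args) throws IOException {
        Text original = new Text("shard_0");   // stand-in for an IntermediateForm
        Text copy = new Text();
        deserialize(copy, serialize(original));
        System.out.println(copy);              // shard_0
      }
    }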
- */ - -package org.apache.hadoop.contrib.index.lucene; - -import java.io.IOException; - -import junit.framework.TestCase; - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexDeletionPolicy; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Hits; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.RAMDirectory; - -public class TestMixedDirectory extends TestCase { - private int numDocsPerUpdate = 10; - private int maxBufferedDocs = 2; - - public void testMixedDirectoryAndPolicy() throws IOException { - Directory readDir = new RAMDirectory(); - updateIndex(readDir, 0, numDocsPerUpdate, - new KeepOnlyLastCommitDeletionPolicy()); - - verify(readDir, numDocsPerUpdate); - - IndexOutput out = - readDir.createOutput("_" + (numDocsPerUpdate / maxBufferedDocs + 2) - + ".cfs"); - out.writeInt(0); - out.close(); - - Directory writeDir = new RAMDirectory(); - Directory mixedDir = new MixedDirectory(readDir, writeDir); - updateIndex(mixedDir, numDocsPerUpdate, numDocsPerUpdate, - new MixedDeletionPolicy()); - - verify(readDir, numDocsPerUpdate); - verify(mixedDir, 2 * numDocsPerUpdate); - } - - public void updateIndex(Directory dir, int base, int numDocs, - IndexDeletionPolicy policy) throws IOException { - IndexWriter writer = - new IndexWriter(dir, false, new StandardAnalyzer(), policy); - writer.setMaxBufferedDocs(maxBufferedDocs); - writer.setMergeFactor(1000); - for (int i = 0; i < numDocs; i++) { - addDoc(writer, base + i); - } - writer.close(); - } - - private void addDoc(IndexWriter writer, int id) throws IOException { - Document doc = new Document(); - doc.add(new Field("id", String.valueOf(id), Field.Store.YES, - Field.Index.UN_TOKENIZED)); - doc.add(new Field("content", "apache", Field.Store.NO, - Field.Index.TOKENIZED)); - writer.addDocument(doc); - } - - private void verify(Directory dir, int expectedHits) throws IOException { - IndexSearcher searcher = new IndexSearcher(dir); - Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); - int numHits = hits.length(); - - assertEquals(expectedHits, numHits); - - int[] docs = new int[numHits]; - for (int i = 0; i < numHits; i++) { - Document hit = hits.doc(i); - docs[Integer.parseInt(hit.get("id"))]++; - } - for (int i = 0; i < numHits; i++) { - assertEquals(1, docs[i]); - } - - searcher.close(); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.contrib.index.lucene; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexDeletionPolicy; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; + +public class TestMixedDirectory extends TestCase { + private int numDocsPerUpdate = 10; + private int maxBufferedDocs = 2; + + public void testMixedDirectoryAndPolicy() throws IOException { + Directory readDir = new RAMDirectory(); + updateIndex(readDir, 0, numDocsPerUpdate, + new KeepOnlyLastCommitDeletionPolicy()); + + verify(readDir, numDocsPerUpdate); + + IndexOutput out = + readDir.createOutput("_" + (numDocsPerUpdate / maxBufferedDocs + 2) + + ".cfs"); + out.writeInt(0); + out.close(); + + Directory writeDir = new RAMDirectory(); + Directory mixedDir = new MixedDirectory(readDir, writeDir); + updateIndex(mixedDir, numDocsPerUpdate, numDocsPerUpdate, + new MixedDeletionPolicy()); + + verify(readDir, numDocsPerUpdate); + verify(mixedDir, 2 * numDocsPerUpdate); + } + + public void updateIndex(Directory dir, int base, int numDocs, + IndexDeletionPolicy policy) throws IOException { + IndexWriter writer = + new IndexWriter(dir, false, new StandardAnalyzer(), policy); + writer.setMaxBufferedDocs(maxBufferedDocs); + writer.setMergeFactor(1000); + for (int i = 0; i < numDocs; i++) { + addDoc(writer, base + i); + } + writer.close(); + } + + private void addDoc(IndexWriter writer, int id) throws IOException { + Document doc = new Document(); + doc.add(new Field("id", String.valueOf(id), Field.Store.YES, + Field.Index.UN_TOKENIZED)); + doc.add(new Field("content", "apache", Field.Store.NO, + Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + + private void verify(Directory dir, int expectedHits) throws IOException { + IndexSearcher searcher = new IndexSearcher(dir); + Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); + int numHits = hits.length(); + + assertEquals(expectedHits, numHits); + + int[] docs = new int[numHits]; + for (int i = 0; i < numHits; i++) { + Document hit = hits.doc(i); + docs[Integer.parseInt(hit.get("id"))]++; + } + for (int i = 0; i < numHits; i++) { + assertEquals(1, docs[i]); + } + + searcher.close(); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestDistributionPolicy.java b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestDistributionPolicy.java index 6afea9f3336..f611bddf93d 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestDistributionPolicy.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestDistributionPolicy.java @@ -1,234 +1,234 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.File; -import java.io.IOException; -import java.text.NumberFormat; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.contrib.index.example.HashingDistributionPolicy; -import org.apache.hadoop.contrib.index.example.RoundRobinDistributionPolicy; -import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.File; +import java.io.IOException; +import java.text.NumberFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.contrib.index.example.HashingDistributionPolicy; +import org.apache.hadoop.contrib.index.example.RoundRobinDistributionPolicy; +import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.MiniMRCluster; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Hits; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.Directory; - -import junit.framework.TestCase; - -public class TestDistributionPolicy extends TestCase { - - private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); - static { - NUMBER_FORMAT.setMinimumIntegerDigits(5); - NUMBER_FORMAT.setGroupingUsed(false); - } - - // however, "we only allow 0 or 1 reducer in local mode" - from - // LocalJobRunner - private Configuration conf; - private Path localInputPath = new Path(System.getProperty("build.test") + "/sample/data.txt"); - private Path localUpdatePath = - new Path(System.getProperty("build.test") + "/sample/data2.txt"); - private Path inputPath = new Path("/myexample/data.txt"); - private Path updatePath = new Path("/myexample/data2.txt"); - private Path outputPath = new Path("/myoutput"); - private Path indexPath = new Path("/myindex"); - private int numShards = 3; - private int numMapTasks = 5; - - private int numDataNodes = 3; - private int numTaskTrackers = 3; - - private int numDocsPerRun = 10; // num of docs in local input path - - private FileSystem fs; - private MiniDFSCluster dfsCluster; - private MiniMRCluster mrCluster; - - public TestDistributionPolicy() throws IOException { - super(); - if (System.getProperty("hadoop.log.dir") == null) { - String base = new File(".").getPath(); // getAbsolutePath(); - System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs"); - } - conf = new Configuration(); - } - - protected void setUp() throws Exception { - super.setUp(); - try { - dfsCluster = - new MiniDFSCluster(conf, numDataNodes, true, (String[]) null); - - fs = dfsCluster.getFileSystem(); - if (fs.exists(inputPath)) { - fs.delete(inputPath, true); - } - fs.copyFromLocalFile(localInputPath, inputPath); - if (fs.exists(updatePath)) { - fs.delete(updatePath, true); - } - fs.copyFromLocalFile(localUpdatePath, updatePath); - - if (fs.exists(outputPath)) { - // do not create, mapred will create - fs.delete(outputPath, true); - } - - if (fs.exists(indexPath)) { - fs.delete(indexPath, true); - } - - mrCluster = - new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1); - - } catch (IOException e) { - if (dfsCluster != null) { - dfsCluster.shutdown(); - dfsCluster = null; - } - - if (fs != null) { - fs.close(); - fs = null; - } - - if (mrCluster != null) { - mrCluster.shutdown(); - mrCluster = null; - } - - throw e; - } - - } - - protected void tearDown() throws Exception { - if (dfsCluster != null) { - dfsCluster.shutdown(); - dfsCluster = null; - } - - if (fs != null) { - fs.close(); - fs = null; - } - - if (mrCluster != null) { - mrCluster.shutdown(); - mrCluster = null; - } - - super.tearDown(); 
- } - - public void testDistributionPolicy() throws IOException { - IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf); - - // test hashing distribution policy - iconf.setDistributionPolicyClass(HashingDistributionPolicy.class); - onetest(); - - if (fs.exists(indexPath)) { - fs.delete(indexPath, true); - } - - // test round-robin distribution policy - iconf.setDistributionPolicyClass(RoundRobinDistributionPolicy.class); - onetest(); - } - - private void onetest() throws IOException { - long versionNumber = -1; - long generation = -1; - - Shard[] shards = new Shard[numShards]; - for (int j = 0; j < shards.length; j++) { - shards[j] = - new Shard(versionNumber, - new Path(indexPath, NUMBER_FORMAT.format(j)).toString(), - generation); - } - - if (fs.exists(outputPath)) { - fs.delete(outputPath, true); - } - - IIndexUpdater updater = new IndexUpdater(); - updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks, - shards); - - if (fs.exists(outputPath)) { - fs.delete(outputPath, true); - } - - // delete docs w/ even docids, update docs w/ odd docids - updater.run(conf, new Path[] { updatePath }, outputPath, numMapTasks, - shards); - - verify(shards); - } - - private void verify(Shard[] shards) throws IOException { - // verify the index - IndexReader[] readers = new IndexReader[shards.length]; - for (int i = 0; i < shards.length; i++) { - Directory dir = - new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), - false, conf); - readers[i] = IndexReader.open(dir); - } - - IndexReader reader = new MultiReader(readers); - IndexSearcher searcher = new IndexSearcher(reader); - Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); - assertEquals(0, hits.length()); - - hits = searcher.search(new TermQuery(new Term("content", "hadoop"))); - assertEquals(numDocsPerRun / 2, hits.length()); - - int[] counts = new int[numDocsPerRun]; - for (int i = 0; i < hits.length(); i++) { - Document doc = hits.doc(i); - counts[Integer.parseInt(doc.get("id"))]++; - } - - for (int i = 0; i < numDocsPerRun; i++) { - if (i % 2 == 0) { - assertEquals(0, counts[i]); - } else { - assertEquals(1, counts[i]); - } - } - - searcher.close(); - reader.close(); - } - -} +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.MiniMRCluster; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; + +import junit.framework.TestCase; + +public class TestDistributionPolicy extends TestCase { + + private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); + static { + NUMBER_FORMAT.setMinimumIntegerDigits(5); + NUMBER_FORMAT.setGroupingUsed(false); + } + + // however, "we only allow 0 or 1 reducer in local mode" - from + // LocalJobRunner + private Configuration conf; + private Path localInputPath = new Path(System.getProperty("build.test") + "/sample/data.txt"); + private Path localUpdatePath = + new Path(System.getProperty("build.test") + "/sample/data2.txt"); + private Path inputPath = new Path("/myexample/data.txt"); + private Path updatePath = new Path("/myexample/data2.txt"); + private Path outputPath = new Path("/myoutput"); + private Path indexPath = new Path("/myindex"); + private int numShards = 3; + private int 
numMapTasks = 5; + + private int numDataNodes = 3; + private int numTaskTrackers = 3; + + private int numDocsPerRun = 10; // num of docs in local input path + + private FileSystem fs; + private MiniDFSCluster dfsCluster; + private MiniMRCluster mrCluster; + + public TestDistributionPolicy() throws IOException { + super(); + if (System.getProperty("hadoop.log.dir") == null) { + String base = new File(".").getPath(); // getAbsolutePath(); + System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs"); + } + conf = new Configuration(); + } + + protected void setUp() throws Exception { + super.setUp(); + try { + dfsCluster = + new MiniDFSCluster(conf, numDataNodes, true, (String[]) null); + + fs = dfsCluster.getFileSystem(); + if (fs.exists(inputPath)) { + fs.delete(inputPath, true); + } + fs.copyFromLocalFile(localInputPath, inputPath); + if (fs.exists(updatePath)) { + fs.delete(updatePath, true); + } + fs.copyFromLocalFile(localUpdatePath, updatePath); + + if (fs.exists(outputPath)) { + // do not create, mapred will create + fs.delete(outputPath, true); + } + + if (fs.exists(indexPath)) { + fs.delete(indexPath, true); + } + + mrCluster = + new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1); + + } catch (IOException e) { + if (dfsCluster != null) { + dfsCluster.shutdown(); + dfsCluster = null; + } + + if (fs != null) { + fs.close(); + fs = null; + } + + if (mrCluster != null) { + mrCluster.shutdown(); + mrCluster = null; + } + + throw e; + } + + } + + protected void tearDown() throws Exception { + if (dfsCluster != null) { + dfsCluster.shutdown(); + dfsCluster = null; + } + + if (fs != null) { + fs.close(); + fs = null; + } + + if (mrCluster != null) { + mrCluster.shutdown(); + mrCluster = null; + } + + super.tearDown(); + } + + public void testDistributionPolicy() throws IOException { + IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf); + + // test hashing distribution policy + iconf.setDistributionPolicyClass(HashingDistributionPolicy.class); + onetest(); + + if (fs.exists(indexPath)) { + fs.delete(indexPath, true); + } + + // test round-robin distribution policy + iconf.setDistributionPolicyClass(RoundRobinDistributionPolicy.class); + onetest(); + } + + private void onetest() throws IOException { + long versionNumber = -1; + long generation = -1; + + Shard[] shards = new Shard[numShards]; + for (int j = 0; j < shards.length; j++) { + shards[j] = + new Shard(versionNumber, + new Path(indexPath, NUMBER_FORMAT.format(j)).toString(), + generation); + } + + if (fs.exists(outputPath)) { + fs.delete(outputPath, true); + } + + IIndexUpdater updater = new IndexUpdater(); + updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks, + shards); + + if (fs.exists(outputPath)) { + fs.delete(outputPath, true); + } + + // delete docs w/ even docids, update docs w/ odd docids + updater.run(conf, new Path[] { updatePath }, outputPath, numMapTasks, + shards); + + verify(shards); + } + + private void verify(Shard[] shards) throws IOException { + // verify the index + IndexReader[] readers = new IndexReader[shards.length]; + for (int i = 0; i < shards.length; i++) { + Directory dir = + new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), + false, conf); + readers[i] = IndexReader.open(dir); + } + + IndexReader reader = new MultiReader(readers); + IndexSearcher searcher = new IndexSearcher(reader); + Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); + assertEquals(0, hits.length()); + + hits = searcher.search(new 
TermQuery(new Term("content", "hadoop"))); + assertEquals(numDocsPerRun / 2, hits.length()); + + int[] counts = new int[numDocsPerRun]; + for (int i = 0; i < hits.length(); i++) { + Document doc = hits.doc(i); + counts[Integer.parseInt(doc.get("id"))]++; + } + + for (int i = 0; i < numDocsPerRun; i++) { + if (i % 2 == 0) { + assertEquals(0, counts[i]); + } else { + assertEquals(1, counts[i]); + } + } + + searcher.close(); + reader.close(); + } + +} diff --git a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestIndexUpdater.java b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestIndexUpdater.java index da9ecefaf2e..f21fa46a0ac 100755 --- a/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestIndexUpdater.java +++ b/hadoop-mapreduce-project/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestIndexUpdater.java @@ -1,258 +1,258 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.contrib.index.mapred; - -import java.io.File; -import java.io.IOException; -import java.text.NumberFormat; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.contrib.index.mapred; + +import java.io.File; +import java.io.IOException; +import java.text.NumberFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.hadoop.mapred.MiniMRCluster; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Hits; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.Directory; - -import junit.framework.TestCase; - -public class TestIndexUpdater extends TestCase { - - private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); - static { - NUMBER_FORMAT.setMinimumIntegerDigits(5); - NUMBER_FORMAT.setGroupingUsed(false); - } - - // however, "we only allow 0 or 1 reducer in local mode" - from - // LocalJobRunner - private Configuration conf; - private Path localInputPath = new Path(System.getProperty("build.test") + "/sample/data.txt"); - private Path inputPath = new Path("/myexample/data.txt"); - private Path outputPath = new Path("/myoutput"); - private Path indexPath = new Path("/myindex"); - private int initNumShards = 3; - private int numMapTasks = 5; - - private int numDataNodes = 3; - private int numTaskTrackers = 3; - - private int numRuns = 3; - private int numDocsPerRun = 10; // num of docs in local input path - - private FileSystem fs; - private MiniDFSCluster dfsCluster; - private MiniMRCluster mrCluster; - - public TestIndexUpdater() throws IOException { - super(); - if (System.getProperty("hadoop.log.dir") == null) { - String base = new File(".").getPath(); // getAbsolutePath(); - System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs"); - } - conf = new Configuration(); +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.mapred.MiniMRCluster; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; + +import junit.framework.TestCase; + +public class TestIndexUpdater extends TestCase { + + private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); + static { + NUMBER_FORMAT.setMinimumIntegerDigits(5); + NUMBER_FORMAT.setGroupingUsed(false); + } + + // however, "we only allow 0 or 1 reducer in local mode" - from + // LocalJobRunner + private Configuration conf; + private Path localInputPath = new Path(System.getProperty("build.test") + "/sample/data.txt"); + private Path inputPath = new Path("/myexample/data.txt"); + private Path outputPath = new Path("/myoutput"); + private Path indexPath = new Path("/myindex"); + private int 
initNumShards = 3; + private int numMapTasks = 5; + + private int numDataNodes = 3; + private int numTaskTrackers = 3; + + private int numRuns = 3; + private int numDocsPerRun = 10; // num of docs in local input path + + private FileSystem fs; + private MiniDFSCluster dfsCluster; + private MiniMRCluster mrCluster; + + public TestIndexUpdater() throws IOException { + super(); + if (System.getProperty("hadoop.log.dir") == null) { + String base = new File(".").getPath(); // getAbsolutePath(); + System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs"); + } + conf = new Configuration(); //See MAPREDUCE-947 for more details. Setting to false prevents the creation of _SUCCESS. conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); - } - - protected void setUp() throws Exception { - super.setUp(); - try { - dfsCluster = - new MiniDFSCluster(conf, numDataNodes, true, (String[]) null); - - fs = dfsCluster.getFileSystem(); - if (fs.exists(inputPath)) { - fs.delete(inputPath, true); - } - fs.copyFromLocalFile(localInputPath, inputPath); - - if (fs.exists(outputPath)) { - // do not create, mapred will create - fs.delete(outputPath, true); - } - - if (fs.exists(indexPath)) { - fs.delete(indexPath, true); - } - - mrCluster = - new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1); - - } catch (IOException e) { - if (dfsCluster != null) { - dfsCluster.shutdown(); - dfsCluster = null; - } - - if (fs != null) { - fs.close(); - fs = null; - } - - if (mrCluster != null) { - mrCluster.shutdown(); - mrCluster = null; - } - - throw e; - } - - } - - protected void tearDown() throws Exception { - if (dfsCluster != null) { - dfsCluster.shutdown(); - dfsCluster = null; - } - - if (fs != null) { - fs.close(); - fs = null; - } - - if (mrCluster != null) { - mrCluster.shutdown(); - mrCluster = null; - } - - super.tearDown(); - } - - public void testIndexUpdater() throws IOException { - IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf); - // max field length, compound file and number of segments will be checked - // later - iconf.setIndexMaxFieldLength(2); - iconf.setIndexUseCompoundFile(true); - iconf.setIndexMaxNumSegments(1); - iconf.setMaxRAMSizeInBytes(20480); - - long versionNumber = -1; - long generation = -1; - - for (int i = 0; i < numRuns; i++) { - if (fs.exists(outputPath)) { - fs.delete(outputPath, true); - } - - Shard[] shards = new Shard[initNumShards + i]; - for (int j = 0; j < shards.length; j++) { - shards[j] = - new Shard(versionNumber, new Path(indexPath, - NUMBER_FORMAT.format(j)).toString(), generation); - } - run(i + 1, shards); - } - } - - private void run(int numRuns, Shard[] shards) throws IOException { - IIndexUpdater updater = new IndexUpdater(); - updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks, - shards); - - // verify the done files - Path[] doneFileNames = new Path[shards.length]; - int count = 0; - FileStatus[] fileStatus = fs.listStatus(outputPath); - for (int i = 0; i < fileStatus.length; i++) { - FileStatus[] doneFiles = fs.listStatus(fileStatus[i].getPath()); - for (int j = 0; j < doneFiles.length; j++) { - doneFileNames[count++] = doneFiles[j].getPath(); - } - } - assertEquals(shards.length, count); - for (int i = 0; i < count; i++) { - assertTrue(doneFileNames[i].getName().startsWith( - IndexUpdateReducer.DONE.toString())); - } - - // verify the index - IndexReader[] readers = new IndexReader[shards.length]; - for (int i = 0; i < shards.length; i++) { - Directory dir = - new FileSystemDirectory(fs, 
new Path(shards[i].getDirectory()), - false, conf); - readers[i] = IndexReader.open(dir); - } - - IndexReader reader = new MultiReader(readers); - IndexSearcher searcher = new IndexSearcher(reader); - Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); - - assertEquals(numRuns * numDocsPerRun, hits.length()); - - int[] counts = new int[numDocsPerRun]; - for (int i = 0; i < hits.length(); i++) { - Document doc = hits.doc(i); - counts[Integer.parseInt(doc.get("id"))]++; - } - - for (int i = 0; i < numDocsPerRun; i++) { - assertEquals(numRuns, counts[i]); - } - - // max field length is 2, so "dot" is also indexed but not "org" - hits = searcher.search(new TermQuery(new Term("content", "dot"))); - assertEquals(numRuns, hits.length()); - - hits = searcher.search(new TermQuery(new Term("content", "org"))); - assertEquals(0, hits.length()); - - searcher.close(); - reader.close(); - - // open and close an index writer with KeepOnlyLastCommitDeletionPolicy - // to remove earlier checkpoints - for (int i = 0; i < shards.length; i++) { - Directory dir = - new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), - false, conf); - IndexWriter writer = - new IndexWriter(dir, false, null, - new KeepOnlyLastCommitDeletionPolicy()); - writer.close(); - } - - // verify the number of segments, must be done after an writer with - // KeepOnlyLastCommitDeletionPolicy so that earlier checkpoints are removed - for (int i = 0; i < shards.length; i++) { - PathFilter cfsFilter = new PathFilter() { - public boolean accept(Path path) { - return path.getName().endsWith(".cfs"); - } - }; - FileStatus[] cfsFiles = - fs.listStatus(new Path(shards[i].getDirectory()), cfsFilter); - assertEquals(1, cfsFiles.length); - } - } - -} + } + + protected void setUp() throws Exception { + super.setUp(); + try { + dfsCluster = + new MiniDFSCluster(conf, numDataNodes, true, (String[]) null); + + fs = dfsCluster.getFileSystem(); + if (fs.exists(inputPath)) { + fs.delete(inputPath, true); + } + fs.copyFromLocalFile(localInputPath, inputPath); + + if (fs.exists(outputPath)) { + // do not create, mapred will create + fs.delete(outputPath, true); + } + + if (fs.exists(indexPath)) { + fs.delete(indexPath, true); + } + + mrCluster = + new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1); + + } catch (IOException e) { + if (dfsCluster != null) { + dfsCluster.shutdown(); + dfsCluster = null; + } + + if (fs != null) { + fs.close(); + fs = null; + } + + if (mrCluster != null) { + mrCluster.shutdown(); + mrCluster = null; + } + + throw e; + } + + } + + protected void tearDown() throws Exception { + if (dfsCluster != null) { + dfsCluster.shutdown(); + dfsCluster = null; + } + + if (fs != null) { + fs.close(); + fs = null; + } + + if (mrCluster != null) { + mrCluster.shutdown(); + mrCluster = null; + } + + super.tearDown(); + } + + public void testIndexUpdater() throws IOException { + IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf); + // max field length, compound file and number of segments will be checked + // later + iconf.setIndexMaxFieldLength(2); + iconf.setIndexUseCompoundFile(true); + iconf.setIndexMaxNumSegments(1); + iconf.setMaxRAMSizeInBytes(20480); + + long versionNumber = -1; + long generation = -1; + + for (int i = 0; i < numRuns; i++) { + if (fs.exists(outputPath)) { + fs.delete(outputPath, true); + } + + Shard[] shards = new Shard[initNumShards + i]; + for (int j = 0; j < shards.length; j++) { + shards[j] = + new Shard(versionNumber, new Path(indexPath, + 
NUMBER_FORMAT.format(j)).toString(), generation); + } + run(i + 1, shards); + } + } + + private void run(int numRuns, Shard[] shards) throws IOException { + IIndexUpdater updater = new IndexUpdater(); + updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks, + shards); + + // verify the done files + Path[] doneFileNames = new Path[shards.length]; + int count = 0; + FileStatus[] fileStatus = fs.listStatus(outputPath); + for (int i = 0; i < fileStatus.length; i++) { + FileStatus[] doneFiles = fs.listStatus(fileStatus[i].getPath()); + for (int j = 0; j < doneFiles.length; j++) { + doneFileNames[count++] = doneFiles[j].getPath(); + } + } + assertEquals(shards.length, count); + for (int i = 0; i < count; i++) { + assertTrue(doneFileNames[i].getName().startsWith( + IndexUpdateReducer.DONE.toString())); + } + + // verify the index + IndexReader[] readers = new IndexReader[shards.length]; + for (int i = 0; i < shards.length; i++) { + Directory dir = + new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), + false, conf); + readers[i] = IndexReader.open(dir); + } + + IndexReader reader = new MultiReader(readers); + IndexSearcher searcher = new IndexSearcher(reader); + Hits hits = searcher.search(new TermQuery(new Term("content", "apache"))); + + assertEquals(numRuns * numDocsPerRun, hits.length()); + + int[] counts = new int[numDocsPerRun]; + for (int i = 0; i < hits.length(); i++) { + Document doc = hits.doc(i); + counts[Integer.parseInt(doc.get("id"))]++; + } + + for (int i = 0; i < numDocsPerRun; i++) { + assertEquals(numRuns, counts[i]); + } + + // max field length is 2, so "dot" is also indexed but not "org" + hits = searcher.search(new TermQuery(new Term("content", "dot"))); + assertEquals(numRuns, hits.length()); + + hits = searcher.search(new TermQuery(new Term("content", "org"))); + assertEquals(0, hits.length()); + + searcher.close(); + reader.close(); + + // open and close an index writer with KeepOnlyLastCommitDeletionPolicy + // to remove earlier checkpoints + for (int i = 0; i < shards.length; i++) { + Directory dir = + new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), + false, conf); + IndexWriter writer = + new IndexWriter(dir, false, null, + new KeepOnlyLastCommitDeletionPolicy()); + writer.close(); + } + + // verify the number of segments, must be done after an writer with + // KeepOnlyLastCommitDeletionPolicy so that earlier checkpoints are removed + for (int i = 0; i < shards.length; i++) { + PathFilter cfsFilter = new PathFilter() { + public boolean accept(Path path) { + return path.getName().endsWith(".cfs"); + } + }; + FileStatus[] cfsFiles = + fs.listStatus(new Path(shards[i].getDirectory()), cfsFilter); + assertEquals(1, cfsFiles.length); + } + } + +} diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 27ac35441cf..75808a71161 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -44,6 +44,9 @@ Release 2.0.3-alpha - Unreleased YARN-146. Add unit tests for computing fair share in the fair scheduler. (Sandy Ryza via tomwhite) + HADOOP-8911. CRLF characters in source and text files. 
+ (Raja Aluri via suresh) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm index c0389377465..44fcfc74e67 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm @@ -1,1126 +1,1126 @@ -~~ Licensed under the Apache License, Version 2.0 (the "License"); -~~ you may not use this file except in compliance with the License. -~~ You may obtain a copy of the License at -~~ -~~ http://www.apache.org/licenses/LICENSE-2.0 -~~ -~~ Unless required by applicable law or agreed to in writing, software -~~ distributed under the License is distributed on an "AS IS" BASIS, -~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -~~ See the License for the specific language governing permissions and -~~ limitations under the License. See accompanying LICENSE file. - - --- - Hadoop Map Reduce Next Generation-${project.version} - Cluster Setup - --- - --- - ${maven.build.timestamp} - -Hadoop MapReduce Next Generation - Cluster Setup - - \[ {{{./index.html}Go Back}} \] - -%{toc|section=1|fromDepth=0} - -* {Purpose} - - This document describes how to install, configure and manage non-trivial - Hadoop clusters ranging from a few nodes to extremely large clusters - with thousands of nodes. - - To play with Hadoop, you may first want to install it on a single - machine (see {{{SingleCluster}Single Node Setup}}). - -* {Prerequisites} - - Download a stable version of Hadoop from Apache mirrors. - -* {Installation} - - Installing a Hadoop cluster typically involves unpacking the software on all - the machines in the cluster or installing RPMs. - - Typically one machine in the cluster is designated as the NameNode and - another machine the as ResourceManager, exclusively. These are the masters. - - The rest of the machines in the cluster act as both DataNode and NodeManager. - These are the slaves. - -* {Running Hadoop in Non-Secure Mode} - - The following sections describe how to configure a Hadoop cluster. - - * {Configuration Files} - - Hadoop configuration is driven by two types of important configuration files: - - * Read-only default configuration - <<>>, - <<>>, <<>> and - <<>>. - - * Site-specific configuration - <>, - <>, <> and - <>. - - - Additionally, you can control the Hadoop scripts found in the bin/ - directory of the distribution, by setting site-specific values via the - <> and <>. - - * {Site Configuration} - - To configure the Hadoop cluster you will need to configure the - <<>> in which the Hadoop daemons execute as well as the - <<>> for the Hadoop daemons. - - The Hadoop daemons are NameNode/DataNode and ResourceManager/NodeManager. - - - * {Configuring Environment of Hadoop Daemons} - - Administrators should use the <> and - <> script to do site-specific customization of the - Hadoop daemons' process environment. - - At the very least you should specify the <<>> so that it is - correctly defined on each remote node. - - In most cases you should also specify <<>> and - <<>> to point to directories that can only be - written to by the users that are going to run the hadoop daemons. - Otherwise there is the potential for a symlink attack. 
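  As a concrete illustration of such site-specific customization, a minimal
  hadoop-env.sh fragment might look like the sketch below. Only
  HADOOP_NAMENODE_OPTS and HADOOP_LOG_DIR appear elsewhere on this page;
  JAVA_HOME, HADOOP_PID_DIR and HADOOP_HEAPSIZE are assumed variable names
  used here for illustration, and the paths are placeholders.

----
# Illustrative hadoop-env.sh fragment. Variable names other than
# HADOOP_NAMENODE_OPTS and HADOOP_LOG_DIR are assumptions, and the
# paths are placeholders to be replaced with site-specific values.
export JAVA_HOME=/usr/java/default
export HADOOP_LOG_DIR=/var/log/hadoop/hdfs
export HADOOP_PID_DIR=/var/run/hadoop/hdfs
export HADOOP_HEAPSIZE=1000
export HADOOP_NAMENODE_OPTS="-XX:+UseParallelGC ${HADOOP_NAMENODE_OPTS}"
----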
- - Administrators can configure individual daemons using the configuration - options shown below in the table: - -*--------------------------------------+--------------------------------------+ -|| Daemon || Environment Variable | -*--------------------------------------+--------------------------------------+ -| NameNode | HADOOP_NAMENODE_OPTS | -*--------------------------------------+--------------------------------------+ -| DataNode | HADOOP_DATANODE_OPTS | -*--------------------------------------+--------------------------------------+ -| Secondary NameNode | HADOOP_SECONDARYNAMENODE_OPTS | -*--------------------------------------+--------------------------------------+ -| ResourceManager | YARN_RESOURCEMANAGER_OPTS | -*--------------------------------------+--------------------------------------+ -| NodeManager | YARN_NODEMANAGER_OPTS | -*--------------------------------------+--------------------------------------+ -| WebAppProxy | YARN_PROXYSERVER_OPTS | -*--------------------------------------+--------------------------------------+ -| Map Reduce Job History Server | HADOOP_JOB_HISTORYSERVER_OPTS | -*--------------------------------------+--------------------------------------+ - - - For example, To configure Namenode to use parallelGC, the following - statement should be added in hadoop-env.sh : - ----- - export HADOOP_NAMENODE_OPTS="-XX:+UseParallelGC ${HADOOP_NAMENODE_OPTS}" ----- - - Other useful configuration parameters that you can customize include: - - * <<>> / <<>> - The directory where the - daemons' log files are stored. They are automatically created if they - don't exist. - - * <<>> / <<>> - The maximum amount of - heapsize to use, in MB e.g. if the varibale is set to 1000 the heap - will be set to 1000MB. This is used to configure the heap - size for the daemon. By default, the value is 1000. If you want to - configure the values separately for each deamon you can use. -*--------------------------------------+--------------------------------------+ -|| Daemon || Environment Variable | -*--------------------------------------+--------------------------------------+ -| ResourceManager | YARN_RESOURCEMANAGER_HEAPSIZE | -*--------------------------------------+--------------------------------------+ -| NodeManager | YARN_NODEMANAGER_HEAPSIZE | -*--------------------------------------+--------------------------------------+ -| WebAppProxy | YARN_PROXYSERVER_HEAPSIZE | -*--------------------------------------+--------------------------------------+ -| Map Reduce Job History Server | HADOOP_JOB_HISTORYSERVER_HEAPSIZE | -*--------------------------------------+--------------------------------------+ - - * {Configuring the Hadoop Daemons in Non-Secure Mode} - - This section deals with important parameters to be specified in - the given configuration files: - - * <<>> - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | NameNode URI | | -*-------------------------+-------------------------+------------------------+ -| <<>> | 131072 | | -| | | Size of read/write buffer used in SequenceFiles. 
| -*-------------------------+-------------------------+------------------------+ - - * <<>> - - * Configurations for NameNode: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Path on the local filesystem where the NameNode stores the namespace | | -| | and transactions logs persistently. | | -| | | If this is a comma-delimited list of directories then the name table is | -| | | replicated in all of the directories, for redundancy. | -*-------------------------+-------------------------+------------------------+ -| <<>> / <<>> | | | -| | List of permitted/excluded DataNodes. | | -| | | If necessary, use these files to control the list of allowable | -| | | datanodes. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 268435456 | | -| | | HDFS blocksize of 256MB for large file-systems. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 100 | | -| | | More NameNode server threads to handle RPCs from large number of | -| | | DataNodes. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for DataNode: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Comma separated list of paths on the local filesystem of a | | -| | <<>> where it should store its blocks. | | -| | | If this is a comma-delimited list of directories, then data will be | -| | | stored in all named directories, typically on different devices. | -*-------------------------+-------------------------+------------------------+ - - * <<>> - - * Configurations for ResourceManager and NodeManager: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> / <<>> | | -| | | Enable ACLs? Defaults to . | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Admin ACL | | -| | | ACL to set admins on the cluster. | -| | | ACLs are of for . | -| | | Defaults to special value of <<*>> which means . | -| | | Special value of just means no one has access. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | Configuration to enable or disable log aggregation | -*-------------------------+-------------------------+------------------------+ - - - * Configurations for ResourceManager: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> host:port for clients to submit jobs. | | -| | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> host:port for ApplicationMasters to talk to | | -| | Scheduler to obtain resources. | | -| | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> host:port for NodeManagers. | | -| | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> host:port for administrative commands. 
| | -| | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> web-ui host:port. | | -| | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> Scheduler class. | | -| | | <<>> (recommended) or <<>> | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Minimum limit of memory to allocate to each container request at the <<>>. | | -| | | In MBs | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Maximum limit of memory to allocate to each container request at the <<>>. | | -| | | In MBs | -*-------------------------+-------------------------+------------------------+ -| <<>> / | | | -| <<>> | | | -| | List of permitted/excluded NodeManagers. | | -| | | If necessary, use these files to control the list of allowable | -| | | NodeManagers. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for NodeManager: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Resource i.e. available physical memory, in MB, for given <<>> | | -| | | Defines total available resources on the <<>> to be made | -| | | available to running containers | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Maximum ratio by which virtual memory usage of tasks may exceed | -| | physical memory | | -| | | The virtual memory usage of each task may exceed its physical memory | -| | | limit by this ratio. The total amount of virtual memory used by tasks | -| | | on the NodeManager may exceed its physical memory usage by this ratio. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Comma-separated list of paths on the local filesystem where | | -| | intermediate data is written. || -| | | Multiple paths help spread disk i/o. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Comma-separated list of paths on the local filesystem where logs | | -| | are written. | | -| | | Multiple paths help spread disk i/o. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <10800> | | -| | | Default time (in seconds) to retain log files on the NodeManager | -| | | Only applicable if log-aggregation is disabled. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | HDFS directory where the application logs are moved on application | -| | | completion. Need to set appropriate permissions. | -| | | Only applicable if log-aggregation is enabled. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | Suffix appended to the remote log dir. Logs will be aggregated to | -| | | $\{yarn.nodemanager.remote-app-log-dir\}/$\{user\}/$\{thisParam\} | -| | | Only applicable if log-aggregation is enabled. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | mapreduce.shuffle | | -| | | Shuffle service that needs to be set for Map Reduce applications. 
| -*-------------------------+-------------------------+------------------------+ - - * Configurations for History Server (Needs to be moved elsewhere): - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <-1> | | -| | | How long to keep aggregation logs before deleting them. -1 disables. | -| | | Be careful, set this too small and you will spam the name node. | -*-------------------------+-------------------------+------------------------+ - - - - * <<>> - - * Configurations for MapReduce Applications: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | yarn | | -| | | Execution framework set to Hadoop YARN. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 1536 | | -| | | Larger resource limit for maps. | -*-------------------------+-------------------------+------------------------+ -| <<>> | -Xmx1024M | | -| | | Larger heap-size for child jvms of maps. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 3072 | | -| | | Larger resource limit for reduces. | -*-------------------------+-------------------------+------------------------+ -| <<>> | -Xmx2560M | | -| | | Larger heap-size for child jvms of reduces. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 512 | | -| | | Higher memory-limit while sorting data for efficiency. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 100 | | -| | | More streams merged at once while sorting files. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 50 | | -| | | Higher number of parallel copies run by reduces to fetch outputs | -| | | from very large number of maps. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for MapReduce JobHistory Server: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | MapReduce JobHistory Server | Default port is 10020. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | MapReduce JobHistory Server Web UI | Default port is 19888. | -*-------------------------+-------------------------+------------------------+ -| <<>> | /mr-history/tmp | | -| | | Directory where history files are written by MapReduce jobs. | -*-------------------------+-------------------------+------------------------+ -| <<>> | /mr-history/done| | -| | | Directory where history files are managed by the MR JobHistory Server. | -*-------------------------+-------------------------+------------------------+ - - * Hadoop Rack Awareness - - The HDFS and the YARN components are rack-aware. - - The NameNode and the ResourceManager obtains the rack information of the - slaves in the cluster by invoking an API in an administrator - configured module. - - The API resolves the DNS name (also IP address) to a rack id. - - The site-specific module to use can be configured using the configuration - item <<>>. 
The default implementation - of the same runs a script/command configured using - <<>>. If <<>> is - not set, the rack id is returned for any passed IP address. - - * Monitoring Health of NodeManagers - - Hadoop provides a mechanism by which administrators can configure the - NodeManager to run an administrator supplied script periodically to - determine if a node is healthy or not. - - Administrators can determine if the node is in a healthy state by - performing any checks of their choice in the script. If the script - detects the node to be in an unhealthy state, it must print a line to - standard output beginning with the string ERROR. The NodeManager spawns - the script periodically and checks its output. If the script's output - contains the string ERROR, as described above, the node's status is - reported as <<>> and the node is black-listed by the - ResourceManager. No further tasks will be assigned to this node. - However, the NodeManager continues to run the script, so that if the - node becomes healthy again, it will be removed from the blacklisted nodes - on the ResourceManager automatically. The node's health along with the - output of the script, if it is unhealthy, is available to the - administrator in the ResourceManager web interface. The time since the - node was healthy is also displayed on the web interface. - - The following parameters can be used to control the node health - monitoring script in <<>>. - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Node health script | | -| | | Script to check for node's health status. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Node health script options | | -| | | Options for script to check for node's health status. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Node health script interval | | -| | | Time interval for running health script. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | Node health script timeout interval | | -| | | Timeout for health script execution. | -*-------------------------+-------------------------+------------------------+ - - The health checker script is not supposed to give ERROR if only some of the - local disks become bad. NodeManager has the ability to periodically check - the health of the local disks (specifically checks nodemanager-local-dirs - and nodemanager-log-dirs) and after reaching the threshold of number of - bad directories based on the value set for the config property - yarn.nodemanager.disk-health-checker.min-healthy-disks, the whole node is - marked unhealthy and this info is sent to resource manager also. The boot - disk is either raided or a failure in the boot disk is identified by the - health checker script. - - * {Slaves file} - - Typically you choose one machine in the cluster to act as the NameNode and - one machine as to act as the ResourceManager, exclusively. The rest of the - machines act as both a DataNode and NodeManager and are referred to as - . - - List all slave hostnames or IP addresses in your <<>> file, - one per line. - - * {Logging} - - Hadoop uses the Apache log4j via the Apache Commons Logging framework for - logging. Edit the <<>> file to customize the - Hadoop daemons' logging configuration (log-formats and so on). 
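  As a concrete illustration of the health script contract described under
  Monitoring Health of NodeManagers above, the following is a minimal sketch,
  not a recommended check: it flags the node as unhealthy when a local
  filesystem crosses an arbitrary usage threshold. The only requirement taken
  from this page is that an unhealthy node is reported by printing a line to
  standard output beginning with the string ERROR; the checked path and the
  threshold are placeholders.

----
#!/bin/bash
# Minimal node health script sketch. The NodeManager treats any line on
# standard output that begins with "ERROR" as a sign the node is unhealthy.
# The checked path and the 90% threshold are illustrative placeholders.
usage=$(df -P /tmp | awk 'NR==2 { gsub("%", "", $5); print $5 }')
if [ "$usage" -gt 90 ]; then
  echo "ERROR local /tmp is ${usage}% full"
else
  echo "OK local /tmp is ${usage}% full"
fi
----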
- - * {Operating the Hadoop Cluster} - - Once all the necessary configuration is complete, distribute the files to the - <<>> directory on all the machines. - - * Hadoop Startup - - To start a Hadoop cluster you will need to start both the HDFS and YARN - cluster. - - Format a new distributed filesystem: - ----- - $ $HADOOP_PREFIX/bin/hdfs namenode -format ----- - - Start the HDFS with the following command, run on the designated NameNode: - ----- - $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode ----- - - Run a script to start DataNodes on all slaves: - ----- - $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode ----- - - Start the YARN with the following command, run on the designated - ResourceManager: - ----- - $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager ----- - - Run a script to start NodeManagers on all slaves: - ----- - $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager ----- - - Start a standalone WebAppProxy server. If multiple servers - are used with load balancing it should be run on each of them: - ----- - $ $HADOOP_YARN_HOME/bin/yarn start proxyserver --config $HADOOP_CONF_DIR ----- - - Start the MapReduce JobHistory Server with the following command, run on the - designated server: - ----- - $ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR ----- - - * Hadoop Shutdown - - Stop the NameNode with the following command, run on the designated - NameNode: - ----- - $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode ----- - - Run a script to stop DataNodes on all slaves: - ----- - $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode ----- - - Stop the ResourceManager with the following command, run on the designated - ResourceManager: - ----- - $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager ----- - - Run a script to stop NodeManagers on all slaves: - ----- - $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager ----- - - Stop the WebAppProxy server. If multiple servers are used with load - balancing it should be run on each of them: - ----- - $ $HADOOP_YARN_HOME/bin/yarn stop proxyserver --config $HADOOP_CONF_DIR ----- - - - Stop the MapReduce JobHistory Server with the following command, run on the - designated server: - ----- - $ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR ----- - - -* {Running Hadoop in Secure Mode} - - This section deals with important parameters to be specified in - to run Hadoop in <> with strong, Kerberos-based - authentication. - - * <<>> - - Ensure that HDFS and YARN daemons run as different Unix users, for e.g. - <<>> and <<>>. Also, ensure that the MapReduce JobHistory - server runs as user <<>>. - - It's recommended to have them share a Unix group, for e.g. <<>>. 
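  A minimal sketch of creating these accounts on a Linux node, assuming the
  hdfs, yarn and mapred users and the hadoop group listed in the table that
  follows; the commands are illustrative only, and account creation would
  normally be handled by your provisioning system and local UID/GID policy.

----
# Illustrative only: create the shared group and the per-daemon users
# shown in the User:Group table below. Adjust to local account policy.
groupadd hadoop
useradd -g hadoop hdfs
useradd -g hadoop yarn
useradd -g hadoop mapred
----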
- -*--------------------------------------+----------------------------------------------------------------------+ -|| User:Group || Daemons | -*--------------------------------------+----------------------------------------------------------------------+ -| hdfs:hadoop | NameNode, Secondary NameNode, Checkpoint Node, Backup Node, DataNode | -*--------------------------------------+----------------------------------------------------------------------+ -| yarn:hadoop | ResourceManager, NodeManager | -*--------------------------------------+----------------------------------------------------------------------+ -| mapred:hadoop | MapReduce JobHistory Server | -*--------------------------------------+----------------------------------------------------------------------+ - - * <<>> - - The following table lists various paths on HDFS and local filesystems (on - all nodes) and recommended permissions: - -*-------------------+-------------------+------------------+------------------+ -|| Filesystem || Path || User:Group || Permissions | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | hdfs:hadoop | drwx------ | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | hdfs:hadoop | drwx------ | -*-------------------+-------------------+------------------+------------------+ -| local | $HADOOP_LOG_DIR | hdfs:hadoop | drwxrwxr-x | -*-------------------+-------------------+------------------+------------------+ -| local | $YARN_LOG_DIR | yarn:hadoop | drwxrwxr-x | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | yarn:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | yarn:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ -| local | container-executor | root:hadoop | --Sr-s--- | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | root:hadoop | r-------- | -*-------------------+-------------------+------------------+------------------+ -| hdfs | / | hdfs:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ -| hdfs | /tmp | hdfs:hadoop | drwxrwxrwxt | -*-------------------+-------------------+------------------+------------------+ -| hdfs | /user | hdfs:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ -| hdfs | <<>> | yarn:hadoop | drwxrwxrwxt | -*-------------------+-------------------+------------------+------------------+ -| hdfs | <<>> | mapred:hadoop | | -| | | | drwxrwxrwxt | -*-------------------+-------------------+------------------+------------------+ -| hdfs | <<>> | mapred:hadoop | | -| | | | drwxr-x--- | -*-------------------+-------------------+------------------+------------------+ - - * Kerberos Keytab files - - * HDFS - - The NameNode keytab file, on the NameNode host, should look like the - following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/nn.service.keytab -Keytab name: FILE:/etc/security/keytab/nn.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 
21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - The Secondary NameNode keytab file, on that host, should look like the - following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/sn.service.keytab -Keytab name: FILE:/etc/security/keytab/sn.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - The DataNode keytab file, on each host, should look like the following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/dn.service.keytab -Keytab name: FILE:/etc/security/keytab/dn.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - * YARN - - The ResourceManager keytab file, on the ResourceManager host, should look - like the following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/rm.service.keytab -Keytab name: FILE:/etc/security/keytab/rm.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - The NodeManager keytab file, on each host, should look like the following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/nm.service.keytab -Keytab name: FILE:/etc/security/keytab/nm.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 
host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - * MapReduce JobHistory Server - - The MapReduce JobHistory Server keytab file, on that host, should look - like the following: - ----- - -$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/jhs.service.keytab -Keytab name: FILE:/etc/security/keytab/jhs.service.keytab -KVNO Timestamp Principal - 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) - 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) - ----- - - * Configuration in Secure Mode - - * <<>> - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | is non-secure. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Enable RPC service-level authorization. | -*-------------------------+-------------------------+------------------------+ - - * <<>> - - * Configurations for NameNode: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Enable HDFS block access tokens for secure operations. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | <50470> | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Kerberos keytab file for the NameNode. | -*-------------------------+-------------------------+------------------------+ -| <<>> | nn/_HOST@REALM.TLD | | -| | | Kerberos principal name for the NameNode. | -*-------------------------+-------------------------+------------------------+ -| <<>> | host/_HOST@REALM.TLD | | -| | | HTTPS Kerberos principal name for the NameNode. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for Secondary NameNode: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -*-------------------------+-------------------------+------------------------+ -| <<>> | <50470> | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | Kerberos keytab file for the NameNode. | -*-------------------------+-------------------------+------------------------+ -| <<>> | sn/_HOST@REALM.TLD | | -| | | Kerberos principal name for the Secondary NameNode. 
| -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | host/_HOST@REALM.TLD | | -| | | HTTPS Kerberos principal name for the Secondary NameNode. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for DataNode: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | 700 | | -*-------------------------+-------------------------+------------------------+ -| <<>> | <0.0.0.0:2003> | | -*-------------------------+-------------------------+------------------------+ -| <<>> | <0.0.0.0:2005> | | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Kerberos keytab file for the DataNode. | -*-------------------------+-------------------------+------------------------+ -| <<>> | dn/_HOST@REALM.TLD | | -| | | Kerberos principal name for the DataNode. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | host/_HOST@REALM.TLD | | -| | | HTTPS Kerberos principal name for the DataNode. | -*-------------------------+-------------------------+------------------------+ - - * <<>> - - * WebAppProxy - - The <<>> provides a proxy between the web applications - exported by an application and an end user. If security is enabled - it will warn users before accessing a potentially unsafe web application. - Authentication and authorization using the proxy is handled just like - any other privileged web application. - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> host:port for proxy to AM web apps. | | -| | | if this is the same as <<>>| -| | | or it is not defined then the <<>> will run the proxy| -| | | otherwise a standalone proxy server will need to be launched.| -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | Kerberos keytab file for the WebAppProxy. | -*-------------------------+-------------------------+------------------------+ -| <<>> | wap/_HOST@REALM.TLD | | -| | | Kerberos principal name for the WebAppProxy. | -*-------------------------+-------------------------+------------------------+ - - * LinuxContainerExecutor - - A <<>> used by YARN framework which define how any - launched and controlled. - - The following are the available in Hadoop YARN: - -*--------------------------------------+--------------------------------------+ -|| ContainerExecutor || Description | -*--------------------------------------+--------------------------------------+ -| <<>> | | -| | The default executor which YARN uses to manage container execution. | -| | The container process has the same Unix user as the NodeManager. | -*--------------------------------------+--------------------------------------+ -| <<>> | | -| | Supported only on GNU/Linux, this executor runs the containers as the | -| | user who submitted the application. It requires all user accounts to be | -| | created on the cluster nodes where the containers are launched. It uses | -| | a executable that is included in the Hadoop distribution. | -| | The NodeManager uses this executable to launch and kill containers. 
| -| | The setuid executable switches to the user who has submitted the | -| | application and launches or kills the containers. For maximum security, | -| | this executor sets up restricted permissions and user/group ownership of | -| | local files and directories used by the containers such as the shared | -| | objects, jars, intermediate files, log files etc. Particularly note that, | -| | because of this, except the application owner and NodeManager, no other | -| | user can access any of the local files/directories including those | -| | localized as part of the distributed cache. | -*--------------------------------------+--------------------------------------+ - - To build the LinuxContainerExecutor executable run: - ----- - $ mvn package -Dcontainer-executor.conf.dir=/etc/hadoop/ ----- - - The path passed in <<<-Dcontainer-executor.conf.dir>>> should be the - path on the cluster nodes where a configuration file for the setuid - executable should be located. The executable should be installed in - $HADOOP_YARN_HOME/bin. - - The executable must have specific permissions: 6050 or --Sr-s--- - permissions user-owned by (super-user) and group-owned by a - special group (e.g. <<>>) of which the NodeManager Unix user is - the group member and no ordinary application user is. If any application - user belongs to this special group, security will be compromised. This - special group name should be specified for the configuration property - <<>> in both - <<>> and <<>>. - - For example, let's say that the NodeManager is run as user who is - part of the groups users and , any of them being the primary group. - Let also be that has both and another user - (application submitter) as its members, and does not - belong to . Going by the above description, the setuid/setgid - executable should be set 6050 or --Sr-s--- with user-owner as and - group-owner as which has as its member (and not - which has also as its member besides ). - - The LinuxTaskController requires that paths including and leading up to - the directories specified in <<>> and - <<>> to be set 755 permissions as described - above in the table on permissions on directories. - - * <<>> - - The executable requires a configuration file called - <<>> to be present in the configuration - directory passed to the mvn target mentioned above. - - The configuration file must be owned by the user running NodeManager - (user <<>> in the above example), group-owned by anyone and - should have the permissions 0400 or r--------. - - The executable requires following configuration items to be present - in the <<>> file. The items should be - mentioned as simple key=value pairs, one per-line: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Unix group of the NodeManager. The group owner of the | -| | | binary should be this group. Should be same as the | -| | | value with which the NodeManager is configured. This configuration is | -| | | required for validating the secure access of the | -| | | binary. | -*-------------------------+-------------------------+------------------------+ -| <<>> | hfds,yarn,mapred,bin | Banned users. | -*-------------------------+-------------------------+------------------------+ -| <<>> | 1000 | Prevent other super-users. 
| -*-------------------------+-------------------------+------------------------+ - - To re-cap, here are the local file-ssytem permissions required for the - various paths related to the <<>>: - -*-------------------+-------------------+------------------+------------------+ -|| Filesystem || Path || User:Group || Permissions | -*-------------------+-------------------+------------------+------------------+ -| local | container-executor | root:hadoop | --Sr-s--- | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | root:hadoop | r-------- | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | yarn:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ -| local | <<>> | yarn:hadoop | drwxr-xr-x | -*-------------------+-------------------+------------------+------------------+ - - * Configurations for ResourceManager: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | | -| | | Kerberos keytab file for the ResourceManager. | -*-------------------------+-------------------------+------------------------+ -| <<>> | rm/_HOST@REALM.TLD | | -| | | Kerberos principal name for the ResourceManager. | -*-------------------------+-------------------------+------------------------+ - - * Configurations for NodeManager: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Kerberos keytab file for the NodeManager. | -*-------------------------+-------------------------+------------------------+ -| <<>> | nm/_HOST@REALM.TLD | | -| | | Kerberos principal name for the NodeManager. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | <<>> | -| | | Use LinuxContainerExecutor. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | | Unix group of the NodeManager. | -*-------------------------+-------------------------+------------------------+ - - * <<>> - - * Configurations for MapReduce JobHistory Server: - -*-------------------------+-------------------------+------------------------+ -|| Parameter || Value || Notes | -*-------------------------+-------------------------+------------------------+ -| <<>> | | | -| | MapReduce JobHistory Server | Default port is 10020. | -*-------------------------+-------------------------+------------------------+ -| <<>> | | -| | | | -| | | Kerberos keytab file for the MapReduce JobHistory Server. | -*-------------------------+-------------------------+------------------------+ -| <<>> | jhs/_HOST@REALM.TLD | | -| | | Kerberos principal name for the MapReduce JobHistory Server. | -*-------------------------+-------------------------+------------------------+ - - - * {Operating the Hadoop Cluster} - - Once all the necessary configuration is complete, distribute the files to the - <<>> directory on all the machines. - - This section also describes the various Unix users who should be starting the - various components and uses the same Unix accounts and groups used previously: - - * Hadoop Startup - - To start a Hadoop cluster you will need to start both the HDFS and YARN - cluster. 
- - Format a new distributed filesystem as : - ----- -[hdfs]$ $HADOOP_PREFIX/bin/hdfs namenode -format ----- - - Start the HDFS with the following command, run on the designated NameNode - as : - ----- -[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode ----- - - Run a script to start DataNodes on all slaves as with a special - environment variable <<>> set to : - ----- -[root]$ HADOOP_SECURE_DN_USER=hdfs $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode ----- - - Start the YARN with the following command, run on the designated - ResourceManager as : - ----- -[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager ----- - - Run a script to start NodeManagers on all slaves as : - ----- -[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager ----- - - Start a standalone WebAppProxy server. Run on the WebAppProxy - server as . If multiple servers are used with load balancing - it should be run on each of them: - ----- -[yarn]$ $HADOOP_YARN_HOME/bin/yarn start proxyserver --config $HADOOP_CONF_DIR ----- - - Start the MapReduce JobHistory Server with the following command, run on the - designated server as : - ----- -[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR ----- - - * Hadoop Shutdown - - Stop the NameNode with the following command, run on the designated NameNode - as : - ----- -[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode ----- - - Run a script to stop DataNodes on all slaves as : - ----- -[root]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode ----- - - Stop the ResourceManager with the following command, run on the designated - ResourceManager as : - ----- -[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager ----- - - Run a script to stop NodeManagers on all slaves as : - ----- -[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager ----- - - Stop the WebAppProxy server. Run on the WebAppProxy server as - . If multiple servers are used with load balancing it - should be run on each of them: - ----- -[yarn]$ $HADOOP_YARN_HOME/bin/yarn stop proxyserver --config $HADOOP_CONF_DIR ----- - - Stop the MapReduce JobHistory Server with the following command, run on the - designated server as : - ----- -[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR ----- - -* {Web Interfaces} - - Once the Hadoop cluster is up and running check the web-ui of the - components as described below: - -*-------------------------+-------------------------+------------------------+ -|| Daemon || Web Interface || Notes | -*-------------------------+-------------------------+------------------------+ -| NameNode | http:/// | Default HTTP port is 50070. | -*-------------------------+-------------------------+------------------------+ -| ResourceManager | http:/// | Default HTTP port is 8088. | -*-------------------------+-------------------------+------------------------+ -| MapReduce JobHistory Server | http:/// | | -| | | Default HTTP port is 19888. | -*-------------------------+-------------------------+------------------------+ - - +~~ Licensed under the Apache License, Version 2.0 (the "License"); +~~ you may not use this file except in compliance with the License. 
+~~ You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, software +~~ distributed under the License is distributed on an "AS IS" BASIS, +~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +~~ See the License for the specific language governing permissions and +~~ limitations under the License. See accompanying LICENSE file. + + --- + Hadoop Map Reduce Next Generation-${project.version} - Cluster Setup + --- + --- + ${maven.build.timestamp} + +Hadoop MapReduce Next Generation - Cluster Setup + + \[ {{{./index.html}Go Back}} \] + +%{toc|section=1|fromDepth=0} + +* {Purpose} + + This document describes how to install, configure and manage non-trivial + Hadoop clusters ranging from a few nodes to extremely large clusters + with thousands of nodes. + + To play with Hadoop, you may first want to install it on a single + machine (see {{{SingleCluster}Single Node Setup}}). + +* {Prerequisites} + + Download a stable version of Hadoop from Apache mirrors. + +* {Installation} + + Installing a Hadoop cluster typically involves unpacking the software on all + the machines in the cluster or installing RPMs. + + Typically one machine in the cluster is designated as the NameNode and + another machine the as ResourceManager, exclusively. These are the masters. + + The rest of the machines in the cluster act as both DataNode and NodeManager. + These are the slaves. + +* {Running Hadoop in Non-Secure Mode} + + The following sections describe how to configure a Hadoop cluster. + + * {Configuration Files} + + Hadoop configuration is driven by two types of important configuration files: + + * Read-only default configuration - <<>>, + <<>>, <<>> and + <<>>. + + * Site-specific configuration - <>, + <>, <> and + <>. + + + Additionally, you can control the Hadoop scripts found in the bin/ + directory of the distribution, by setting site-specific values via the + <> and <>. + + * {Site Configuration} + + To configure the Hadoop cluster you will need to configure the + <<>> in which the Hadoop daemons execute as well as the + <<>> for the Hadoop daemons. + + The Hadoop daemons are NameNode/DataNode and ResourceManager/NodeManager. + + + * {Configuring Environment of Hadoop Daemons} + + Administrators should use the <> and + <> script to do site-specific customization of the + Hadoop daemons' process environment. + + At the very least you should specify the <<>> so that it is + correctly defined on each remote node. + + In most cases you should also specify <<>> and + <<>> to point to directories that can only be + written to by the users that are going to run the hadoop daemons. + Otherwise there is the potential for a symlink attack. 
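+
+  For example, a minimal hadoop-env.sh fragment along the lines described
+  above might look like the following sketch. The variable names assume the
+  conventional JAVA_HOME, HADOOP_PID_DIR and HADOOP_SECURE_DN_PID_DIR
+  settings, and the paths are purely illustrative; adapt both to your site:
+
+----
+  # illustrative values only -- point these at daemon-owned locations
+  export JAVA_HOME=/usr/java/default
+  export HADOOP_PID_DIR=/var/run/hadoop
+  export HADOOP_SECURE_DN_PID_DIR=/var/run/hadoop
+----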
+
+  Administrators can configure individual daemons using the configuration
+  options shown below in the table:
+
+*--------------------------------------+--------------------------------------+
+|| Daemon || Environment Variable |
+*--------------------------------------+--------------------------------------+
+| NameNode | HADOOP_NAMENODE_OPTS |
+*--------------------------------------+--------------------------------------+
+| DataNode | HADOOP_DATANODE_OPTS |
+*--------------------------------------+--------------------------------------+
+| Secondary NameNode | HADOOP_SECONDARYNAMENODE_OPTS |
+*--------------------------------------+--------------------------------------+
+| ResourceManager | YARN_RESOURCEMANAGER_OPTS |
+*--------------------------------------+--------------------------------------+
+| NodeManager | YARN_NODEMANAGER_OPTS |
+*--------------------------------------+--------------------------------------+
+| WebAppProxy | YARN_PROXYSERVER_OPTS |
+*--------------------------------------+--------------------------------------+
+| Map Reduce Job History Server | HADOOP_JOB_HISTORYSERVER_OPTS |
+*--------------------------------------+--------------------------------------+
+
+
+  For example, to configure the NameNode to use parallelGC, the following
+  statement should be added in hadoop-env.sh:
+
+----
+  export HADOOP_NAMENODE_OPTS="-XX:+UseParallelGC ${HADOOP_NAMENODE_OPTS}"
+----
+
+  Other useful configuration parameters that you can customize include:
+
+    * <<>> / <<>> - The directory where the
+      daemons' log files are stored. They are automatically created if they
+      don't exist.
+
+    * <<>> / <<>> - The maximum heap size to use, in MB;
+      e.g. if the variable is set to 1000 the heap will be set to 1000 MB.
+      This is used to configure the heap size for the daemon. By default,
+      the value is 1000. To configure the heap size separately for each
+      daemon, use the variables below.
+*--------------------------------------+--------------------------------------+
+|| Daemon || Environment Variable |
+*--------------------------------------+--------------------------------------+
+| ResourceManager | YARN_RESOURCEMANAGER_HEAPSIZE |
+*--------------------------------------+--------------------------------------+
+| NodeManager | YARN_NODEMANAGER_HEAPSIZE |
+*--------------------------------------+--------------------------------------+
+| WebAppProxy | YARN_PROXYSERVER_HEAPSIZE |
+*--------------------------------------+--------------------------------------+
+| Map Reduce Job History Server | HADOOP_JOB_HISTORYSERVER_HEAPSIZE |
+*--------------------------------------+--------------------------------------+
+
+  * {Configuring the Hadoop Daemons in Non-Secure Mode}
+
+  This section deals with important parameters to be specified in
+  the given configuration files:
+
+    * <<>>
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | NameNode URI | |
+*-------------------------+-------------------------+------------------------+
+| <<>> | 131072 | |
+| | | Size of read/write buffer used in SequenceFiles.
| +*-------------------------+-------------------------+------------------------+ + + * <<>> + + * Configurations for NameNode: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Path on the local filesystem where the NameNode stores the namespace | | +| | and transactions logs persistently. | | +| | | If this is a comma-delimited list of directories then the name table is | +| | | replicated in all of the directories, for redundancy. | +*-------------------------+-------------------------+------------------------+ +| <<>> / <<>> | | | +| | List of permitted/excluded DataNodes. | | +| | | If necessary, use these files to control the list of allowable | +| | | datanodes. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 268435456 | | +| | | HDFS blocksize of 256MB for large file-systems. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 100 | | +| | | More NameNode server threads to handle RPCs from large number of | +| | | DataNodes. | +*-------------------------+-------------------------+------------------------+ + + * Configurations for DataNode: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Comma separated list of paths on the local filesystem of a | | +| | <<>> where it should store its blocks. | | +| | | If this is a comma-delimited list of directories, then data will be | +| | | stored in all named directories, typically on different devices. | +*-------------------------+-------------------------+------------------------+ + + * <<>> + + * Configurations for ResourceManager and NodeManager: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> / <<>> | | +| | | Enable ACLs? Defaults to . | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Admin ACL | | +| | | ACL to set admins on the cluster. | +| | | ACLs are of for . | +| | | Defaults to special value of <<*>> which means . | +| | | Special value of just means no one has access. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | | +| | | Configuration to enable or disable log aggregation | +*-------------------------+-------------------------+------------------------+ + + + * Configurations for ResourceManager: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> host:port for clients to submit jobs. | | +| | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> host:port for ApplicationMasters to talk to | | +| | Scheduler to obtain resources. | | +| | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> host:port for NodeManagers. | | +| | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> host:port for administrative commands. 
| | +| | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> web-ui host:port. | | +| | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> Scheduler class. | | +| | | <<>> (recommended) or <<>> | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Minimum limit of memory to allocate to each container request at the <<>>. | | +| | | In MBs | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Maximum limit of memory to allocate to each container request at the <<>>. | | +| | | In MBs | +*-------------------------+-------------------------+------------------------+ +| <<>> / | | | +| <<>> | | | +| | List of permitted/excluded NodeManagers. | | +| | | If necessary, use these files to control the list of allowable | +| | | NodeManagers. | +*-------------------------+-------------------------+------------------------+ + + * Configurations for NodeManager: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Resource i.e. available physical memory, in MB, for given <<>> | | +| | | Defines total available resources on the <<>> to be made | +| | | available to running containers | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Maximum ratio by which virtual memory usage of tasks may exceed | +| | physical memory | | +| | | The virtual memory usage of each task may exceed its physical memory | +| | | limit by this ratio. The total amount of virtual memory used by tasks | +| | | on the NodeManager may exceed its physical memory usage by this ratio. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Comma-separated list of paths on the local filesystem where | | +| | intermediate data is written. || +| | | Multiple paths help spread disk i/o. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | Comma-separated list of paths on the local filesystem where logs | | +| | are written. | | +| | | Multiple paths help spread disk i/o. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <10800> | | +| | | Default time (in seconds) to retain log files on the NodeManager | +| | | Only applicable if log-aggregation is disabled. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | | +| | | HDFS directory where the application logs are moved on application | +| | | completion. Need to set appropriate permissions. | +| | | Only applicable if log-aggregation is enabled. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | | +| | | Suffix appended to the remote log dir. Logs will be aggregated to | +| | | $\{yarn.nodemanager.remote-app-log-dir\}/$\{user\}/$\{thisParam\} | +| | | Only applicable if log-aggregation is enabled. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | mapreduce.shuffle | | +| | | Shuffle service that needs to be set for Map Reduce applications. 
| +*-------------------------+-------------------------+------------------------+ + + * Configurations for History Server (Needs to be moved elsewhere): + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <-1> | | +| | | How long to keep aggregation logs before deleting them. -1 disables. | +| | | Be careful, set this too small and you will spam the name node. | +*-------------------------+-------------------------+------------------------+ + + + + * <<>> + + * Configurations for MapReduce Applications: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | yarn | | +| | | Execution framework set to Hadoop YARN. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 1536 | | +| | | Larger resource limit for maps. | +*-------------------------+-------------------------+------------------------+ +| <<>> | -Xmx1024M | | +| | | Larger heap-size for child jvms of maps. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 3072 | | +| | | Larger resource limit for reduces. | +*-------------------------+-------------------------+------------------------+ +| <<>> | -Xmx2560M | | +| | | Larger heap-size for child jvms of reduces. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 512 | | +| | | Higher memory-limit while sorting data for efficiency. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 100 | | +| | | More streams merged at once while sorting files. | +*-------------------------+-------------------------+------------------------+ +| <<>> | 50 | | +| | | Higher number of parallel copies run by reduces to fetch outputs | +| | | from very large number of maps. | +*-------------------------+-------------------------+------------------------+ + + * Configurations for MapReduce JobHistory Server: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | MapReduce JobHistory Server | Default port is 10020. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | MapReduce JobHistory Server Web UI | Default port is 19888. | +*-------------------------+-------------------------+------------------------+ +| <<>> | /mr-history/tmp | | +| | | Directory where history files are written by MapReduce jobs. | +*-------------------------+-------------------------+------------------------+ +| <<>> | /mr-history/done| | +| | | Directory where history files are managed by the MR JobHistory Server. | +*-------------------------+-------------------------+------------------------+ + + * Hadoop Rack Awareness + + The HDFS and the YARN components are rack-aware. + + The NameNode and the ResourceManager obtains the rack information of the + slaves in the cluster by invoking an API in an administrator + configured module. + + The API resolves the DNS name (also IP address) to a rack id. + + The site-specific module to use can be configured using the configuration + item <<>>. 
The default implementation
+  runs a script/command configured using
+  <<>>. If <<>> is
+  not set, the rack id is returned for any passed IP address.
+
+  * Monitoring Health of NodeManagers
+
+  Hadoop provides a mechanism by which administrators can configure the
+  NodeManager to run an administrator-supplied script periodically to
+  determine if a node is healthy or not.
+
+  Administrators can determine if the node is in a healthy state by
+  performing any checks of their choice in the script. If the script
+  detects the node to be in an unhealthy state, it must print a line to
+  standard output beginning with the string ERROR. The NodeManager spawns
+  the script periodically and checks its output. If the script's output
+  contains the string ERROR, as described above, the node's status is
+  reported as <<>> and the node is blacklisted by the
+  ResourceManager. No further tasks will be assigned to this node.
+  However, the NodeManager continues to run the script, so that if the
+  node becomes healthy again, it will be removed from the blacklisted nodes
+  on the ResourceManager automatically. The node's health, along with the
+  output of the script if it is unhealthy, is available to the
+  administrator in the ResourceManager web interface. The time since the
+  node was healthy is also displayed on the web interface.
+
+  The following parameters can be used to control the node health
+  monitoring script in <<>>.
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | Node health script | |
+| | | Script to check for node's health status. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | Node health script options | |
+| | | Options for script to check for node's health status. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | Node health script interval | |
+| | | Time interval for running health script. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | Node health script timeout interval | |
+| | | Timeout for health script execution. |
+*-------------------------+-------------------------+------------------------+
+
+  The health checker script is not supposed to give ERROR if only some of the
+  local disks become bad. The NodeManager has the ability to periodically
+  check the health of the local disks (specifically nodemanager-local-dirs
+  and nodemanager-log-dirs) and, after reaching the threshold of bad
+  directories based on the value set for the config property
+  yarn.nodemanager.disk-health-checker.min-healthy-disks, the whole node is
+  marked unhealthy and this information is also sent to the ResourceManager.
+  The boot disk is an exception: it is assumed either to be RAIDed or to have
+  its failures detected by the health checker script.
+
+  * {Slaves file}
+
+  Typically you choose one machine in the cluster to act as the NameNode and
+  one machine to act as the ResourceManager, exclusively. The rest of the
+  machines act as both a DataNode and NodeManager and are referred to as .
+
+  List all slave hostnames or IP addresses in your <<>> file,
+  one per line.
+
+  * {Logging}
+
+  Hadoop uses Apache log4j via the Apache Commons Logging framework for
+  logging. Edit the <<>> file to customize the
+  Hadoop daemons' logging configuration (log-formats and so on).
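+
+  For instance, the logging level of a single client-side command can
+  usually be raised without editing the file at all, by overriding the
+  root logger on the command line. A sketch (HADOOP_ROOT_LOGGER is the
+  variable honoured by the Hadoop shell scripts; the command shown is just
+  an illustrative example):
+
+----
+  $ HADOOP_ROOT_LOGGER=DEBUG,console $HADOOP_PREFIX/bin/hdfs dfs -ls /
+----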
+ + * {Operating the Hadoop Cluster} + + Once all the necessary configuration is complete, distribute the files to the + <<>> directory on all the machines. + + * Hadoop Startup + + To start a Hadoop cluster you will need to start both the HDFS and YARN + cluster. + + Format a new distributed filesystem: + +---- + $ $HADOOP_PREFIX/bin/hdfs namenode -format +---- + + Start the HDFS with the following command, run on the designated NameNode: + +---- + $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode +---- + + Run a script to start DataNodes on all slaves: + +---- + $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode +---- + + Start the YARN with the following command, run on the designated + ResourceManager: + +---- + $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager +---- + + Run a script to start NodeManagers on all slaves: + +---- + $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager +---- + + Start a standalone WebAppProxy server. If multiple servers + are used with load balancing it should be run on each of them: + +---- + $ $HADOOP_YARN_HOME/bin/yarn start proxyserver --config $HADOOP_CONF_DIR +---- + + Start the MapReduce JobHistory Server with the following command, run on the + designated server: + +---- + $ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR +---- + + * Hadoop Shutdown + + Stop the NameNode with the following command, run on the designated + NameNode: + +---- + $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode +---- + + Run a script to stop DataNodes on all slaves: + +---- + $ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode +---- + + Stop the ResourceManager with the following command, run on the designated + ResourceManager: + +---- + $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager +---- + + Run a script to stop NodeManagers on all slaves: + +---- + $ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager +---- + + Stop the WebAppProxy server. If multiple servers are used with load + balancing it should be run on each of them: + +---- + $ $HADOOP_YARN_HOME/bin/yarn stop proxyserver --config $HADOOP_CONF_DIR +---- + + + Stop the MapReduce JobHistory Server with the following command, run on the + designated server: + +---- + $ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR +---- + + +* {Running Hadoop in Secure Mode} + + This section deals with important parameters to be specified in + to run Hadoop in <> with strong, Kerberos-based + authentication. + + * <<>> + + Ensure that HDFS and YARN daemons run as different Unix users, for e.g. + <<>> and <<>>. Also, ensure that the MapReduce JobHistory + server runs as user <<>>. + + It's recommended to have them share a Unix group, for e.g. <<>>. 
+ +*--------------------------------------+----------------------------------------------------------------------+ +|| User:Group || Daemons | +*--------------------------------------+----------------------------------------------------------------------+ +| hdfs:hadoop | NameNode, Secondary NameNode, Checkpoint Node, Backup Node, DataNode | +*--------------------------------------+----------------------------------------------------------------------+ +| yarn:hadoop | ResourceManager, NodeManager | +*--------------------------------------+----------------------------------------------------------------------+ +| mapred:hadoop | MapReduce JobHistory Server | +*--------------------------------------+----------------------------------------------------------------------+ + + * <<>> + + The following table lists various paths on HDFS and local filesystems (on + all nodes) and recommended permissions: + +*-------------------+-------------------+------------------+------------------+ +|| Filesystem || Path || User:Group || Permissions | +*-------------------+-------------------+------------------+------------------+ +| local | <<>> | hdfs:hadoop | drwx------ | +*-------------------+-------------------+------------------+------------------+ +| local | <<>> | hdfs:hadoop | drwx------ | +*-------------------+-------------------+------------------+------------------+ +| local | $HADOOP_LOG_DIR | hdfs:hadoop | drwxrwxr-x | +*-------------------+-------------------+------------------+------------------+ +| local | $YARN_LOG_DIR | yarn:hadoop | drwxrwxr-x | +*-------------------+-------------------+------------------+------------------+ +| local | <<>> | yarn:hadoop | drwxr-xr-x | +*-------------------+-------------------+------------------+------------------+ +| local | <<>> | yarn:hadoop | drwxr-xr-x | +*-------------------+-------------------+------------------+------------------+ +| local | container-executor | root:hadoop | --Sr-s--- | +*-------------------+-------------------+------------------+------------------+ +| local | <<>> | root:hadoop | r-------- | +*-------------------+-------------------+------------------+------------------+ +| hdfs | / | hdfs:hadoop | drwxr-xr-x | +*-------------------+-------------------+------------------+------------------+ +| hdfs | /tmp | hdfs:hadoop | drwxrwxrwxt | +*-------------------+-------------------+------------------+------------------+ +| hdfs | /user | hdfs:hadoop | drwxr-xr-x | +*-------------------+-------------------+------------------+------------------+ +| hdfs | <<>> | yarn:hadoop | drwxrwxrwxt | +*-------------------+-------------------+------------------+------------------+ +| hdfs | <<>> | mapred:hadoop | | +| | | | drwxrwxrwxt | +*-------------------+-------------------+------------------+------------------+ +| hdfs | <<>> | mapred:hadoop | | +| | | | drwxr-x--- | +*-------------------+-------------------+------------------+------------------+ + + * Kerberos Keytab files + + * HDFS + + The NameNode keytab file, on the NameNode host, should look like the + following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/nn.service.keytab +Keytab name: FILE:/etc/security/keytab/nn.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 nn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 
21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + The Secondary NameNode keytab file, on that host, should look like the + following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/sn.service.keytab +Keytab name: FILE:/etc/security/keytab/sn.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 sn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + The DataNode keytab file, on each host, should look like the following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/dn.service.keytab +Keytab name: FILE:/etc/security/keytab/dn.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 dn/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + * YARN + + The ResourceManager keytab file, on the ResourceManager host, should look + like the following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/rm.service.keytab +Keytab name: FILE:/etc/security/keytab/rm.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 rm/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + The NodeManager keytab file, on each host, should look like the following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/nm.service.keytab +Keytab name: FILE:/etc/security/keytab/nm.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 nm/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 
host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + * MapReduce JobHistory Server + + The MapReduce JobHistory Server keytab file, on that host, should look + like the following: + +---- + +$ /usr/kerberos/bin/klist -e -k -t /etc/security/keytab/jhs.service.keytab +Keytab name: FILE:/etc/security/keytab/jhs.service.keytab +KVNO Timestamp Principal + 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 jhs/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-256 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (AES-128 CTS mode with 96-bit SHA-1 HMAC) + 4 07/18/11 21:08:09 host/full.qualified.domain.name@REALM.TLD (ArcFour with HMAC/md5) + +---- + + * Configuration in Secure Mode + + * <<>> + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | is non-secure. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | Enable RPC service-level authorization. | +*-------------------------+-------------------------+------------------------+ + + * <<>> + + * Configurations for NameNode: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | Enable HDFS block access tokens for secure operations. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | <50470> | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | Kerberos keytab file for the NameNode. | +*-------------------------+-------------------------+------------------------+ +| <<>> | nn/_HOST@REALM.TLD | | +| | | Kerberos principal name for the NameNode. | +*-------------------------+-------------------------+------------------------+ +| <<>> | host/_HOST@REALM.TLD | | +| | | HTTPS Kerberos principal name for the NameNode. | +*-------------------------+-------------------------+------------------------+ + + * Configurations for Secondary NameNode: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +*-------------------------+-------------------------+------------------------+ +| <<>> | <50470> | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | | +| | | Kerberos keytab file for the NameNode. | +*-------------------------+-------------------------+------------------------+ +| <<>> | sn/_HOST@REALM.TLD | | +| | | Kerberos principal name for the Secondary NameNode. 
| +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | host/_HOST@REALM.TLD | | +| | | HTTPS Kerberos principal name for the Secondary NameNode. | +*-------------------------+-------------------------+------------------------+ + + * Configurations for DataNode: + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | 700 | | +*-------------------------+-------------------------+------------------------+ +| <<>> | <0.0.0.0:2003> | | +*-------------------------+-------------------------+------------------------+ +| <<>> | <0.0.0.0:2005> | | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | Kerberos keytab file for the DataNode. | +*-------------------------+-------------------------+------------------------+ +| <<>> | dn/_HOST@REALM.TLD | | +| | | Kerberos principal name for the DataNode. | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | host/_HOST@REALM.TLD | | +| | | HTTPS Kerberos principal name for the DataNode. | +*-------------------------+-------------------------+------------------------+ + + * <<>> + + * WebAppProxy + + The <<>> provides a proxy between the web applications + exported by an application and an end user. If security is enabled + it will warn users before accessing a potentially unsafe web application. + Authentication and authorization using the proxy is handled just like + any other privileged web application. + +*-------------------------+-------------------------+------------------------+ +|| Parameter || Value || Notes | +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | <<>> host:port for proxy to AM web apps. | | +| | | if this is the same as <<>>| +| | | or it is not defined then the <<>> will run the proxy| +| | | otherwise a standalone proxy server will need to be launched.| +*-------------------------+-------------------------+------------------------+ +| <<>> | | | +| | | | +| | | Kerberos keytab file for the WebAppProxy. | +*-------------------------+-------------------------+------------------------+ +| <<>> | wap/_HOST@REALM.TLD | | +| | | Kerberos principal name for the WebAppProxy. | +*-------------------------+-------------------------+------------------------+ + + * LinuxContainerExecutor + + A <<>> used by YARN framework which define how any + launched and controlled. + + The following are the available in Hadoop YARN: + +*--------------------------------------+--------------------------------------+ +|| ContainerExecutor || Description | +*--------------------------------------+--------------------------------------+ +| <<>> | | +| | The default executor which YARN uses to manage container execution. | +| | The container process has the same Unix user as the NodeManager. | +*--------------------------------------+--------------------------------------+ +| <<>> | | +| | Supported only on GNU/Linux, this executor runs the containers as the | +| | user who submitted the application. It requires all user accounts to be | +| | created on the cluster nodes where the containers are launched. It uses | +| | a executable that is included in the Hadoop distribution. | +| | The NodeManager uses this executable to launch and kill containers. 
+
+ * LinuxContainerExecutor
+
+ A <<>> used by the YARN framework which defines how any container is
+ launched and controlled.
+
+ The following container executors are available in Hadoop YARN:
+
+*--------------------------------------+--------------------------------------+
+|| ContainerExecutor || Description |
+*--------------------------------------+--------------------------------------+
+| <<>> | |
+| | The default executor which YARN uses to manage container execution. |
+| | The container process has the same Unix user as the NodeManager. |
+*--------------------------------------+--------------------------------------+
+| <<>> | |
+| | Supported only on GNU/Linux, this executor runs the containers as the |
+| | user who submitted the application. It requires all user accounts to be |
+| | created on the cluster nodes where the containers are launched. It uses |
+| | a setuid executable that is included in the Hadoop distribution. |
+| | The NodeManager uses this executable to launch and kill containers. |
+| | The setuid executable switches to the user who has submitted the |
+| | application and launches or kills the containers. For maximum security, |
+| | this executor sets up restricted permissions and user/group ownership of |
+| | local files and directories used by the containers such as the shared |
+| | objects, jars, intermediate files, log files etc. In particular, |
+| | because of this, no user other than the application owner and the |
+| | NodeManager can access any of the local files/directories, including |
+| | those localized as part of the distributed cache. |
+*--------------------------------------+--------------------------------------+
+
+ To build the LinuxContainerExecutor executable run:
+
+----
+ $ mvn package -Dcontainer-executor.conf.dir=/etc/hadoop/
+----
+
+ The path passed in <<<-Dcontainer-executor.conf.dir>>> should be the
+ path on the cluster nodes where a configuration file for the setuid
+ executable should be located. The executable should be installed in
+ $HADOOP_YARN_HOME/bin.
+
+ The executable must have specific permissions: 6050 or --Sr-s---
+ permissions, user-owned by (super-user) and group-owned by a
+ special group (e.g. <<>>) of which the NodeManager Unix user is
+ a group member and no ordinary application user is. If any application
+ user belongs to this special group, security will be compromised. This
+ special group name should be specified for the configuration property
+ <<>> in both
+ <<>> and <<>>.
+
+ For example, let's say that the NodeManager is run as user who is
+ part of the groups users and , any of them being the primary group.
+ Let also be that has both and another user
+ (application submitter) as its members, and does not
+ belong to . Going by the above description, the setuid/setgid
+ executable should be set 6050 or --Sr-s--- with user-owner as and
+ group-owner as which has as its member (and not
+ which has also as its member besides ).
+
+ The LinuxContainerExecutor requires that paths including and leading up to
+ the directories specified in <<>> and
+ <<>> be set to 755 permissions as described
+ above in the table on permissions on directories.
+
+ * <<>>
+
+ The executable requires a configuration file called
+ <<>> to be present in the configuration
+ directory passed to the mvn target mentioned above.
+
+ The configuration file must be owned by the user running the NodeManager
+ (user <<>> in the above example), group-owned by anyone and
+ should have the permissions 0400 or r--------.
+
+ The executable requires the following configuration items to be present
+ in the <<>> file. The items should be
+ specified as simple key=value pairs, one per line; a sample file follows
+ the table below:
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | | Unix group of the NodeManager. The group owner of the |
+| | | binary should be this group. Should be the same as the |
+| | | value with which the NodeManager is configured. This configuration is |
+| | | required for validating the secure access of the |
+| | | binary. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | hdfs,yarn,mapred,bin | Banned users. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | 1000 | Prevent other super-users. |
+*-------------------------+-------------------------+------------------------+
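+
+ For illustration, a minimal configuration file matching the table above
+ could look as follows. The key names are the ones conventionally read by
+ the binary and should be verified against the Hadoop release in use: the
+ first entry names the NodeManager Unix group, the second lists accounts
+ that may never run containers, and the third rejects any uid below 1000:
+
+----
+yarn.nodemanager.linux-container-executor.group=hadoop
+banned.users=hdfs,yarn,mapred,bin
+min.user.id=1000
+----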
+
+ To recap, here are the local file-system permissions required for the
+ various paths related to the <<>> (commands to apply them follow the
+ table):
+
+*-------------------+-------------------+------------------+------------------+
+|| Filesystem || Path || User:Group || Permissions |
+*-------------------+-------------------+------------------+------------------+
+| local | container-executor | root:hadoop | --Sr-s--- |
+*-------------------+-------------------+------------------+------------------+
+| local | <<>> | root:hadoop | r-------- |
+*-------------------+-------------------+------------------+------------------+
+| local | <<>> | yarn:hadoop | drwxr-xr-x |
+*-------------------+-------------------+------------------+------------------+
+| local | <<>> | yarn:hadoop | drwxr-xr-x |
+*-------------------+-------------------+------------------+------------------+
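+
+ Following the table above, these permissions can be applied with standard
+ commands. The binary location matches the install directory mentioned
+ earlier; the configuration file name and directory are assumed to be the
+ ones passed to the mvn build and should be adjusted if different:
+
+----
+# setuid/setgid binary: root-owned, group hadoop, mode 6050 (--Sr-s---)
+[root]$ chown root:hadoop $HADOOP_YARN_HOME/bin/container-executor
+[root]$ chmod 6050 $HADOOP_YARN_HOME/bin/container-executor
+
+# its configuration file: readable by root only, mode 0400 (r--------)
+[root]$ chown root:hadoop /etc/hadoop/container-executor.cfg
+[root]$ chmod 0400 /etc/hadoop/container-executor.cfg
+----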
+
+ * Configurations for ResourceManager:
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | | |
+| | | Kerberos keytab file for the ResourceManager. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | rm/_HOST@REALM.TLD | |
+| | | Kerberos principal name for the ResourceManager. |
+*-------------------------+-------------------------+------------------------+
+
+ * Configurations for NodeManager:
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | | Kerberos keytab file for the NodeManager. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | nm/_HOST@REALM.TLD | |
+| | | Kerberos principal name for the NodeManager. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | <<>> |
+| | | Use LinuxContainerExecutor. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | | Unix group of the NodeManager. |
+*-------------------------+-------------------------+------------------------+
+
+ * <<>>
+
+ * Configurations for MapReduce JobHistory Server:
+
+*-------------------------+-------------------------+------------------------+
+|| Parameter || Value || Notes |
+*-------------------------+-------------------------+------------------------+
+| <<>> | | |
+| | MapReduce JobHistory Server | Default port is 10020. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | |
+| | | |
+| | | Kerberos keytab file for the MapReduce JobHistory Server. |
+*-------------------------+-------------------------+------------------------+
+| <<>> | jhs/_HOST@REALM.TLD | |
+| | | Kerberos principal name for the MapReduce JobHistory Server. |
+*-------------------------+-------------------------+------------------------+
+
+
+* {Operating the Hadoop Cluster}
+
+ Once all the necessary configuration is complete, distribute the files to the
+ <<>> directory on all the machines.
+
+ This section also describes the various Unix users who should start the
+ various components; it uses the same Unix accounts and groups as before:
+
+ * Hadoop Startup
+
+ To start a Hadoop cluster you will need to start both the HDFS and YARN
+ clusters.
+
+ Format a new distributed filesystem as :
+
+----
+[hdfs]$ $HADOOP_PREFIX/bin/hdfs namenode -format
+----
+
+ Start HDFS with the following command, run on the designated NameNode
+ as :
+
+----
+[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode
+----
+
+ Run a script to start DataNodes on all slaves as with a special
+ environment variable <<>> set to :
+
+----
+[root]$ HADOOP_SECURE_DN_USER=hdfs $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode
+----
+
+ Start YARN with the following command, run on the designated
+ ResourceManager as :
+
+----
+[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager
+----
+
+ Run a script to start NodeManagers on all slaves as :
+
+----
+[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager
+----
+
+ Start a standalone WebAppProxy server. Run on the WebAppProxy
+ server as . If multiple servers are used with load balancing
+ it should be run on each of them:
+
+----
+[yarn]$ $HADOOP_YARN_HOME/bin/yarn start proxyserver --config $HADOOP_CONF_DIR
+----
+
+ Start the MapReduce JobHistory Server with the following command, run on the
+ designated server as :
+
+----
+[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR
+----
+
+ * Hadoop Shutdown
+
+ Stop the NameNode with the following command, run on the designated NameNode
+ as :
+
+----
+[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode
+----
+
+ Run a script to stop DataNodes on all slaves as :
+
+----
+[root]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode
+----
+
+ Stop the ResourceManager with the following command, run on the designated
+ ResourceManager as :
+
+----
+[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager
+----
+
+ Run a script to stop NodeManagers on all slaves as :
+
+----
+[yarn]$ $HADOOP_YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager
+----
+
+ Stop the WebAppProxy server. Run on the WebAppProxy server as
+ . If multiple servers are used with load balancing it
+ should be run on each of them:
+
+----
+[yarn]$ $HADOOP_YARN_HOME/bin/yarn stop proxyserver --config $HADOOP_CONF_DIR
+----
+
+ Stop the MapReduce JobHistory Server with the following command, run on the
+ designated server as :
+
+----
+[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR
+----
+
+* {Web Interfaces}
+
+ Once the Hadoop cluster is up and running check the web UI of the
+ components as described below:
+
+*-------------------------+-------------------------+------------------------+
+|| Daemon || Web Interface || Notes |
+*-------------------------+-------------------------+------------------------+
+| NameNode | http:/// | Default HTTP port is 50070. |
+*-------------------------+-------------------------+------------------------+
+| ResourceManager | http:/// | Default HTTP port is 8088. |
+*-------------------------+-------------------------+------------------------+
+| MapReduce JobHistory Server | http:/// | |
+| | | Default HTTP port is 19888. |
+*-------------------------+-------------------------+------------------------+
+
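+ As a quick smoke test, each web interface listed above can be probed from
+ the command line. The host names below are placeholders and the ports are
+ the defaults from the table:
+
+----
+# each request should get an HTTP response on the default port
+$ curl -s -o /dev/null -w "%{http_code}\n" http://nn.example.com:50070/
+$ curl -s -o /dev/null -w "%{http_code}\n" http://rm.example.com:8088/
+$ curl -s -o /dev/null -w "%{http_code}\n" http://jhs.example.com:19888/
+
+# jps on each host should list the corresponding daemon process,
+# e.g. NameNode, ResourceManager or JobHistoryServer
+$ jps
+----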