HBASE-4285 partitions file created in user's home directory by importtsv

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1457078 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2013-03-15 19:45:09 +00:00
parent afa31acc64
commit 167766c262
2 changed files with 47 additions and 28 deletions

IntegrationTestImportTsv.java

@@ -5,6 +5,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Set;
@@ -15,6 +16,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.IntegrationTestingUtility;
@@ -25,6 +27,7 @@ import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
 import org.apache.hadoop.util.GenericOptionsParser;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -137,6 +140,18 @@ public class IntegrationTestImportTsv implements Configurable, Tool {
     }
   }
 
+  /**
+   * Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
+   */
+  protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
+    if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
+      return;
+
+    FileSystem fs = FileSystem.get(conf);
+    Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
+    assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
+  }
+
   @Test
   public void testGenerateAndLoad() throws Exception {
     String table = NAME + "-" + UUID.randomUUID();
@@ -155,8 +170,13 @@ public class IntegrationTestImportTsv implements Configurable, Tool {
     // run the job, complete the load.
     util.createTable(table, cf);
-    TestImportTsv.doMROnTableTest(util, cf, simple_tsv, args);
+    Tool t = TestImportTsv.doMROnTableTest(util, cf, simple_tsv, args);
     doLoadIncrementalHFiles(hfiles, table);
 
+    // validate post-conditions
+    validateDeletedPartitionsFile(t.getConf());
+
     // clean up after ourselves.
     util.deleteTable(table);
     util.cleanupDataTestDirOnTestFS(table);
   }
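
Note on the new assertion: TotalOrderPartitioner records the location of its partitions file in the job configuration, so the test can replay the same lookup the partitioner performs at runtime and assert the file is gone afterwards. A minimal standalone sketch of that round-trip (not part of the commit; the /tmp path is illustrative):

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    public class PartitionFileCheckSketch {
      public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        // What HFileOutputFormat.configurePartitioner does at setup time:
        TotalOrderPartitioner.setPartitionFile(conf, new Path("/tmp/partitions_example"));
        // ... job runs here ...
        // What validateDeletedPartitionsFile does afterwards:
        Path registered = new Path(TotalOrderPartitioner.getPartitionFile(conf));
        FileSystem fs = FileSystem.get(conf);
        System.out.println("partitions file still present? " + fs.exists(registered));
      }
    }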

org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java

@@ -20,8 +20,6 @@ package org.apache.hadoop.hbase.mapreduce;
 
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
-import java.net.URI;
-import java.net.URISyntaxException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.util.ArrayList;
@@ -37,7 +35,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HColumnDescriptor;
@@ -54,9 +51,9 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig;
 import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoder;
 import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
 import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
+import org.apache.hadoop.hbase.regionserver.BloomType;
 import org.apache.hadoop.hbase.regionserver.HStore;
 import org.apache.hadoop.hbase.regionserver.StoreFile;
-import org.apache.hadoop.hbase.regionserver.BloomType;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.SequenceFile;
@@ -267,13 +264,12 @@ public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable,
   }
 
   /**
-   * Write out a SequenceFile that can be read by TotalOrderPartitioner
-   * that contains the split points in startKeys.
-   * @param partitionsPath output path for SequenceFile
-   * @param startKeys the region start keys
+   * Write out a {@link SequenceFile} that can be read by
+   * {@link TotalOrderPartitioner} that contains the split points in startKeys.
    */
   private static void writePartitions(Configuration conf, Path partitionsPath,
       List<ImmutableBytesWritable> startKeys) throws IOException {
+    LOG.info("Writing partition information to " + partitionsPath);
     if (startKeys.isEmpty()) {
       throw new IllegalArgumentException("No regions passed");
     }
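
Aside: the body of writePartitions is unchanged by this commit and not shown here. For orientation, a hedged sketch of the shape such a method conventionally takes, since TotalOrderPartitioner expects a SequenceFile of sorted split points keyed by ImmutableBytesWritable with NullWritable values (an assumption from the partitioner's contract, not code from this diff):

    import java.io.IOException;
    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.SequenceFile;

    class PartitionsFileSketch {
      static void writeSplitPoints(Configuration conf, Path partitionsPath,
          List<ImmutableBytesWritable> sortedStartKeys) throws IOException {
        FileSystem fs = partitionsPath.getFileSystem(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(
            fs, conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
        try {
          for (ImmutableBytesWritable startKey : sortedStartKeys) {
            writer.append(startKey, NullWritable.get());
          }
        } finally {
          writer.close();
        }
      }
    }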
@@ -325,7 +321,6 @@ public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable,
       throws IOException {
     Configuration conf = job.getConfiguration();
-    job.setPartitionerClass(TotalOrderPartitioner.class);
 
     job.setOutputKeyClass(ImmutableBytesWritable.class);
     job.setOutputValueClass(KeyValue.class);
     job.setOutputFormatClass(HFileOutputFormat.class);
@@ -341,29 +336,14 @@ public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable,
       LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
     }
 
     // Use table's region boundaries for TOP split points.
     LOG.info("Looking up current regions for table " + table);
     List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
     LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
         "to match current region count");
     job.setNumReduceTasks(startKeys.size());
 
-    Path partitionsPath = new Path(job.getWorkingDirectory(),
-        "partitions_" + UUID.randomUUID());
-    LOG.info("Writing partition information to " + partitionsPath);
-
-    FileSystem fs = partitionsPath.getFileSystem(conf);
-    writePartitions(conf, partitionsPath, startKeys);
-    partitionsPath.makeQualified(fs);
-
-    URI cacheUri;
-    try {
-      cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
-    } catch (URISyntaxException e) {
-      throw new IOException(e);
-    }
-    DistributedCache.addCacheFile(cacheUri, conf);
-    DistributedCache.createSymlink(conf);
-
+    configurePartitioner(job, startKeys);
 
     // Set compression algorithms based on column families
     configureCompression(table, conf);
     configureBloomType(table, conf);
@@ -415,7 +395,26 @@ public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable,
     }
     return confValMap;
   }
 
+  /**
+   * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
+   * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
+   */
+  static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
+      throws IOException {
+
+    // create the partitions file
+    FileSystem fs = FileSystem.get(job.getConfiguration());
+    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
+    fs.makeQualified(partitionsPath);
+    fs.deleteOnExit(partitionsPath);
+    writePartitions(job.getConfiguration(), partitionsPath, splitPoints);
+
+    // configure job to use it
+    job.setPartitionerClass(TotalOrderPartitioner.class);
+    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
+  }
+
   /**
    * Serialize column family to compression algorithm map to configuration.
    * Invoked while configuring the MR job for incremental load.
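
Net effect of the change: previously the partitions file was written under job.getWorkingDirectory(), which resolves to the submitting user's HDFS home directory by default, and was published through the DistributedCache but never deleted. The new configurePartitioner writes it under /tmp and registers it with FileSystem.deleteOnExit, so it is removed when the client's FileSystem closes (normally via the JVM shutdown hook). A minimal sketch of that contract (illustrative, not from the commit):

    import java.io.IOException;
    import java.util.UUID;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class DeleteOnExitSketch {
      public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path scratch = new Path("/tmp", "partitions_" + UUID.randomUUID());
        fs.create(scratch).close();   // stand-in for writePartitions(...)
        fs.deleteOnExit(scratch);     // queued for deletion when fs closes
        // On a clean JVM exit the shutdown hook removes `scratch`; a killed
        // JVM skips the hook, which is why the integration test asserts the
        // file is gone when running against a real cluster.
      }
    }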