From a603bd9547a2ec36301ab35c1dfe73f623d43143 Mon Sep 17 00:00:00 2001 From: Himanshu Gupta Date: Thu, 20 Aug 2015 17:02:01 -0500 Subject: [PATCH] HadoopGlobPathSplitter implementation to split hadoop glob paths This can be safely reverted once https://issues.apache.org/jira/browse/MAPREDUCE-5061 is fixed --- .../indexer/path/HadoopGlobPathSplitter.java | 104 +++++++++++++ .../path/HadoopGlobPathSplitterTest.java | 137 ++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 indexing-hadoop/src/main/java/io/druid/indexer/path/HadoopGlobPathSplitter.java create mode 100644 indexing-hadoop/src/test/java/io/druid/indexer/path/HadoopGlobPathSplitterTest.java diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/path/HadoopGlobPathSplitter.java b/indexing-hadoop/src/main/java/io/druid/indexer/path/HadoopGlobPathSplitter.java new file mode 100644 index 00000000000..0bcd862d9a4 --- /dev/null +++ b/indexing-hadoop/src/main/java/io/druid/indexer/path/HadoopGlobPathSplitter.java @@ -0,0 +1,104 @@ +/* +* Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
//Note: This class has been created to workaround https://issues.apache.org/jira/browse/MAPREDUCE-5061
public class HadoopGlobPathSplitter
{

  /**
   * Splits given hadoop glob path by commas, expanding every <code>{x,y}</code> alternation
   * group into one output path per alternative.
   * e.g. splitGlob("/a,/b") -> ["/a","/b"]
   *      splitGlob("/a/{c,d}") -> ["/a/c", "/a/d"]
   *
   * Groups may be nested and may appear several times in one path; the output is the cross
   * product of all alternatives, in left-to-right order. Every character other than
   * '{', '}' and ',' (including '*' and '\') is copied through unchanged, so escaped
   * braces or commas are NOT recognised.
   *
   * Malformed input is handled leniently rather than rejected: an unmatched '}' at the top
   * level stops processing and the remainder of the input is ignored, and an unclosed '{'
   * is treated as if it were closed at the end of the input.
   *
   * @param path the glob path to split; must not be null
   * @return the expanded paths; never empty (an empty input yields one empty path)
   */
  public static List<StringBuilder> splitGlob(String path)
  {
    return splitGlob(new CharStream(path));
  }

  /**
   * Parses one nesting level from the stream. Returns when the matching '}' of the level
   * has been consumed or when the stream is exhausted.
   */
  private static List<StringBuilder> splitGlob(CharStream path)
  {
    // Alternatives of this level that were already terminated by a ','.
    List<StringBuilder> result = new ArrayList<>();

    // Alternative(s) currently being built. This holds more than one builder as soon as a
    // nested group has been expanded inside the current alternative.
    List<StringBuilder> current = new ArrayList<>();
    current.add(new StringBuilder());

    while (path.hasMore()) {
      char c = path.next();
      switch (c) {
        case '{':
          // Recursively expand the nested group, then replace every partial path built so
          // far with one copy per nested alternative (cross product).
          List<StringBuilder> childResult = splitGlob(path);
          List<StringBuilder> oldCurrent = current;
          current = new ArrayList<>();

          for (StringBuilder sb1 : oldCurrent) {
            for (StringBuilder sb2 : childResult) {
              StringBuilder sb3 = new StringBuilder();
              sb3.append(sb1);
              sb3.append(sb2);
              current.add(sb3);
            }
          }
          break;
        case '}':
          // End of this nesting level: hand everything collected back to the caller.
          result.addAll(current);
          return result;
        case ',':
          // The current alternative is complete; start a fresh, empty one.
          result.addAll(current);
          current = new ArrayList<>();
          current.add(new StringBuilder());
          break;
        default:
          for (StringBuilder sb : current) {
            sb.append(c);
          }
          break;
      }
    }

    result.addAll(current);
    return result;
  }
}

/**
 * Minimal forward-only cursor over the characters of a string. A single instance is shared
 * by all recursion levels of the splitter so that each level continues where the previous
 * one stopped.
 */
class CharStream
{
  private final String string;
  private int offset;

  public CharStream(String string)
  {
    this.string = string;
    this.offset = 0;
  }

  /** Returns true while at least one unread character remains. */
  public boolean hasMore()
  {
    return offset < string.length();
  }

  /** Returns the next unread character and advances the cursor past it. */
  public char next()
  {
    return string.charAt(offset++);
  }
}
Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +package io.druid.indexer.path; + +import com.google.common.collect.ImmutableList; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + */ +public class HadoopGlobPathSplitterTest +{ + @Test + public void testGlobSplitting() throws Exception { + String path = "/a/b/c"; + List expected = ImmutableList.of( + "/a/b/c" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/c,/d/e"; + expected = ImmutableList.of( + "/a/b/c", + "/d/e" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/*.c,/d/*.e"; + expected = ImmutableList.of( + "/a/b/*.c", + "/d/*.e" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/c,/d/e,/f/g"; + expected = ImmutableList.of( + "/a/b/c", + "/d/e", + "/f/g" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/{c,d}"; + expected = ImmutableList.of( + "/a/b/c", + "/a/b/d" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/{c,d}/e"; + expected = ImmutableList.of( + "/a/b/c/e", + "/a/b/d/e" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "{c,d}"; + expected = ImmutableList.of( + "c", + "d" + ); 
+ Assert.assertEquals(expected, splitGlob(path)); + + path = "{c,d}/e"; + expected = ImmutableList.of( + "c/e", + "d/e" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/{c,d},/a/b/{c,d}/e,{c,d},{c,d}/e"; + expected = ImmutableList.of( + "/a/b/c", + "/a/b/d", + "/a/b/c/e", + "/a/b/d/e", + "c", + "d", + "c/e", + "d/e" + ); + Assert.assertEquals(expected, splitGlob(path)); + + path = "/a/b/{c/{d,e/{f,g},h},i}/{j,k}/l"; + expected = ImmutableList.of( + "/a/b/c/d/j/l", + "/a/b/c/d/k/l", + "/a/b/c/e/f/j/l", + "/a/b/c/e/f/k/l", + "/a/b/c/e/g/j/l", + "/a/b/c/e/g/k/l", + "/a/b/c/h/j/l", + "/a/b/c/h/k/l", + "/a/b/i/j/l", + "/a/b/i/k/l" + ); + Assert.assertEquals(expected, splitGlob(path)); + + + path = ""; + expected = ImmutableList.of(""); + Assert.assertEquals(expected, splitGlob(path)); + + path = "{}"; + expected = ImmutableList.of(""); + Assert.assertEquals(expected, splitGlob(path)); + } + + private static List splitGlob(String path) { + List tmp = HadoopGlobPathSplitter.splitGlob(path); + List result = new ArrayList<>(tmp.size()); + for(StringBuilder sb : tmp) { + result.add(sb.toString()); + } + return result; + } +}