From 9c87911c4ab35faead3e4729951b2855fb20e3b0 Mon Sep 17 00:00:00 2001 From: Robert Joseph Evans Date: Wed, 11 Jul 2012 15:44:43 +0000 Subject: [PATCH] HADOOP-8521. Port StreamInputFormat to new Map Reduce API (madhukara phatak via bobby) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1360238 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 3 + .../streaming/StreamBaseRecordReader.java | 2 +- .../apache/hadoop/streaming/StreamUtil.java | 9 +- .../streaming/StreamXmlRecordReader.java | 18 +- .../mapreduce/StreamBaseRecordReader.java | 153 ++++++++ .../mapreduce/StreamInputFormat.java | 94 +++++ .../mapreduce/StreamXmlRecordReader.java | 340 ++++++++++++++++++ .../mapreduce/TestStreamXmlRecordReader.java | 144 ++++++++ 8 files changed, 751 insertions(+), 12 deletions(-) create mode 100644 hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamBaseRecordReader.java create mode 100644 hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java create mode 100644 hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamXmlRecordReader.java create mode 100644 hadoop-tools/hadoop-streaming/src/test/java/org/apache/hadoop/streaming/mapreduce/TestStreamXmlRecordReader.java diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index c5ce17b3882..4b75d6ad8bd 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -175,6 +175,9 @@ Trunk (unreleased changes) HADOOP-8584. test-patch.sh should not immediately exit when no tests are added or modified. (Colin Patrick McCabe via eli) + HADOOP-8521. Port StreamInputFormat to new Map Reduce API (madhukara + phatak via bobby) + OPTIMIZATIONS HADOOP-7761. Improve the performance of raw comparisons. (todd) diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamBaseRecordReader.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamBaseRecordReader.java index dbfa75a855f..85e5ab3e7bd 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamBaseRecordReader.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamBaseRecordReader.java @@ -126,7 +126,7 @@ String getStatus(CharSequence record) { } String unqualSplit = split_.getPath().getName() + ":" + split_.getStart() + "+" + split_.getLength(); - String status = "HSTR " + StreamUtil.HOST + " " + numRec_ + ". pos=" + pos + " " + unqualSplit + String status = "HSTR " + StreamUtil.getHost() + " " + numRec_ + ". 
pos=" + pos + " " + unqualSplit + " Processing record=" + recStr; status += " " + splitName_; return status; diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamUtil.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamUtil.java index 7e8d0a2ac44..8dd987e870c 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamUtil.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamUtil.java @@ -168,12 +168,17 @@ static String slurpHadoop(Path p, FileSystem fs) throws IOException { } static private Environment env; - static String HOST; + private static String host; + public static String getHost(){ + return host; + } + + static { try { env = new Environment(); - HOST = env.getHost(); + host = env.getHost(); } catch (IOException io) { io.printStackTrace(); } diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamXmlRecordReader.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamXmlRecordReader.java index b5327dacf42..7438cb8191a 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamXmlRecordReader.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/StreamXmlRecordReader.java @@ -64,7 +64,7 @@ public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter rep init(); } - public void init() throws IOException { + public final void init() throws IOException { LOG.info("StreamBaseRecordReader.init: " + " start_=" + start_ + " end_=" + end_ + " length_=" + length_ + " start_ > in_.getPos() =" + (start_ > in_.getPos()) + " " + start_ + " > " + in_.getPos()); @@ -185,14 +185,14 @@ private boolean slowReadUntilMatch(Pattern markPattern, boolean includePat, } // states - final static int CDATA_IN = 10; - final static int CDATA_OUT = 11; - final static int CDATA_UNK = 12; - final static int RECORD_ACCEPT = 13; + private final static int CDATA_IN = 10; + private final static int CDATA_OUT = 11; + private final static int CDATA_UNK = 12; + private final static int RECORD_ACCEPT = 13; // inputs - final static int CDATA_BEGIN = 20; - final static int CDATA_END = 21; - final static int RECORD_MAYBE = 22; + private final static int CDATA_BEGIN = 20; + private final static int CDATA_END = 21; + private final static int RECORD_MAYBE = 22; /* also updates firstMatchStart_;*/ int nextState(int state, int input, int bufPos) { @@ -293,7 +293,7 @@ String checkJobGet(String prop) throws IOException { BufferedInputStream bin_; // Wrap FSDataInputStream for efficient backward seeks long pos_; // Keep track on position with respect encapsulated FSDataInputStream - final static int NA = -1; + private final static int NA = -1; int firstMatchStart_ = 0; // candidate record boundary. Might just be CDATA. int firstMatchEnd_ = 0; diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamBaseRecordReader.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamBaseRecordReader.java new file mode 100644 index 00000000000..d71c20d23a1 --- /dev/null +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamBaseRecordReader.java @@ -0,0 +1,153 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.streaming.mapreduce; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.streaming.StreamUtil; + +/** + * Shared functionality for hadoopStreaming formats. A custom reader can be + * defined to be a RecordReader with the constructor below and is selected with + * the option bin/hadoopStreaming -inputreader ... + * + * @see StreamXmlRecordReader + */ +public abstract class StreamBaseRecordReader extends RecordReader { + + protected static final Log LOG = LogFactory + .getLog(StreamBaseRecordReader.class.getName()); + + // custom JobConf properties for this class are prefixed with this namespace + final static String CONF_NS = "stream.recordreader."; + + public StreamBaseRecordReader(FSDataInputStream in, FileSplit split, + TaskAttemptContext context, Configuration conf, FileSystem fs) + throws IOException { + in_ = in; + split_ = split; + start_ = split_.getStart(); + length_ = split_.getLength(); + end_ = start_ + length_; + splitName_ = split_.getPath().getName(); + this.context_ = context; + conf_ = conf; + fs_ = fs; + + statusMaxRecordChars_ = conf.getInt(CONF_NS + "statuschars", 200); + } + + // / RecordReader API + + /** + * Read a record. Implementation should call numRecStats at the end + */ + public abstract boolean next(Text key, Text value) throws IOException; + + /** Returns the current position in the input. */ + public synchronized long getPos() throws IOException { + return in_.getPos(); + } + + /** Close this to future operations. */ + public synchronized void close() throws IOException { + in_.close(); + } + + public float getProgress() throws IOException { + if (end_ == start_) { + return 1.0f; + } else { + return ((float) (in_.getPos() - start_)) / ((float) (end_ - start_)); + } + } + + public Text createKey() { + return new Text(); + } + + public Text createValue() { + return new Text(); + } + + // / StreamBaseRecordReader API + + /** + * Implementation should seek forward in_ to the first byte of the next + * record. The initial byte offset in the stream is arbitrary. 
+ */ + public abstract void seekNextRecordBoundary() throws IOException; + + void numRecStats(byte[] record, int start, int len) throws IOException { + numRec_++; + if (numRec_ == nextStatusRec_) { + String recordStr = new String(record, start, Math.min(len, + statusMaxRecordChars_), "UTF-8"); + nextStatusRec_ += 100;// *= 10; + String status = getStatus(recordStr); + LOG.info(status); + context_.setStatus(status); + } + } + + long lastMem = 0; + + String getStatus(CharSequence record) { + long pos = -1; + try { + pos = getPos(); + } catch (IOException io) { + } + String recStr; + if (record.length() > statusMaxRecordChars_) { + recStr = record.subSequence(0, statusMaxRecordChars_) + "..."; + } else { + recStr = record.toString(); + } + String unqualSplit = split_.getPath().getName() + ":" + split_.getStart() + + "+" + split_.getLength(); + String status = "HSTR " + StreamUtil.getHost() + " " + numRec_ + ". pos=" + + pos + " " + unqualSplit + " Processing record=" + recStr; + status += " " + splitName_; + return status; + } + + FSDataInputStream in_; + FileSplit split_; + long start_; + long end_; + long length_; + String splitName_; + TaskAttemptContext context_; + Configuration conf_; + FileSystem fs_; + int numRec_ = 0; + int nextStatusRec_ = 1; + int statusMaxRecordChars_; + +} diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java new file mode 100644 index 00000000000..a77c13762ca --- /dev/null +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.streaming.mapreduce; + +import java.io.IOException; +import java.lang.reflect.Constructor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.streaming.StreamUtil; + +/** + * An input format that selects a RecordReader based on a JobConf property. This + * should be used only for non-standard record reader such as + * StreamXmlRecordReader. For all other standard record readers, the appropriate + * input format classes should be used. 
+ */ +public class StreamInputFormat extends KeyValueTextInputFormat { + + @Override + public RecordReader createRecordReader(InputSplit genericSplit, + TaskAttemptContext context) throws IOException { + + Configuration conf = context.getConfiguration(); + + String c = conf.get("stream.recordreader.class"); + if (c == null || c.indexOf("LineRecordReader") >= 0) { + return super.createRecordReader(genericSplit, context); + } + + // handling non-standard record reader (likely StreamXmlRecordReader) + FileSplit split = (FileSplit) genericSplit; + // LOG.info("getRecordReader start.....split=" + split); + context.setStatus(split.toString()); + context.progress(); + + // Open the file and seek to the start of the split + FileSystem fs = split.getPath().getFileSystem(conf); + FSDataInputStream in = fs.open(split.getPath()); + + // Factory dispatch based on available params.. + Class readerClass; + + { + readerClass = StreamUtil.goodClassOrNull(conf, c, null); + if (readerClass == null) { + throw new RuntimeException("Class not found: " + c); + } + } + Constructor ctor; + + try { + ctor = readerClass.getConstructor(new Class[] { FSDataInputStream.class, + FileSplit.class, TaskAttemptContext.class, Configuration.class, + FileSystem.class }); + } catch (NoSuchMethodException nsm) { + throw new RuntimeException(nsm); + } + + RecordReader reader; + try { + reader = (RecordReader) ctor.newInstance(new Object[] { in, + split, context, conf, fs }); + } catch (Exception nsm) { + throw new RuntimeException(nsm); + } + return reader; + + } + +} diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamXmlRecordReader.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamXmlRecordReader.java new file mode 100644 index 00000000000..c7ee847763f --- /dev/null +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamXmlRecordReader.java @@ -0,0 +1,340 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.streaming.mapreduce; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.streaming.StreamUtil; + +/** + * A way to interpret XML fragments as Mapper input records. Values are XML + * subtrees delimited by configurable tags. 
Keys could be the value of a certain + * attribute in the XML subtree, but this is left to the stream processor + * application. + * + * The name-value properties that StreamXmlRecordReader understands are: String + * begin (chars marking beginning of record) String end (chars marking end of + * record) int maxrec (maximum record size) int lookahead(maximum lookahead to + * sync CDATA) boolean slowmatch + */ +public class StreamXmlRecordReader extends StreamBaseRecordReader { + + private Text key; + private Text value; + + public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, + TaskAttemptContext context, Configuration conf, FileSystem fs) + throws IOException { + super(in, split, context, conf, fs); + + beginMark_ = checkJobGet(CONF_NS + "begin"); + endMark_ = checkJobGet(CONF_NS + "end"); + + maxRecSize_ = conf_.getInt(CONF_NS + "maxrec", 50 * 1000); + lookAhead_ = conf_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_); + synched_ = false; + + slowMatch_ = conf_.getBoolean(CONF_NS + "slowmatch", false); + if (slowMatch_) { + beginPat_ = makePatternCDataOrMark(beginMark_); + endPat_ = makePatternCDataOrMark(endMark_); + } + init(); + } + + public final void init() throws IOException { + LOG.info("StreamBaseRecordReader.init: " + " start_=" + start_ + " end_=" + + end_ + " length_=" + length_ + " start_ > in_.getPos() =" + + (start_ > in_.getPos()) + " " + start_ + " > " + in_.getPos()); + if (start_ > in_.getPos()) { + in_.seek(start_); + } + pos_ = start_; + bin_ = new BufferedInputStream(in_); + seekNextRecordBoundary(); + } + + int numNext = 0; + + public synchronized boolean next(Text key, Text value) throws IOException { + numNext++; + if (pos_ >= end_) { + return false; + } + + DataOutputBuffer buf = new DataOutputBuffer(); + if (!readUntilMatchBegin()) { + return false; + } + if (pos_ >= end_ || !readUntilMatchEnd(buf)) { + return false; + } + + // There is only one elem..key/value splitting is not done here. + byte[] record = new byte[buf.getLength()]; + System.arraycopy(buf.getData(), 0, record, 0, record.length); + + numRecStats(record, 0, record.length); + + key.set(record); + value.set(""); + + return true; + } + + public void seekNextRecordBoundary() throws IOException { + readUntilMatchBegin(); + } + + boolean readUntilMatchBegin() throws IOException { + if (slowMatch_) { + return slowReadUntilMatch(beginPat_, false, null); + } else { + return fastReadUntilMatch(beginMark_, false, null); + } + } + + private boolean readUntilMatchEnd(DataOutputBuffer buf) throws IOException { + if (slowMatch_) { + return slowReadUntilMatch(endPat_, true, buf); + } else { + return fastReadUntilMatch(endMark_, true, buf); + } + } + + private boolean slowReadUntilMatch(Pattern markPattern, boolean includePat, + DataOutputBuffer outBufOrNull) throws IOException { + byte[] buf = new byte[Math.max(lookAhead_, maxRecSize_)]; + int read = 0; + bin_.mark(Math.max(lookAhead_, maxRecSize_) + 2); // mark to invalidate if + // we read more + read = bin_.read(buf); + if (read == -1) + return false; + + String sbuf = new String(buf, 0, read, "UTF-8"); + Matcher match = markPattern.matcher(sbuf); + + firstMatchStart_ = NA; + firstMatchEnd_ = NA; + int bufPos = 0; + int state = synched_ ? 
CDATA_OUT : CDATA_UNK; + int s = 0; + + while (match.find(bufPos)) { + int input; + if (match.group(1) != null) { + input = CDATA_BEGIN; + } else if (match.group(2) != null) { + input = CDATA_END; + firstMatchStart_ = NA; // | ]]> should keep it + } else { + input = RECORD_MAYBE; + } + if (input == RECORD_MAYBE) { + if (firstMatchStart_ == NA) { + firstMatchStart_ = match.start(); + firstMatchEnd_ = match.end(); + } + } + state = nextState(state, input, match.start()); + if (state == RECORD_ACCEPT) { + break; + } + bufPos = match.end(); + s++; + } + if (state != CDATA_UNK) { + synched_ = true; + } + boolean matched = (firstMatchStart_ != NA) + && (state == RECORD_ACCEPT || state == CDATA_UNK); + if (matched) { + int endPos = includePat ? firstMatchEnd_ : firstMatchStart_; + bin_.reset(); + + for (long skiplen = endPos; skiplen > 0;) { + skiplen -= bin_.skip(skiplen); // Skip succeeds as we have read this + // buffer + } + + pos_ += endPos; + if (outBufOrNull != null) { + outBufOrNull.writeBytes(sbuf.substring(0, endPos)); + } + } + return matched; + } + + // states + private final static int CDATA_IN = 10; + private final static int CDATA_OUT = 11; + private final static int CDATA_UNK = 12; + private final static int RECORD_ACCEPT = 13; + // inputs + private final static int CDATA_BEGIN = 20; + private final static int CDATA_END = 21; + private final static int RECORD_MAYBE = 22; + + /* also updates firstMatchStart_; */ + int nextState(int state, int input, int bufPos) { + switch (state) { + case CDATA_UNK: + case CDATA_OUT: + switch (input) { + case CDATA_BEGIN: + return CDATA_IN; + case CDATA_END: + if (state == CDATA_OUT) { + // System.out.println("buggy XML " + bufPos); + } + return CDATA_OUT; + case RECORD_MAYBE: + return (state == CDATA_UNK) ? CDATA_UNK : RECORD_ACCEPT; + } + break; + case CDATA_IN: + return (input == CDATA_END) ? CDATA_OUT : CDATA_IN; + } + throw new IllegalStateException(state + " " + input + " " + bufPos + " " + + splitName_); + } + + Pattern makePatternCDataOrMark(String escapedMark) { + StringBuffer pat = new StringBuffer(); + addGroup(pat, StreamUtil.regexpEscape("CDATA[")); // CDATA_BEGIN + addGroup(pat, StreamUtil.regexpEscape("]]>")); // CDATA_END + addGroup(pat, escapedMark); // RECORD_MAYBE + return Pattern.compile(pat.toString()); + } + + void addGroup(StringBuffer pat, String escapedGroup) { + if (pat.length() > 0) { + pat.append("|"); + } + pat.append("("); + pat.append(escapedGroup); + pat.append(")"); + } + + boolean fastReadUntilMatch(String textPat, boolean includePat, + DataOutputBuffer outBufOrNull) throws IOException { + byte[] cpat = textPat.getBytes("UTF-8"); + int m = 0; + boolean match = false; + int msup = cpat.length; + int LL = 120000 * 10; + + bin_.mark(LL); // large number to invalidate mark + while (true) { + int b = bin_.read(); + if (b == -1) + break; + + byte c = (byte) b; // this assumes eight-bit matching. 
OK with UTF-8 + if (c == cpat[m]) { + m++; + if (m == msup) { + match = true; + break; + } + } else { + bin_.mark(LL); // rest mark so we could jump back if we found a match + if (outBufOrNull != null) { + outBufOrNull.write(cpat, 0, m); + outBufOrNull.write(c); + } + pos_ += m + 1; // skip m chars, +1 for 'c' + m = 0; + } + } + if (!includePat && match) { + bin_.reset(); + } else if (outBufOrNull != null) { + outBufOrNull.write(cpat); + pos_ += msup; + } + return match; + } + + String checkJobGet(String prop) throws IOException { + String val = conf_.get(prop); + if (val == null) { + throw new IOException("JobConf: missing required property: " + prop); + } + return val; + } + + String beginMark_; + String endMark_; + + Pattern beginPat_; + Pattern endPat_; + + boolean slowMatch_; + int lookAhead_; // bytes to read to try to synch CDATA/non-CDATA. Should be + // more than max record size + int maxRecSize_; + + BufferedInputStream bin_; // Wrap FSDataInputStream for efficient backward + // seeks + long pos_; // Keep track on position with respect encapsulated + // FSDataInputStream + + private final static int NA = -1; + int firstMatchStart_ = 0; // candidate record boundary. Might just be CDATA. + int firstMatchEnd_ = 0; + + boolean synched_; + + @Override + public Text getCurrentKey() throws IOException, InterruptedException { + return key; + } + + @Override + public Text getCurrentValue() throws IOException, InterruptedException { + return value; + } + + @Override + public void initialize(InputSplit arg0, TaskAttemptContext arg1) + throws IOException, InterruptedException { + + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + key = createKey(); + value = createValue(); + return next(key, value); + } + +} diff --git a/hadoop-tools/hadoop-streaming/src/test/java/org/apache/hadoop/streaming/mapreduce/TestStreamXmlRecordReader.java b/hadoop-tools/hadoop-streaming/src/test/java/org/apache/hadoop/streaming/mapreduce/TestStreamXmlRecordReader.java new file mode 100644 index 00000000000..50f38bd20c2 --- /dev/null +++ b/hadoop-tools/hadoop-streaming/src/test/java/org/apache/hadoop/streaming/mapreduce/TestStreamXmlRecordReader.java @@ -0,0 +1,144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.streaming.mapreduce; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * This class tests StreamXmlRecordReader The test creates an XML file, uses + * StreamXmlRecordReader and compares the expected output against the generated + * output + */ +public class TestStreamXmlRecordReader { + + private File INPUT_FILE; + private String input; + private String outputExpect; + Path OUTPUT_DIR; + FileSystem fs; + + public TestStreamXmlRecordReader() throws IOException { + INPUT_FILE = new File("target/input.xml"); + input = "\t\nroses.are.red\t\nviolets.are.blue\t\n" + + "bunnies.are.pink\t\n\t\n"; + outputExpect = input; + } + + protected void assertOutput(String expectedOutput, String output) + throws IOException { + String[] words = expectedOutput.split("\t\n"); + Set expectedWords = new HashSet(Arrays.asList(words)); + words = output.split("\t\n"); + Set returnedWords = new HashSet(Arrays.asList(words)); + assertTrue(returnedWords.containsAll(expectedWords)); + } + + protected void checkOutput() throws IOException { + File outFile = new File(OUTPUT_DIR.toString()); + Path outPath = new Path(outFile.getAbsolutePath(), "part-r-00000"); + String output = slurpHadoop(outPath, fs); + fs.delete(outPath, true); + outputExpect = "\n" + outputExpect + ""; + System.err.println("outEx1=" + outputExpect); + System.err.println(" out1=" + output); + assertOutput(outputExpect, output); + } + + private String slurpHadoop(Path p, FileSystem fs) throws IOException { + int len = (int) fs.getFileStatus(p).getLen(); + byte[] buf = new byte[len]; + FSDataInputStream in = fs.open(p); + String contents = null; + try { + in.readFully(in.getPos(), buf); + contents = new String(buf, "UTF-8"); + } finally { + in.close(); + } + return contents; + } + + @Before + public void createInput() throws IOException { + FileOutputStream out = new FileOutputStream(INPUT_FILE.getAbsoluteFile()); + String dummyXmlStartTag = "\n"; + String dummyXmlEndTag = "\n"; + out.write(dummyXmlStartTag.getBytes("UTF-8")); + out.write(input.getBytes("UTF-8")); + out.write(dummyXmlEndTag.getBytes("UTF-8")); + out.close(); + } + + @Test + public void testStreamXmlRecordReader() throws Exception { + + Job job = new Job(); + Configuration conf = job.getConfiguration(); + job.setJarByClass(TestStreamXmlRecordReader.class); + job.setMapperClass(Mapper.class); + conf.set("stream.recordreader.class", + "org.apache.hadoop.streaming.mapreduce.StreamXmlRecordReader"); + conf.set("stream.recordreader.begin", ""); + conf.set("stream.recordreader.end", ""); + job.setInputFormatClass(StreamInputFormat.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(Text.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + FileInputFormat.addInputPath(job, new Path("target/input.xml")); + 
+    OUTPUT_DIR = new Path("target/output");
+    fs = FileSystem.get(conf);
+    if (fs.exists(OUTPUT_DIR)) {
+      fs.delete(OUTPUT_DIR, true);
+    }
+    FileOutputFormat.setOutputPath(job, OUTPUT_DIR);
+    boolean ret = job.waitForCompletion(true);
+
+    assertEquals(true, ret);
+    checkOutput();
+
+  }
+
+  @After
+  public void tearDown() throws IOException {
+    fs.delete(OUTPUT_DIR, true);
+  }
+
+}
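
For reference, a minimal driver sketch showing how the classes added by this patch could be wired into a new-API job. It mirrors TestStreamXmlRecordReader above and uses only properties the patch itself reads (stream.recordreader.class/begin/end/maxrec/slowmatch); the StreamXmlDriver class name, the <record>/</record> delimiters and the argument-based paths are illustrative placeholders, not part of the patch.

// Illustrative driver only -- not part of this patch. It mirrors the test above;
// the class name, the <record> delimiters and the input/output paths are placeholders.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.streaming.mapreduce.StreamInputFormat;

public class StreamXmlDriver {
  public static void main(String[] args) throws Exception {
    Job job = new Job();                         // new-API job, as in the test above
    Configuration conf = job.getConfiguration();

    // Select the non-standard reader. StreamInputFormat falls back to
    // KeyValueTextInputFormat's reader when this property is unset or
    // names a LineRecordReader; any other class is instantiated reflectively.
    conf.set("stream.recordreader.class",
        "org.apache.hadoop.streaming.mapreduce.StreamXmlRecordReader");

    // Record delimiters understood by StreamXmlRecordReader (placeholder tags).
    conf.set("stream.recordreader.begin", "<record>");
    conf.set("stream.recordreader.end", "</record>");

    // Optional tuning knobs read by the reader, shown with the patch's defaults.
    conf.setInt("stream.recordreader.maxrec", 50 * 1000);
    conf.setBoolean("stream.recordreader.slowmatch", false);

    job.setJarByClass(StreamXmlDriver.class);
    job.setInputFormatClass(StreamInputFormat.class);
    job.setMapperClass(Mapper.class);            // identity mapper, as in the test
    job.setMapOutputKeyClass(Text.class);        // each key is one XML record; value is empty
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}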