HADOOP-13192. org.apache.hadoop.util.LineReader cannot handle multibyte delimiters correctly. Contributed by binde.

(cherry picked from commit fc6b50cc57)
(cherry picked from commit 39ea0891d2)
This commit is contained in:
Akira Ajisaka 2016-06-20 17:07:26 +09:00
parent 9d27530c36
commit 615a023be3
2 changed files with 42 additions and 24 deletions

View File

@ -318,7 +318,10 @@ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
break; break;
} }
} else if (delPosn != 0) { } else if (delPosn != 0) {
bufferPosn--; bufferPosn -= delPosn;
if(bufferPosn < -1) {
bufferPosn = -1;
}
delPosn = 0; delPosn = 0;
} }
} }

View File

@ -58,7 +58,7 @@ public void testCustomDelimiter() throws Exception {
* Check Condition * Check Condition
* In the second key value pair, the value should contain * In the second key value pair, the value should contain
* "</" from currentToken and * "</" from currentToken and
* "id>" from next token * "id>" from next token
*/ */
Delimiter="</entity>"; Delimiter="</entity>";
@ -80,20 +80,21 @@ public void testCustomDelimiter() throws Exception {
String TestPartOfInput = CurrentBufferTailToken+NextBufferHeadToken; String TestPartOfInput = CurrentBufferTailToken+NextBufferHeadToken;
int BufferSize=64 * 1024; int BufferSize=64 * 1024;
int numberOfCharToFillTheBuffer=BufferSize-CurrentBufferTailToken.length(); int numberOfCharToFillTheBuffer =
BufferSize - CurrentBufferTailToken.length();
StringBuilder fillerString=new StringBuilder(); StringBuilder fillerString=new StringBuilder();
for (int i=0;i<numberOfCharToFillTheBuffer;i++) { for (int i=0; i<numberOfCharToFillTheBuffer; i++) {
fillerString.append('a'); // char 'a' as a filler for the test string fillerString.append('a'); // char 'a' as a filler for the test string
} }
TestData = fillerString + TestPartOfInput; TestData = fillerString + TestPartOfInput;
lineReader = new LineReader( lineReader = new LineReader(
new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes()); new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
line = new Text(); line = new Text();
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals(fillerString.toString(),line.toString()); Assert.assertEquals(fillerString.toString(), line.toString());
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals(Expected, line.toString()); Assert.assertEquals(Expected, line.toString());
@ -107,35 +108,49 @@ public void testCustomDelimiter() throws Exception {
Delimiter = "record"; Delimiter = "record";
StringBuilder TestStringBuilder = new StringBuilder(); StringBuilder TestStringBuilder = new StringBuilder();
TestStringBuilder.append(Delimiter+"Kerala "); TestStringBuilder.append(Delimiter + "Kerala ");
TestStringBuilder.append(Delimiter+"Bangalore"); TestStringBuilder.append(Delimiter + "Bangalore");
TestStringBuilder.append(Delimiter+" North Korea"); TestStringBuilder.append(Delimiter + " North Korea");
TestStringBuilder.append(Delimiter+Delimiter+ TestStringBuilder.append(Delimiter + Delimiter+
"Guantanamo"); "Guantanamo");
TestStringBuilder.append(Delimiter+"ecord"+"recor"+"core"); //~EOF with 're' TestStringBuilder.append(Delimiter + "ecord"
+ "recor" + "core"); //~EOF with 're'
TestData=TestStringBuilder.toString(); TestData=TestStringBuilder.toString();
lineReader = new LineReader( lineReader = new LineReader(
new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes()); new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
lineReader.readLine(line);
Assert.assertEquals("", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Kerala ", line.toString());
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals("",line.toString()); Assert.assertEquals("Bangalore", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Kerala ",line.toString());
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals("Bangalore",line.toString()); Assert.assertEquals(" North Korea", line.toString());
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals(" North Korea",line.toString()); Assert.assertEquals("", line.toString());
lineReader.readLine(line);
Assert.assertEquals("Guantanamo", line.toString());
lineReader.readLine(line); lineReader.readLine(line);
Assert.assertEquals("",line.toString()); Assert.assertEquals(("ecord"+"recor"+"core"), line.toString());
lineReader.readLine(line);
Assert.assertEquals("Guantanamo",line.toString()); // Test 3
// The test scenario is such that,
lineReader.readLine(line); // aaaabccc split by aaab
Assert.assertEquals(("ecord"+"recor"+"core"),line.toString()); TestData = "aaaabccc";
Delimiter = "aaab";
lineReader = new LineReader(
new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
lineReader.readLine(line);
Assert.assertEquals("a", line.toString());
lineReader.readLine(line);
Assert.assertEquals("ccc", line.toString());
} }
} }