mirror of https://github.com/apache/poi.git
Bug 47304: use fixed encoding when extracting text in WordDocument
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668367 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4c8d98eb0b
commit
0850e7d846
|
@ -18,12 +18,21 @@ package org.apache.poi.stress;
|
||||||
|
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
|
||||||
|
import org.apache.poi.hdf.extractor.WordDocument;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
public class HWPFFileHandler extends POIFSFileHandler {
|
public class HWPFFileHandler extends POIFSFileHandler {
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
|
@ -33,16 +42,53 @@ public class HWPFFileHandler extends POIFSFileHandler {
|
||||||
assertNotNull(doc.getEndnotes());
|
assertNotNull(doc.getEndnotes());
|
||||||
|
|
||||||
handlePOIDocument(doc);
|
handlePOIDocument(doc);
|
||||||
|
|
||||||
|
// fails for many documents, but is deprecated anyway...
|
||||||
|
// handleWordDocument(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void handleWordDocument(HWPFDocument doc) throws IOException {
|
||||||
|
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||||
|
doc.write(outStream);
|
||||||
|
|
||||||
|
WordDocument wordDoc = new WordDocument(new ByteArrayInputStream(outStream.toByteArray()));
|
||||||
|
|
||||||
|
StringWriter docTextWriter = new StringWriter();
|
||||||
|
PrintWriter out = new PrintWriter(docTextWriter);
|
||||||
|
try {
|
||||||
|
wordDoc.writeAllText(out);
|
||||||
|
} finally {
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
docTextWriter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// a test-case to test this locally without executing the full TestAllFiles
|
// a test-case to test this locally without executing the full TestAllFiles
|
||||||
@Test
|
@Test
|
||||||
public void test() throws Exception {
|
public void test() throws Exception {
|
||||||
InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc");
|
File file = new File("test-data/document/47304.doc");
|
||||||
|
|
||||||
|
InputStream stream = new FileInputStream(file);
|
||||||
try {
|
try {
|
||||||
handleFile(stream);
|
handleFile(stream);
|
||||||
} finally {
|
} finally {
|
||||||
stream.close();
|
stream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
handleExtracting(file);
|
||||||
|
|
||||||
|
stream = new FileInputStream(file);
|
||||||
|
try {
|
||||||
|
WordExtractor extractor = new WordExtractor(stream);
|
||||||
|
try {
|
||||||
|
assertNotNull(extractor.getText());
|
||||||
|
} finally {
|
||||||
|
extractor.close();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -177,7 +177,7 @@ public final class WordDocument {
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
String sText = new String(_header, start, end-start);
|
String sText = new String(_header, start, end-start, "windows-1252");
|
||||||
out.write(sText);
|
out.write(sText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,15 @@
|
||||||
|
|
||||||
package org.apache.poi.hdf.extractor;
|
package org.apache.poi.hdf.extractor;
|
||||||
|
|
||||||
|
import static org.junit.Assert.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||||
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,4 +40,31 @@ public class TestWordDocument {
|
||||||
//WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
|
//WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
|
||||||
WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
|
WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
|
@Test
|
||||||
|
public void test47304() throws IOException {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc");
|
||||||
|
assertNotNull(doc);
|
||||||
|
|
||||||
|
WordExtractor extractor = new WordExtractor(doc);
|
||||||
|
String text = extractor.getText();
|
||||||
|
//System.out.println(text);
|
||||||
|
assertTrue("Had: " + text, text.contains("Just a \u201Ctest\u201D"));
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
|
WordDocument wordDoc = new WordDocument("test-data/document/47304.doc");
|
||||||
|
|
||||||
|
StringWriter docTextWriter = new StringWriter();
|
||||||
|
PrintWriter out = new PrintWriter(docTextWriter);
|
||||||
|
try {
|
||||||
|
wordDoc.writeAllText(out);
|
||||||
|
} finally {
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
docTextWriter.close();
|
||||||
|
|
||||||
|
//System.out.println(docTextWriter.toString());
|
||||||
|
assertTrue("Had: " + docTextWriter.toString(), docTextWriter.toString().contains("Just a \u201Ctest\u201D"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue