mirror of https://github.com/apache/poi.git
Friendly wrapper on HWPF for extracting text from Word Documents
git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5bacacee85
commit
5c8d2a20d1
Binary file not shown.
|
@ -0,0 +1,87 @@
|
|||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Test the different routes to extracting text
|
||||
*
|
||||
* @author Nick Burch (nick at torchbox dot com)
|
||||
*/
|
||||
public class TestDifferentRoutes extends TestCase {
|
||||
private String[] p_text = new String[] {
|
||||
"This is a simple word document\r",
|
||||
"\r",
|
||||
"It has a number of paragraphs in it\r",
|
||||
"\r",
|
||||
"Some of them even feature bold, italic and underlined text\r",
|
||||
"\r",
|
||||
"\r",
|
||||
"This bit is in a different font and size\r",
|
||||
"\r",
|
||||
"\r",
|
||||
"This bit features some red text.\r",
|
||||
"\r",
|
||||
"\r",
|
||||
"It is otherwise very very boring.\r"
|
||||
};
|
||||
|
||||
private HWPFDocument doc;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
|
||||
String filename = dirname + "/test2.doc";
|
||||
doc = new HWPFDocument(new FileInputStream(filename));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test model based extraction
|
||||
*/
|
||||
public void testExtractFromModel() {
|
||||
Range r = doc.getRange();
|
||||
|
||||
String[] text = new String[r.numParagraphs()];
|
||||
for(int i=0; i < r.numParagraphs(); i++) {
|
||||
Paragraph p = r.getParagraph(i);
|
||||
text[i] = p.text();
|
||||
}
|
||||
|
||||
assertEquals(p_text.length, text.length);
|
||||
for(int i=0; i<p_text.length; i++) {
|
||||
assertEquals(p_text[i], text[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test textPieces based extraction
|
||||
*/
|
||||
public void testExtractFromTextPieces() throws Exception {
|
||||
StringBuffer textBuf = new StringBuffer();
|
||||
|
||||
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
||||
while (textPieces.hasNext()) {
|
||||
TextPiece piece = (TextPiece) textPieces.next();
|
||||
|
||||
String encoding = "Cp1252";
|
||||
if (piece.usesUnicode()) {
|
||||
encoding = "UTF-16LE";
|
||||
}
|
||||
String text = new String(piece.getRawBytes(), encoding);
|
||||
textBuf.append(text);
|
||||
}
|
||||
|
||||
StringBuffer exp = new StringBuffer();
|
||||
for(int i=0; i<p_text.length; i++) {
|
||||
exp.append(p_text[i]);
|
||||
}
|
||||
assertEquals(exp.toString(), textBuf.toString());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for WordExtractor, the friendly wrapper on HWPF for extracting text from Word documents
|
||||
*
|
||||
* @author Nick Burch (nick at torchbox dot com)
|
||||
*/
|
||||
public class TestWordExtractor extends TestCase {
|
||||
private String[] p_text1 = new String[] {
|
||||
"This is a simple word document\r\n",
|
||||
"\r\n",
|
||||
"It has a number of paragraphs in it\r\n",
|
||||
"\r\n",
|
||||
"Some of them even feature bold, italic and underlined text\r\n",
|
||||
"\r\n",
|
||||
"\r\n",
|
||||
"This bit is in a different font and size\r\n",
|
||||
"\r\n",
|
||||
"\r\n",
|
||||
"This bit features some red text.\r\n",
|
||||
"\r\n",
|
||||
"\r\n",
|
||||
"It is otherwise very very boring.\r\n"
|
||||
};
|
||||
private String p_text1_block = new String();
|
||||
|
||||
// Well behaved document
|
||||
private WordExtractor extractor;
|
||||
// Corrupted document - can't do paragraph based stuff
|
||||
private WordExtractor extractor2;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
|
||||
String filename = dirname + "/test2.doc";
|
||||
String filename2 = dirname + "/test.doc";
|
||||
extractor = new WordExtractor(new FileInputStream(filename));
|
||||
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
||||
|
||||
// Build splat'd out text version
|
||||
for(int i=0; i<p_text1.length; i++) {
|
||||
p_text1_block += p_text1[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test paragraph based extraction
|
||||
*/
|
||||
public void testExtractFromParagraphs() {
|
||||
String[] text = extractor.getParagraphText();
|
||||
|
||||
assertEquals(p_text1.length, text.length);
|
||||
for(int i=0; i<p_text1.length; i++) {
|
||||
assertEquals(p_text1[i], text[i]);
|
||||
}
|
||||
|
||||
// On second one, should fall back
|
||||
assertEquals(1, extractor2.getParagraphText().length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the paragraph -> flat extraction
|
||||
*/
|
||||
public void testGetText() {
|
||||
assertEquals(p_text1_block, extractor.getText());
|
||||
|
||||
// On second one, should fall back to text piece
|
||||
assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test textPieces based extraction
|
||||
*/
|
||||
public void testExtractFromTextPieces() throws Exception {
|
||||
String text = extractor.getTextFromPieces();
|
||||
assertEquals(p_text1_block, text);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue