Code cleanup for PDF module (#800)
* PDF to X * PDF to X * Remove created doc * Code fixes and cleanup for PDF module
This commit is contained in:
parent
8628a63d95
commit
0ea29df589
|
@ -23,27 +23,17 @@ public class PDF2HTMLExample {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
|
private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
|
||||||
System.out.println("Creating HTML file from a PDF file: " + filename);
|
|
||||||
PDDocument pdf = null;
|
|
||||||
try {
|
try {
|
||||||
// load the PDF file using PDFBox
|
PDDocument pdf = PDDocument.load(new File(filename));
|
||||||
pdf = PDDocument.load(new File(filename));
|
|
||||||
// create the DOM parser
|
|
||||||
PDFDomTree parser = new PDFDomTree();
|
PDFDomTree parser = new PDFDomTree();
|
||||||
// parse the file and get the DOM Document
|
|
||||||
Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
|
Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
|
||||||
parser.writeText(pdf, output);
|
parser.writeText(pdf, output);
|
||||||
output.close();
|
output.close();
|
||||||
} finally {
|
|
||||||
if (pdf != null) {
|
if (pdf != null) {
|
||||||
try {
|
|
||||||
pdf.close();
|
pdf.close();
|
||||||
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
System.err.println("Error: " + e.getMessage());
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
System.out.println("Done.");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
|
@ -24,15 +24,12 @@ public class PDF2ImageExample {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateImageFromPDF(String filename, String extension) throws IOException {
|
private static void generateImageFromPDF(String filename, String extension) throws IOException {
|
||||||
System.out.println("Creating " + extension + " image from a PDF file: " + filename);
|
|
||||||
PDDocument document = PDDocument.load(new File(filename));
|
PDDocument document = PDDocument.load(new File(filename));
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
||||||
System.out.println("Page number: " + (page + 1) + " is being rendered");
|
|
||||||
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
|
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
|
||||||
ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
|
ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
|
||||||
}
|
}
|
||||||
document.close();
|
document.close();
|
||||||
System.out.println("Done.");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,43 +23,30 @@ public class PDF2TextExample {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateTxtFromPDF(String filename) throws IOException {
|
private static void generateTxtFromPDF(String filename) throws IOException {
|
||||||
System.out.println("Parsing text from PDF file " + filename);
|
try {
|
||||||
String parsedText = null;
|
|
||||||
PDFTextStripper pdfStripper;
|
|
||||||
PDDocument pdDoc = null;
|
|
||||||
COSDocument cosDoc = null;
|
|
||||||
|
|
||||||
File f = new File(filename);
|
File f = new File(filename);
|
||||||
|
String parsedText;
|
||||||
PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
|
PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
|
||||||
|
|
||||||
try {
|
|
||||||
parser.parse();
|
parser.parse();
|
||||||
cosDoc = parser.getDocument();
|
|
||||||
pdfStripper = new PDFTextStripper();
|
COSDocument cosDoc = parser.getDocument();
|
||||||
pdDoc = new PDDocument(cosDoc);
|
|
||||||
|
PDFTextStripper pdfStripper = new PDFTextStripper();
|
||||||
|
PDDocument pdDoc = new PDDocument(cosDoc);
|
||||||
|
|
||||||
parsedText = pdfStripper.getText(pdDoc);
|
parsedText = pdfStripper.getText(pdDoc);
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println("An exception occured in parsing the PDF Document.");
|
|
||||||
e.printStackTrace();
|
|
||||||
try {
|
|
||||||
if (cosDoc != null)
|
if (cosDoc != null)
|
||||||
cosDoc.close();
|
cosDoc.close();
|
||||||
if (pdDoc != null)
|
if (pdDoc != null)
|
||||||
pdDoc.close();
|
pdDoc.close();
|
||||||
} catch (Exception e1) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.out.println("Writing PDF text to output text file");
|
|
||||||
try {
|
|
||||||
PrintWriter pw = new PrintWriter("src/output/pdf.txt");
|
PrintWriter pw = new PrintWriter("src/output/pdf.txt");
|
||||||
pw.print(parsedText);
|
pw.print(parsedText);
|
||||||
pw.close();
|
pw.close();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
System.out.println("An exception occured in writing the pdf text to file.");
|
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
System.out.println("Done.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,36 +26,25 @@ public class PDF2WordExample {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateDocFromPDF(String filename) throws IOException {
|
private static void generateDocFromPDF(String filename) throws IOException {
|
||||||
System.out.println("Creating a docx file from a PDF file: " + filename);
|
|
||||||
// Create the word document
|
|
||||||
XWPFDocument doc = new XWPFDocument();
|
XWPFDocument doc = new XWPFDocument();
|
||||||
|
|
||||||
// Open the pdf file
|
|
||||||
String pdf = filename;
|
String pdf = filename;
|
||||||
PdfReader reader = new PdfReader(pdf);
|
PdfReader reader = new PdfReader(pdf);
|
||||||
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
|
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
|
||||||
|
|
||||||
// Read the PDF page by page
|
|
||||||
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
|
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
|
||||||
TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
|
TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
|
||||||
// Extract the text
|
|
||||||
String text = strategy.getResultantText();
|
String text = strategy.getResultantText();
|
||||||
// Create a new paragraph in the word document, adding the extracted
|
|
||||||
// text
|
|
||||||
XWPFParagraph p = doc.createParagraph();
|
XWPFParagraph p = doc.createParagraph();
|
||||||
XWPFRun run = p.createRun();
|
XWPFRun run = p.createRun();
|
||||||
run.setText(text);
|
run.setText(text);
|
||||||
// Adding a page break
|
|
||||||
run.addBreak(BreakType.PAGE);
|
run.addBreak(BreakType.PAGE);
|
||||||
}
|
}
|
||||||
// Write the word document
|
|
||||||
FileOutputStream out = new FileOutputStream("src/output/pdf.docx");
|
FileOutputStream out = new FileOutputStream("src/output/pdf.docx");
|
||||||
doc.write(out);
|
doc.write(out);
|
||||||
// Close all open files
|
|
||||||
out.close();
|
out.close();
|
||||||
reader.close();
|
reader.close();
|
||||||
doc.close();
|
doc.close();
|
||||||
System.out.println("Done.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue