Code cleanup for PDF module (#800)

* PDF to X

* PDF to X

* Remove created doc

* Code fixes and cleanup for PDF module
This commit is contained in:
maibin 2016-11-03 12:04:43 +01:00 committed by Grzegorz Piwowarek
parent 8628a63d95
commit 0ea29df589
4 changed files with 19 additions and 56 deletions

View File

@@ -23,27 +23,17 @@ public class PDF2HTMLExample {
     }
     private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
-        System.out.println("Creating HTML file from a PDF file: " + filename);
-        PDDocument pdf = null;
         try {
-            // load the PDF file using PDFBox
+            PDDocument pdf = PDDocument.load(new File(filename));
-            pdf = PDDocument.load(new File(filename));
-            // create the DOM parser
             PDFDomTree parser = new PDFDomTree();
-            // parse the file and get the DOM Document
             Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
             parser.writeText(pdf, output);
             output.close();
-        } finally {
             if (pdf != null) {
-                try {
                 pdf.close();
+            }
         } catch (IOException e) {
-            System.err.println("Error: " + e.getMessage());
+            e.printStackTrace();
         }
     }
 }
-        System.out.println("Done.");
-    }
-}

View File

@@ -24,15 +24,12 @@ public class PDF2ImageExample {
     }
     private static void generateImageFromPDF(String filename, String extension) throws IOException {
-        System.out.println("Creating " + extension + " image from a PDF file: " + filename);
         PDDocument document = PDDocument.load(new File(filename));
         PDFRenderer pdfRenderer = new PDFRenderer(document);
         for (int page = 0; page < document.getNumberOfPages(); ++page) {
-            System.out.println("Page number: " + (page + 1) + " is being rendered");
             BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
             ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
         }
         document.close();
-        System.out.println("Done.");
     }
 }

View File

@@ -23,43 +23,30 @@ public class PDF2TextExample {
     }
     private static void generateTxtFromPDF(String filename) throws IOException {
-        System.out.println("Parsing text from PDF file " + filename);
+        try {
-        String parsedText = null;
-        PDFTextStripper pdfStripper;
-        PDDocument pdDoc = null;
-        COSDocument cosDoc = null;
             File f = new File(filename);
+            String parsedText;
             PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
-        try {
             parser.parse();
-            cosDoc = parser.getDocument();
-            pdfStripper = new PDFTextStripper();
+            COSDocument cosDoc = parser.getDocument();
-            pdDoc = new PDDocument(cosDoc);
+            PDFTextStripper pdfStripper = new PDFTextStripper();
+            PDDocument pdDoc = new PDDocument(cosDoc);
             parsedText = pdfStripper.getText(pdDoc);
-        } catch (Exception e) {
-            System.err.println("An exception occured in parsing the PDF Document.");
-            e.printStackTrace();
-            try {
             if (cosDoc != null)
                 cosDoc.close();
             if (pdDoc != null)
                 pdDoc.close();
-            } catch (Exception e1) {
-                e.printStackTrace();
-            }
-        }
-        System.out.println("Writing PDF text to output text file");
-        try {
             PrintWriter pw = new PrintWriter("src/output/pdf.txt");
             pw.print(parsedText);
             pw.close();
         } catch (Exception e) {
-            System.out.println("An exception occured in writing the pdf text to file.");
             e.printStackTrace();
         }
-        System.out.println("Done.");
     }
 }

View File

@@ -26,36 +26,25 @@ public class PDF2WordExample {
     }
     private static void generateDocFromPDF(String filename) throws IOException {
-        System.out.println("Creating a docx file from a PDF file: " + filename);
-        // Create the word document
         XWPFDocument doc = new XWPFDocument();
-        // Open the pdf file
         String pdf = filename;
         PdfReader reader = new PdfReader(pdf);
         PdfReaderContentParser parser = new PdfReaderContentParser(reader);
-        // Read the PDF page by page
         for (int i = 1; i <= reader.getNumberOfPages(); i++) {
             TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
-            // Extract the text
             String text = strategy.getResultantText();
-            // Create a new paragraph in the word document, adding the extracted
-            // text
             XWPFParagraph p = doc.createParagraph();
             XWPFRun run = p.createRun();
             run.setText(text);
-            // Adding a page break
             run.addBreak(BreakType.PAGE);
         }
-        // Write the word document
         FileOutputStream out = new FileOutputStream("src/output/pdf.docx");
         doc.write(out);
-        // Close all open files
         out.close();
         reader.close();
         doc.close();
-        System.out.println("Done.");
     }
 }