Code fixes and cleanup for PDF module

This commit is contained in:
michal_aibin 2016-11-03 11:59:14 +01:00
parent db79d31308
commit c4f9a6e120
4 changed files with 19 additions and 56 deletions

View File

@ -23,27 +23,17 @@ public class PDF2HTMLExample {
}
private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
System.out.println("Creating HTML file from a PDF file: " + filename);
PDDocument pdf = null;
try {
// load the PDF file using PDFBox
pdf = PDDocument.load(new File(filename));
// create the DOM parser
PDDocument pdf = PDDocument.load(new File(filename));
PDFDomTree parser = new PDFDomTree();
// parse the file and get the DOM Document
Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
parser.writeText(pdf, output);
output.close();
} finally {
if (pdf != null) {
try {
pdf.close();
} catch (IOException e) {
System.err.println("Error: " + e.getMessage());
}
pdf.close();
}
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("Done.");
}
}

View File

@ -24,15 +24,12 @@ public class PDF2ImageExample {
}
/**
 * Renders every page of the PDF at {@code filename} at 300 DPI (RGB) and writes one image
 * per page to {@code src/output/pdf-<page>.<extension>}, with pages numbered from 1.
 *
 * @param filename  path of the PDF file to render
 * @param extension image file extension appended to each output file name
 * @throws IOException if the PDF cannot be loaded or a page cannot be rendered
 */
private static void generateImageFromPDF(String filename, String extension) throws IOException {
    System.out.println("Creating " + extension + " image from a PDF file: " + filename);
    // try-with-resources guarantees the document is closed even when rendering or
    // writing one of the pages throws part-way through the loop.
    try (PDDocument document = PDDocument.load(new File(filename))) {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        int pageCount = document.getNumberOfPages();
        for (int page = 0; page < pageCount; ++page) {
            // The renderer is called with a 0-based index; messages and file names are 1-based.
            System.out.println("Page number: " + (page + 1) + " is being rendered");
            BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
            ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
        }
    }
    System.out.println("Done.");
}
}

View File

@ -23,43 +23,30 @@ public class PDF2TextExample {
}
/**
 * Extracts the text of the PDF at {@code filename} and writes it to
 * {@code src/output/pdf.txt}.
 *
 * @param filename path of the PDF file to read
 * @throws IOException if the PDF cannot be opened or parsed, or the output file cannot be
 *         created
 */
private static void generateTxtFromPDF(String filename) throws IOException {
    System.out.println("Parsing text from PDF file " + filename);
    String parsedText;
    // Resources are closed in reverse order of declaration, so the underlying file is
    // closed last and stays available while the document is being read.
    try (RandomAccessFile source = new RandomAccessFile(new File(filename), "r")) {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        try (COSDocument cosDoc = parser.getDocument();
                PDDocument pdDoc = new PDDocument(cosDoc)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            parsedText = pdfStripper.getText(pdDoc);
        }
    }

    System.out.println("Writing PDF text to output text file");
    // Parse failures propagate above, so this point is only reached with real text.
    try (PrintWriter pw = new PrintWriter("src/output/pdf.txt")) {
        pw.print(parsedText);
    }
    System.out.println("Done.");
}
}

View File

@ -26,36 +26,25 @@ public class PDF2WordExample {
}
/**
 * Copies the text of the PDF at {@code filename} into a new Word document saved as
 * {@code src/output/pdf.docx}. Each PDF page becomes one paragraph followed by a page
 * break.
 *
 * @param filename path of the PDF file to read
 * @throws IOException if the PDF cannot be read or the docx file cannot be written
 */
private static void generateDocFromPDF(String filename) throws IOException {
    System.out.println("Creating a docx file from a PDF file: " + filename);
    PdfReader reader = new PdfReader(filename);
    // The reader is released in finally and the Word document / output stream via
    // try-with-resources, so nothing stays open when extraction or writing fails.
    try (XWPFDocument doc = new XWPFDocument()) {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();
        // Pages are addressed starting at 1 here.
        for (int i = 1; i <= pageCount; i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            String text = strategy.getResultantText();
            // One paragraph per PDF page, terminated by a page break.
            XWPFParagraph p = doc.createParagraph();
            XWPFRun run = p.createRun();
            run.setText(text);
            run.addBreak(BreakType.PAGE);
        }
        try (FileOutputStream out = new FileOutputStream("src/output/pdf.docx")) {
            doc.write(out);
        }
    } finally {
        reader.close();
    }
    System.out.println("Done.");
}
}