Code cleanup for PDF module (#800)
* PDF to X * PDF to X * Remove created doc * Code fixes and cleanup for PDF module
This commit is contained in:
parent
8628a63d95
commit
0ea29df589
|
@ -23,27 +23,17 @@ public class PDF2HTMLExample {
|
|||
}
|
||||
|
||||
private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
|
||||
System.out.println("Creating HTML file from a PDF file: " + filename);
|
||||
PDDocument pdf = null;
|
||||
try {
|
||||
// load the PDF file using PDFBox
|
||||
pdf = PDDocument.load(new File(filename));
|
||||
// create the DOM parser
|
||||
PDDocument pdf = PDDocument.load(new File(filename));
|
||||
PDFDomTree parser = new PDFDomTree();
|
||||
// parse the file and get the DOM Document
|
||||
Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
|
||||
parser.writeText(pdf, output);
|
||||
output.close();
|
||||
} finally {
|
||||
if (pdf != null) {
|
||||
try {
|
||||
pdf.close();
|
||||
} catch (IOException e) {
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
}
|
||||
pdf.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,15 +24,12 @@ public class PDF2ImageExample {
|
|||
}
|
||||
|
||||
private static void generateImageFromPDF(String filename, String extension) throws IOException {
|
||||
System.out.println("Creating " + extension + " image from a PDF file: " + filename);
|
||||
PDDocument document = PDDocument.load(new File(filename));
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
||||
System.out.println("Page number: " + (page + 1) + " is being rendered");
|
||||
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
|
||||
ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
|
||||
}
|
||||
document.close();
|
||||
System.out.println("Done.");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,43 +23,30 @@ public class PDF2TextExample {
|
|||
}
|
||||
|
||||
private static void generateTxtFromPDF(String filename) throws IOException {
|
||||
System.out.println("Parsing text from PDF file " + filename);
|
||||
String parsedText = null;
|
||||
PDFTextStripper pdfStripper;
|
||||
PDDocument pdDoc = null;
|
||||
COSDocument cosDoc = null;
|
||||
|
||||
File f = new File(filename);
|
||||
PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
|
||||
|
||||
try {
|
||||
File f = new File(filename);
|
||||
String parsedText;
|
||||
PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
|
||||
parser.parse();
|
||||
cosDoc = parser.getDocument();
|
||||
pdfStripper = new PDFTextStripper();
|
||||
pdDoc = new PDDocument(cosDoc);
|
||||
|
||||
COSDocument cosDoc = parser.getDocument();
|
||||
|
||||
PDFTextStripper pdfStripper = new PDFTextStripper();
|
||||
PDDocument pdDoc = new PDDocument(cosDoc);
|
||||
|
||||
parsedText = pdfStripper.getText(pdDoc);
|
||||
} catch (Exception e) {
|
||||
System.err.println("An exception occured in parsing the PDF Document.");
|
||||
e.printStackTrace();
|
||||
try {
|
||||
if (cosDoc != null)
|
||||
cosDoc.close();
|
||||
if (pdDoc != null)
|
||||
pdDoc.close();
|
||||
} catch (Exception e1) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
System.out.println("Writing PDF text to output text file");
|
||||
try {
|
||||
|
||||
if (cosDoc != null)
|
||||
cosDoc.close();
|
||||
if (pdDoc != null)
|
||||
pdDoc.close();
|
||||
|
||||
PrintWriter pw = new PrintWriter("src/output/pdf.txt");
|
||||
pw.print(parsedText);
|
||||
pw.close();
|
||||
} catch (Exception e) {
|
||||
System.out.println("An exception occured in writing the pdf text to file.");
|
||||
e.printStackTrace();
|
||||
}
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -26,36 +26,25 @@ public class PDF2WordExample {
|
|||
}
|
||||
|
||||
private static void generateDocFromPDF(String filename) throws IOException {
|
||||
System.out.println("Creating a docx file from a PDF file: " + filename);
|
||||
// Create the word document
|
||||
XWPFDocument doc = new XWPFDocument();
|
||||
|
||||
// Open the pdf file
|
||||
String pdf = filename;
|
||||
PdfReader reader = new PdfReader(pdf);
|
||||
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
|
||||
|
||||
// Read the PDF page by page
|
||||
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
|
||||
TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
|
||||
// Extract the text
|
||||
String text = strategy.getResultantText();
|
||||
// Create a new paragraph in the word document, adding the extracted
|
||||
// text
|
||||
XWPFParagraph p = doc.createParagraph();
|
||||
XWPFRun run = p.createRun();
|
||||
run.setText(text);
|
||||
// Adding a page break
|
||||
run.addBreak(BreakType.PAGE);
|
||||
}
|
||||
// Write the word document
|
||||
FileOutputStream out = new FileOutputStream("src/output/pdf.docx");
|
||||
doc.write(out);
|
||||
// Close all open files
|
||||
out.close();
|
||||
reader.close();
|
||||
doc.close();
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue