diff --git a/pdf/.gitignore b/pdf/.gitignore
new file mode 100644
index 0000000000..b83d22266a
--- /dev/null
+++ b/pdf/.gitignore
@@ -0,0 +1 @@
+/target/
diff --git a/pdf/pom.xml b/pdf/pom.xml
new file mode 100644
index 0000000000..078a364e77
--- /dev/null
+++ b/pdf/pom.xml
@@ -0,0 +1,100 @@ + + + 4.0.0 + + com.baeldung + parent-modules + 1.0.0-SNAPSHOT + + pdf + pdf + http://maven.apache.org + + + UTF-8 + 3.5.1 + + + + + junit + junit + 3.8.1 + test + + + + org.apache.pdfbox + pdfbox + 2.0.3 + + + + org.apache.pdfbox + pdfbox-tools + 2.0.3 + + + + net.sf.cssbox + pdf2dom + 1.6 + + + + com.itextpdf + itextpdf + 5.5.10 + + + + org.apache.poi + poi + 3.15 + + + + org.apache.poi + poi-ooxml + 3.15 + + + + org.apache.poi + poi-scratchpad + 3.15 + + + + org.apache.xmlgraphics + batik-transcoder + 1.8 + + + + + pdf + + + src/main/resources + true + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + 1.8 + 1.8 + + + + + +
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java
new file mode 100644
index 0000000000..72877a465a
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java
@@ -0,0 +1,48 @@
+package com.baeldung.pdf;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Writer;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.fit.pdfdom.PDFDomTree;
+
+/**
+ * Example that converts a PDF file into an HTML file with PDFBox and Pdf2Dom.
+ */
+public class PDF2HTMLExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+
+    public static void main(String[] args) {
+        try {
+            generateHTMLFromPDF(FILENAME);
+        } catch (IOException | ParserConfigurationException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads the given PDF and writes its HTML rendering to {@code src/output/pdf.html}.
+     *
+     * @param filename path of the PDF file to convert
+     * @throws ParserConfigurationException if the DOM parser cannot be created
+     * @throws IOException if the PDF cannot be read or the HTML file cannot be written
+     */
+    private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
+        System.out.println("Creating HTML file from a PDF file: " + filename);
+        // create the DOM parser first: if this fails, neither the PDF nor the output file has been opened yet
+        PDFDomTree parser = new PDFDomTree();
+        // try-with-resources closes both the document and the writer, even when the conversion fails
+        try (PDDocument pdf = PDDocument.load(new File(filename));
+                Writer output = new PrintWriter("src/output/pdf.html", "utf-8")) {
+            // parse the document and write it out as HTML
+            parser.writeText(pdf, output);
+        }
+        System.out.println("Done.");
+    }
+
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java
new file mode 100644
index 0000000000..4cfaea26b9
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java
@@ -0,0 +1,51 @@
+package com.baeldung.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+
+/**
+ * Example that renders every page of a PDF file to an image file with PDFBox.
+ */
+public class PDF2ImageExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+
+    public static void main(String[] args) {
+        try {
+            generateImageFromPDF(FILENAME, "png");
+            generateImageFromPDF(FILENAME, "jpeg");
+            generateImageFromPDF(FILENAME, "gif");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Renders each page of the given PDF at 300 DPI and writes it to
+     * {@code src/output/pdf-<page>.<extension>}.
+     *
+     * @param filename path of the PDF file to render
+     * @param extension image format, also used as the file extension (for example {@code png})
+     * @throws IOException if the PDF cannot be read or an image cannot be written
+     */
+    private static void generateImageFromPDF(String filename, String extension) throws IOException {
+        System.out.println("Creating " + extension + " image from a PDF file: " + filename);
+        // try-with-resources closes the document even when rendering or writing a page fails
+        try (PDDocument document = PDDocument.load(new File(filename))) {
+            PDFRenderer pdfRenderer = new PDFRenderer(document);
+            int pageCount = document.getNumberOfPages();
+            for (int page = 0; page < pageCount; ++page) {
+                System.out.println("Page number: " + (page + 1) + " is being rendered");
+                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
+                ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
+            }
+        }
+        System.out.println("Done.");
+    }
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java
new file mode 100644
index 0000000000..eafdc07560
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java
@@ -0,0 +1,59 @@
+package com.baeldung.pdf;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.io.RandomAccessFile;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+/**
+ * Example that extracts the text of a PDF file with PDFBox and writes it to a text file.
+ */
+public class PDF2TextExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+
+    public static void main(String[] args) {
+        try {
+            generateTxtFromPDF(FILENAME);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Extracts the text of the given PDF and writes it to {@code src/output/pdf.txt}.
+     *
+     * @param filename path of the PDF file to read
+     * @throws IOException if the PDF cannot be parsed or the text file cannot be written
+     */
+    private static void generateTxtFromPDF(String filename) throws IOException {
+        System.out.println("Parsing text from PDF file " + filename);
+        // a parsing failure propagates to the caller, so no output file containing "null" is written
+        String parsedText = extractText(filename);
+
+        System.out.println("Writing PDF text to output text file");
+        // try-with-resources guarantees the writer is closed; charset matches the HTML example
+        try (PrintWriter pw = new PrintWriter("src/output/pdf.txt", "utf-8")) {
+            pw.print(parsedText);
+        }
+        System.out.println("Done.");
+    }
+
+    /** Parses the PDF at the given path and returns its text content. */
+    private static String extractText(String filename) throws IOException {
+        PDFParser parser = new PDFParser(new RandomAccessFile(new File(filename), "r"));
+        parser.parse();
+        COSDocument cosDoc = parser.getDocument();
+        // the PDDocument is closed on every path, not only on failure; closing it is expected to
+        // release the wrapped COSDocument as well (NOTE(review): confirm for the PDFBox version in use)
+        try (PDDocument pdDoc = new PDDocument(cosDoc)) {
+            return new PDFTextStripper().getText(pdDoc);
+        }
+    }
+
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java
new file mode 100644
index 0000000000..6777ea9c45
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java
@@ -0,0 +1,70 @@
+package com.baeldung.pdf;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.poi.xwpf.usermodel.BreakType;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+
+import com.itextpdf.text.pdf.PdfReader;
+import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
+import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
+import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
+
+/**
+ * Example that copies the text of a PDF file into a Word (docx) document with iText and Apache POI.
+ */
+public class PDF2WordExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+
+    public static void main(String[] args) {
+        try {
+            generateDocFromPDF(FILENAME);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Reads the given PDF page by page and writes the extracted text to
+     * {@code src/output/pdf.docx}, one paragraph per PDF page.
+     *
+     * @param filename path of the PDF file to read
+     * @throws IOException if the PDF cannot be read or the docx file cannot be written
+     */
+    private static void generateDocFromPDF(String filename) throws IOException {
+        System.out.println("Creating a docx file from a PDF file: " + filename);
+        // Open the pdf file
+        PdfReader reader = new PdfReader(filename);
+        // Create the word document; try-with-resources closes it even when extraction or writing fails
+        try (XWPFDocument doc = new XWPFDocument()) {
+            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
+
+            // Read the PDF page by page; pages are addressed starting from 1
+            int pageCount = reader.getNumberOfPages();
+            for (int i = 1; i <= pageCount; i++) {
+                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
+                // Extract the text
+                String text = strategy.getResultantText();
+                // Create a new paragraph in the word document, adding the extracted text
+                XWPFParagraph p = doc.createParagraph();
+                XWPFRun run = p.createRun();
+                run.setText(text);
+                // Adding a page break
+                run.addBreak(BreakType.PAGE);
+            }
+            // Write the word document
+            try (FileOutputStream out = new FileOutputStream("src/output/pdf.docx")) {
+                doc.write(out);
+            }
+        } finally {
+            // the reader is closed on every path, including failures
+            reader.close();
+        }
+        System.out.println("Done.");
+    }
+
+}
diff --git a/pdf/src/main/resources/pdf.pdf b/pdf/src/main/resources/pdf.pdf
new file mode 100644
index 0000000000..f45d226b39
Binary files /dev/null and b/pdf/src/main/resources/pdf.pdf differ
diff --git a/pom.xml b/pom.xml
index 82aa022269..ef24531698 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,5 +1,5 @@ - + + 4.0.0 com.baeldung parent-modules
@@ -143,6 +143,7 @@ xml xmlunit2 xstream - + pdf + - +
\ No newline at end of file