diff --git a/pdf/.gitignore b/pdf/.gitignore
new file mode 100644
index 0000000000..b83d22266a
--- /dev/null
+++ b/pdf/.gitignore
@@ -0,0 +1 @@
+/target/
diff --git a/pdf/pom.xml b/pdf/pom.xml
new file mode 100644
index 0000000000..078a364e77
--- /dev/null
+++ b/pdf/pom.xml
@@ -0,0 +1,100 @@
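+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>com.baeldung</groupId>
+        <artifactId>parent-modules</artifactId>
+        <version>1.0.0-SNAPSHOT</version>
+    </parent>
+    <artifactId>pdf</artifactId>
+    <name>pdf</name>
+    <url>http://maven.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven-compiler-plugin.version>3.5.1</maven-compiler-plugin.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>3.8.1</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>2.0.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox-tools</artifactId>
+            <version>2.0.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>net.sf.cssbox</groupId>
+            <artifactId>pdf2dom</artifactId>
+            <version>1.6</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.itextpdf</groupId>
+            <artifactId>itextpdf</artifactId>
+            <version>5.5.10</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+            <version>3.15</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml</artifactId>
+            <version>3.15</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-scratchpad</artifactId>
+            <version>3.15</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.xmlgraphics</groupId>
+            <artifactId>batik-transcoder</artifactId>
+            <version>1.8</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <finalName>pdf</finalName>
+        <resources>
+            <resource>
+                <directory>src/main/resources</directory>
+                <filtering>true</filtering>
+            </resource>
+        </resources>
+
+        <plugins>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>${maven-compiler-plugin.version}</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+
+        </plugins>
+    </build>
+</project>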
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java
new file mode 100644
index 0000000000..72877a465a
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2HTMLExample.java
@@ -0,0 +1,52 @@
+package com.baeldung.pdf;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Writer;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.fit.pdfdom.PDFDomTree;
+
+/**
+ * Example: converts a PDF file into an HTML page using PDFBox and pdf2dom.
+ */
+public class PDF2HTMLExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+    private static final String OUTPUT = "src/output/pdf.html";
+
+    public static void main(String[] args) {
+        try {
+            generateHTMLFromPDF(FILENAME);
+        } catch (IOException | ParserConfigurationException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads the given PDF and writes its HTML rendering to {@code src/output/pdf.html}.
+     *
+     * @param filename path of the PDF file to convert
+     * @throws ParserConfigurationException propagated from the {@link PDFDomTree} constructor
+     * @throws IOException if the PDF cannot be read or the HTML file cannot be written
+     */
+    private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
+        System.out.println("Creating HTML file from a PDF file: " + filename);
+        File target = new File(OUTPUT);
+        File outputDir = target.getParentFile();
+        // PrintWriter does not create missing directories, so make sure the folder exists first.
+        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
+            throw new IOException("Cannot create output directory: " + outputDir);
+        }
+        // try-with-resources closes both the writer and the document, even if the conversion fails.
+        try (PDDocument pdf = PDDocument.load(new File(filename));
+                Writer output = new PrintWriter(target, "utf-8")) {
+            new PDFDomTree().writeText(pdf, output);
+        }
+        System.out.println("Done.");
+    }
+
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java
new file mode 100644
index 0000000000..4cfaea26b9
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2ImageExample.java
@@ -0,0 +1,61 @@
+package com.baeldung.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+
+/**
+ * Example: renders every page of a PDF file to PNG, JPEG and GIF images using PDFBox.
+ */
+public class PDF2ImageExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+    private static final String OUTPUT_DIR = "src/output";
+    private static final int DPI = 300;
+
+    public static void main(String[] args) {
+        try {
+            generateImageFromPDF(FILENAME, "png");
+            generateImageFromPDF(FILENAME, "jpeg");
+            generateImageFromPDF(FILENAME, "gif");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Renders each page of the PDF at 300 DPI and saves it as
+     * {@code src/output/pdf-<page>.<extension>} (pages are numbered from 1).
+     *
+     * @param filename path of the PDF file to render
+     * @param extension image format and file extension, e.g. {@code "png"}
+     * @throws IOException if the PDF cannot be read or an image cannot be written
+     */
+    private static void generateImageFromPDF(String filename, String extension) throws IOException {
+        System.out.println("Creating " + extension + " image from a PDF file: " + filename);
+        File outputDir = new File(OUTPUT_DIR);
+        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
+            throw new IOException("Cannot create output directory: " + outputDir);
+        }
+        // try-with-resources releases the document even when rendering or writing fails.
+        try (PDDocument document = PDDocument.load(new File(filename))) {
+            PDFRenderer pdfRenderer = new PDFRenderer(document);
+            int pageCount = document.getNumberOfPages();
+            for (int page = 0; page < pageCount; ++page) {
+                System.out.println("Page number: " + (page + 1) + " is being rendered");
+                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, DPI, ImageType.RGB);
+                String target = OUTPUT_DIR + "/pdf-" + (page + 1) + "." + extension;
+                // writeImage signals a failed write by returning false rather than throwing.
+                if (!ImageIOUtil.writeImage(bim, target, DPI)) {
+                    throw new IOException("Could not write image: " + target);
+                }
+            }
+        }
+        System.out.println("Done.");
+    }
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java
new file mode 100644
index 0000000000..eafdc07560
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2TextExample.java
@@ -0,0 +1,69 @@
+package com.baeldung.pdf;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.io.RandomAccessFile;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+/**
+ * Example: extracts the text of a PDF file with PDFBox and saves it as a plain-text file.
+ */
+public class PDF2TextExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+    private static final String OUTPUT = "src/output/pdf.txt";
+
+    public static void main(String[] args) {
+        try {
+            generateTxtFromPDF(FILENAME);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Parses the PDF, extracts its text and writes it to {@code src/output/pdf.txt}.
+     * Nothing is written when parsing fails; the failure is reported to the caller instead.
+     *
+     * @param filename path of the PDF file to read
+     * @throws IOException if the PDF cannot be parsed or the text file cannot be written
+     */
+    private static void generateTxtFromPDF(String filename) throws IOException {
+        System.out.println("Parsing text from PDF file " + filename);
+        String parsedText = extractText(filename);
+
+        System.out.println("Writing PDF text to output text file");
+        File target = new File(OUTPUT);
+        File outputDir = target.getParentFile();
+        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
+            throw new IOException("Cannot create output directory: " + outputDir);
+        }
+        try (PrintWriter pw = new PrintWriter(target, "UTF-8")) {
+            pw.print(parsedText);
+            // PrintWriter never throws on write failures; it only records them.
+            if (pw.checkError()) {
+                throw new IOException("Failed to write " + target);
+            }
+        }
+        System.out.println("Done.");
+    }
+
+    /** Reads the whole document text, closing the source file and the document afterwards. */
+    private static String extractText(String filename) throws IOException {
+        try (RandomAccessFile source = new RandomAccessFile(new File(filename), "r")) {
+            PDFParser parser = new PDFParser(source);
+            parser.parse();
+            COSDocument cosDoc = parser.getDocument();
+            // Closing the PDDocument also closes the COSDocument it wraps.
+            try (PDDocument pdDoc = new PDDocument(cosDoc)) {
+                return new PDFTextStripper().getText(pdDoc);
+            }
+        }
+    }
+
+}
diff --git a/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java b/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java
new file mode 100644
index 0000000000..6777ea9c45
--- /dev/null
+++ b/pdf/src/main/java/com/baeldung/pdf/PDF2WordExample.java
@@ -0,0 +1,74 @@
+package com.baeldung.pdf;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.poi.xwpf.usermodel.BreakType;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+
+import com.itextpdf.text.pdf.PdfReader;
+import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
+import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
+import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
+
+/**
+ * Example: copies the text of a PDF file (read with iText) into a Word document (built with POI).
+ */
+public class PDF2WordExample {
+
+    private static final String FILENAME = "src/main/resources/pdf.pdf";
+    private static final String OUTPUT = "src/output/pdf.docx";
+
+    public static void main(String[] args) {
+        try {
+            generateDocFromPDF(FILENAME);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Writes the text of every PDF page into its own paragraph of {@code src/output/pdf.docx},
+     * separating consecutive pages with a page break.
+     *
+     * @param filename path of the PDF file to convert
+     * @throws IOException if the PDF cannot be read or the document cannot be written
+     */
+    private static void generateDocFromPDF(String filename) throws IOException {
+        System.out.println("Creating a docx file from a PDF file: " + filename);
+        File target = new File(OUTPUT);
+        File outputDir = target.getParentFile();
+        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
+            throw new IOException("Cannot create output directory: " + outputDir);
+        }
+
+        PdfReader reader = new PdfReader(filename);
+        // The reader is closed in a finally block, the POI document and the stream by
+        // try-with-resources, so nothing stays open when a step fails.
+        try (XWPFDocument doc = new XWPFDocument()) {
+            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
+            int pageCount = reader.getNumberOfPages();
+            // iText page numbers start at 1
+            for (int page = 1; page <= pageCount; page++) {
+                TextExtractionStrategy strategy = parser.processContent(page, new SimpleTextExtractionStrategy());
+                XWPFParagraph paragraph = doc.createParagraph();
+                XWPFRun run = paragraph.createRun();
+                run.setText(strategy.getResultantText());
+                // Break only between pages: a break after the last page would append an empty page.
+                if (page < pageCount) {
+                    run.addBreak(BreakType.PAGE);
+                }
+            }
+            try (FileOutputStream out = new FileOutputStream(target)) {
+                doc.write(out);
+            }
+        } finally {
+            reader.close();
+        }
+        System.out.println("Done.");
+    }
+
+}
diff --git a/pdf/src/main/resources/pdf.pdf b/pdf/src/main/resources/pdf.pdf
new file mode 100644
index 0000000000..f45d226b39
Binary files /dev/null and b/pdf/src/main/resources/pdf.pdf differ
diff --git a/pom.xml b/pom.xml
index 82aa022269..ef24531698 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,5 +1,5 @@
-
+
+
4.0.0
com.baeldung
parent-modules
@@ -143,6 +143,7 @@
xml
xmlunit2
xstream
-
+ pdf
+
-
+
\ No newline at end of file