This commit is contained in:
michal_aibin 2016-11-02 19:27:44 +01:00
parent e3b40e4556
commit 1150cbd406
8 changed files with 319 additions and 4 deletions

1
pdf/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target/

100
pdf/pom.xml Normal file
View File

@ -0,0 +1,100 @@
<?xml version="1.0"?>
<!-- Maven build descriptor for the "pdf" module: example programs that convert a PDF file to other formats. -->
<project
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<!-- groupId and version are inherited from the com.baeldung parent-modules aggregator POM. -->
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>pdf</artifactId>
<name>pdf</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Referenced by the maven-compiler-plugin declaration in the build section below. -->
<maven-compiler-plugin.version>3.5.1</maven-compiler-plugin.version>
</properties>
<dependencies>
<!-- Test-scoped only. NOTE(review): 3.8.1 is the JUnit 3 line; confirm this is intended before adding annotation-based tests. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- PDFBox core: the org.apache.pdfbox.* classes used by the example programs to load, parse and render PDFs. -->
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.3</version>
</dependency>
<!-- PDFBox tools: supplies org.apache.pdfbox.tools.imageio.ImageIOUtil for writing rendered pages as images. -->
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox-tools -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.3</version>
</dependency>
<!-- pdf2dom: supplies org.fit.pdfdom.PDFDomTree for the PDF-to-HTML conversion. -->
<!-- https://mvnrepository.com/artifact/net.sf.cssbox/pdf2dom -->
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
<version>1.6</version>
</dependency>
<!-- iText 5: supplies the com.itextpdf.text.pdf reader/parser classes used for page-by-page text extraction. -->
<!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.10</version>
</dependency>
<!-- Apache POI (three artifacts below, kept on the same version): used to build the .docx output via org.apache.poi.xwpf. -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<!-- NOTE(review): no example class in this module imports Batik directly; presumably needed transitively at runtime. Confirm or remove. -->
<!-- https://mvnrepository.com/artifact/org.apache.xmlgraphics/batik-transcoder -->
<dependency>
<groupId>org.apache.xmlgraphics</groupId>
<artifactId>batik-transcoder</artifactId>
<version>1.8</version>
</dependency>
</dependencies>
<build>
<finalName>pdf</finalName>
<resources>
<!-- NOTE(review): filtering is enabled for all of src/main/resources. If the sample PDF lives there, filtering may corrupt the copy placed on the classpath; consider excluding binary files. The examples read the file from the source tree, so they are not affected. -->
<resource>
<directory>src/main/resources</directory>
<filtering>true</filtering>
</resource>
</resources>
<plugins>
<!-- Compile for Java 8; the examples use multi-catch, which requires at least Java 7. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler-plugin.version}</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,49 @@
package com.baeldung.pdf;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
/**
 * Example program that converts a PDF file into an HTML file using PDFBox
 * (to load the PDF) and pdf2dom's {@link PDFDomTree} (to emit the HTML).
 */
public class PDF2HTMLExample {

    /** Path of the sample PDF, relative to the working directory. */
    private static final String FILENAME = "src/main/resources/pdf.pdf";

    /**
     * Runs the conversion on the sample PDF and prints the stack trace of any
     * failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            generateHTMLFromPDF(FILENAME);
        } catch (IOException | ParserConfigurationException e) {
            e.printStackTrace();
        }
    }

    /**
     * Loads {@code filename} as a PDF and writes its HTML rendition to
     * {@code src/output/pdf.html} (UTF-8).
     *
     * <p>Both the PDF document and the output writer are closed on every
     * path, including when the conversion throws part-way through.
     *
     * @param filename path of the PDF file to convert
     * @throws ParserConfigurationException if the DOM parser cannot be created
     * @throws IOException if the PDF cannot be read, the HTML cannot be
     *         written, or a resource fails to close
     */
    private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
        System.out.println("Creating HTML file from a PDF file: " + filename);

        // load the PDF file using PDFBox
        try (PDDocument pdf = PDDocument.load(new File(filename))) {
            // create the DOM parser before opening the output so that a parser
            // failure does not leave an empty HTML file behind
            PDFDomTree parser = new PDFDomTree();

            // parse the file and stream the resulting document to the writer
            try (Writer output = new PrintWriter("src/output/pdf.html", "utf-8")) {
                parser.writeText(pdf, output);
            }
        }

        System.out.println("Done.");
    }
}

View File

@ -0,0 +1,38 @@
package com.baeldung.pdf;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
/**
 * Example program that renders every page of a PDF file to an image file
 * using PDFBox's {@link PDFRenderer} and {@link ImageIOUtil}.
 */
public class PDF2ImageExample {

    /** Path of the sample PDF, relative to the working directory. */
    private static final String FILENAME = "src/main/resources/pdf.pdf";

    /**
     * Renders the sample PDF as PNG, JPEG and GIF images and prints the stack
     * trace of any I/O failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            generateImageFromPDF(FILENAME, "png");
            generateImageFromPDF(FILENAME, "jpeg");
            generateImageFromPDF(FILENAME, "gif");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Renders each page of {@code filename} at 300 DPI and writes it to
     * {@code src/output/pdf-<page>.<extension>}, with pages numbered from 1.
     *
     * <p>The document is closed on every path, including when rendering or
     * writing throws.
     *
     * @param filename  path of the PDF file to render
     * @param extension image format, also used as the output file extension
     * @throws IOException if the PDF cannot be read or a page cannot be
     *         rendered or written
     */
    private static void generateImageFromPDF(String filename, String extension) throws IOException {
        System.out.println("Creating " + extension + " image from a PDF file: " + filename);

        try (PDDocument document = PDDocument.load(new File(filename))) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();

            for (int page = 0; page < pageCount; ++page) {
                int pageNumber = page + 1;
                System.out.println("Page number: " + pageNumber + " is being rendered");

                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
                String target = "src/output/pdf" + "-" + pageNumber + "." + extension;

                // writeImage reports failure through its return value rather
                // than an exception; surface it instead of ignoring it.
                if (!ImageIOUtil.writeImage(bim, target, 300)) {
                    System.err.println("Could not write image file: " + target);
                }
            }
        }

        System.out.println("Done.");
    }
}

View File

@ -0,0 +1,65 @@
package com.baeldung.pdf;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Example program that extracts the text of a PDF file with PDFBox's
 * {@link PDFTextStripper} and saves it to a plain-text file.
 */
public class PDF2TextExample {

    /** Path of the sample PDF, relative to the working directory. */
    private static final String FILENAME = "src/main/resources/pdf.pdf";

    /**
     * Runs the extraction on the sample PDF and prints the stack trace of any
     * I/O failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            generateTxtFromPDF(FILENAME);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code filename}, extracts its text and writes the text to
     * {@code src/output/pdf.txt}.
     *
     * <p>The source file, the parsed document and the output writer are
     * closed on every path. If parsing fails the exception is propagated and
     * no output file is written, so a failed parse can no longer produce a
     * file containing the literal text "null".
     *
     * @param filename path of the PDF file to read
     * @throws IOException if the PDF cannot be read or parsed, or the output
     *         file cannot be created
     */
    private static void generateTxtFromPDF(String filename) throws IOException {
        System.out.println("Parsing text from PDF file " + filename);

        String parsedText;
        try (RandomAccessFile source = new RandomAccessFile(new File(filename), "r")) {
            PDFParser parser = new PDFParser(source);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            // Only the PDDocument is managed here; it wraps cosDoc and is
            // expected to close it when it is closed itself.
            try (PDDocument pdDoc = new PDDocument(cosDoc)) {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                parsedText = pdfStripper.getText(pdDoc);
            }
        }

        System.out.println("Writing PDF text to output text file");
        try (PrintWriter pw = new PrintWriter("src/output/pdf.txt")) {
            pw.print(parsedText);
        }

        System.out.println("Done.");
    }
}

View File

@ -0,0 +1,61 @@
package com.baeldung.pdf;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.poi.xwpf.usermodel.BreakType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
 * Example program that copies the text of a PDF file into a Word (.docx)
 * document, using iText to read the PDF and Apache POI to write the document.
 */
public class PDF2WordExample {

    /** Path of the sample PDF, relative to the working directory. */
    private static final String FILENAME = "src/main/resources/pdf.pdf";

    /**
     * Runs the conversion on the sample PDF and prints the stack trace of any
     * I/O failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            generateDocFromPDF(FILENAME);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text of {@code filename} page by page and writes it to
     * {@code src/output/pdf.docx}, one paragraph per PDF page with a page
     * break between consecutive pages.
     *
     * <p>The PDF reader, the Word document and the output stream are released
     * on every path, including when extraction or writing throws.
     *
     * @param filename path of the PDF file to convert
     * @throws IOException if the PDF cannot be read or the document cannot be
     *         written
     */
    private static void generateDocFromPDF(String filename) throws IOException {
        System.out.println("Creating a docx file from a PDF file: " + filename);

        // Open the pdf file; the reader is released in the finally block below
        PdfReader reader = new PdfReader(filename);
        try (XWPFDocument doc = new XWPFDocument()) {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();

            // Read the PDF page by page (iText page numbers start at 1)
            for (int i = 1; i <= pageCount; i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                String text = strategy.getResultantText();

                // One paragraph per PDF page, holding that page's text
                XWPFParagraph p = doc.createParagraph();
                XWPFRun run = p.createRun();
                run.setText(text);

                // Break only between pages: a break after the last page would
                // append an empty trailing page to the document.
                if (i < pageCount) {
                    run.addBreak(BreakType.PAGE);
                }
            }

            // Write the word document
            try (FileOutputStream out = new FileOutputStream("src/output/pdf.docx")) {
                doc.write(out);
            }
        } finally {
            reader.close();
        }

        System.out.println("Done.");
    }
}

Binary file not shown.

View File

@ -1,5 +1,5 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
@ -143,6 +143,7 @@
<module>xml</module>
<module>xmlunit2</module>
<module>xstream</module>
</modules>
<module>pdf</module>
</modules>
</project>
</project>