//default package
import java.io.FileInputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.util.URIref;
import com.hp.hpl.jena.vocabulary.DC;
import com.hp.hpl.jena.vocabulary.DCTerms;
import com.hp.hpl.jena.vocabulary.DCTypes;
import com.hp.hpl.jena.vocabulary.RDF;
/**
* quick'n'dirty PDF to RDF converter
*
* @author mimas@mimas.ceti.pl
* @version 1.0
*
* @see "PDFBox - Java PDF Library"
* @see "Jena - Semantic Web Framework"
*/
public class PDF2RDF {

    /** Namespace under which the PDF document-information properties are published. */
    private static final String NS_PDF = "http://ns.adobe.com/pdf/1.3/";

    /**
     * Command-line entry point: reads the PDF named by the first argument,
     * converts its metadata to RDF and prints the resulting model to standard
     * output, first as abbreviated RDF/XML and then as N3.
     *
     * @param args command-line arguments; {@code args[0]} is the PDF file name
     * @throws Exception if the PDF cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("Usage: java PDF2RDF <file.pdf>");
            System.exit(1);
        }
        Model model = ModelFactory.createDefaultModel();
        new PDF2RDF().addPDFResource(model, args[0]);
        // As a smoke test, print the RDF extracted from the PDF
        // in the abbreviated RDF/XML notation...
        System.out.println("*** RDF/XML:\n");
        model.write(System.out, "RDF/XML-ABBREV");
        // ...and in N3 notation.
        System.out.println("\n*** N3:\n");
        model.write(System.out, "N3");
    }

    /**
     * Parses a PDF file and adds a resource describing it to the given model.
     *
     * <p>The resource always receives {@code dc:type} and {@code dc:format}.
     * When the document information can be read, the title, subject, author,
     * creation/modification dates, creator, producer and keywords are added
     * under the {@code pdf:} namespace, and every whitespace-separated keyword
     * is additionally added as a {@code dc:subject}. If the document is
     * encrypted and cannot be decrypted with an empty password, an error is
     * printed to standard error and the resource is returned with only its
     * type and format set.
     *
     * @param model RDF model to which the PDF resource is added
     * @param filename name of the PDF file
     * @return the resource created for the PDF file
     * @throws IOException if the file cannot be opened, or if parsing or
     *         closing the document fails with an I/O error
     */
    public Resource addPDFResource(Model model, String filename) throws IOException {
        Resource pdf = null;
        PDDocument document = null;
        FileInputStream file = new FileInputStream(filename);
        try {
            PDFParser parser = new PDFParser(file);
            parser.parse();
            document = parser.getPDDocument();

            model.setNsPrefix("rdf", RDF.getURI());
            model.setNsPrefix("dc", DC.NS);
            model.setNsPrefix("dct", DCTerms.NS);
            model.setNsPrefix("pdf", NS_PDF);

            // NOTE(review): for a relative filename this yields "file://name.pdf",
            // where the name lands in the URI authority position - confirm whether
            // callers always pass absolute paths before changing the identifier.
            pdf = model.createResource(URIref.encode("file://" + filename));
            pdf.addProperty(DC.type, DCTypes.Text);
            pdf.addProperty(DC.format, "application/pdf");

            if (document.isEncrypted()) {
                try {
                    // Try the empty password; anything else cannot be read here.
                    document.decrypt("");
                } catch (Exception e) {
                    System.err.println("Error: Failed to decrypt document.");
                    return pdf;
                }
            }

            PDDocumentInformation info = document.getDocumentInformation();
            String keywords = info.getKeywords();
            addProperty(model, pdf, NS_PDF + "Title", info.getTitle());
            addProperty(model, pdf, NS_PDF + "Subject", info.getSubject());
            addProperty(model, pdf, NS_PDF + "Author", info.getAuthor());
            addProperty(model, pdf, NS_PDF + "CreationDate", formatDate(info.getCreationDate()));
            addProperty(model, pdf, NS_PDF + "ModDate", formatDate(info.getModificationDate()));
            addProperty(model, pdf, NS_PDF + "Creator", info.getCreator());
            addProperty(model, pdf, NS_PDF + "Producer", info.getProducer());
            addProperty(model, pdf, NS_PDF + "Keywords", keywords);

            if (keywords != null) {
                // Split on runs of whitespace and skip empty tokens so that
                // repeated or leading separators never produce an empty dc:subject.
                String[] tokens = keywords.split("\\s+");
                for (int i = 0; i < tokens.length; i++) {
                    if (tokens[i].length() > 0) {
                        pdf.addProperty(DC.subject, tokens[i]);
                    }
                }
            }
        } finally {
            // Nested finally: the document must still be closed when closing
            // the input stream itself throws.
            try {
                file.close();
            } finally {
                if (document != null) {
                    document.close();
                }
            }
        }
        return pdf;
    }

    /**
     * Adds a literal-valued property to a resource, ignoring missing values.
     *
     * @param m model used to create the property
     * @param r resource that receives the property
     * @param p full URI of the property
     * @param s property value; nothing is added when this is {@code null}
     */
    private void addProperty(Model m, Resource r, String p, String s) {
        if (s != null) {
            r.addProperty(m.createProperty(p), s);
        }
    }

    /**
     * Formats a date as {@code yyyy.MM.dd HH:mm:ss}.
     *
     * <p>NOTE(review): the formatter uses the JVM default time zone rather than
     * the calendar's own zone, so output depends on where the converter runs -
     * confirm whether that is intended.
     *
     * @param date date to format, may be {@code null}
     * @return the formatted date, or {@code null} when {@code date} is {@code null}
     */
    private String formatDate(Calendar date) {
        String retval = null;
        if (date != null) {
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss");
            retval = formatter.format(date.getTime());
        }
        return retval;
    }
}