//default package import java.io.FileInputStream; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Calendar; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.util.URIref; import com.hp.hpl.jena.vocabulary.DC; import com.hp.hpl.jena.vocabulary.DCTerms; import com.hp.hpl.jena.vocabulary.DCTypes; import com.hp.hpl.jena.vocabulary.RDF; /** * quick'n'dirty PDF to RDF converter * * @author mimas@mimas.ceti.pl * @version 1.0 * * @see PDFBox - Java PDF Library * @see Jena - Semantic Web Framework */ public class PDF2RDF { public static void main(String[] args) throws Exception { if (args.length < 1) { System.out.println( "Usage: java PDF2RDF " ); System.exit(1); } Model model = ModelFactory.createDefaultModel(); new PDF2RDF().addPDFResource(model, args[0]); // w ramach testu wyświetlam RDF odczytany z PDF // w zoptymalizowanej notacji RDF/XML System.out.println("*** RDF/XML:\n"); model.write(System.out, "RDF/XML-ABBREV"); // i w notacji N3 System.out.println("\n*** N3:\n"); model.write(System.out, "N3"); } /** * @param model Model RDF, do którego dodajemy zasób PDF * @param filename Nazwa pliku PDF-a */ public Resource addPDFResource(Model model, String filename) throws IOException { Resource pdf = null; PDDocument document = null; FileInputStream file = null; try { file = new FileInputStream(filename); PDFParser parser = new PDFParser(file); parser.parse(); document = parser.getPDDocument(); String nsPdf = "http://ns.adobe.com/pdf/1.3/"; model.setNsPrefix("rdf", RDF.getURI()); model.setNsPrefix("dc", DC.NS); model.setNsPrefix("dct", DCTerms.NS); model.setNsPrefix("pdf", nsPdf); pdf = model.createResource(URIref.encode("file://" + filename)); pdf.addProperty(DC.type, DCTypes.Text); pdf.addProperty(DC.format, "application/pdf" ); if (document.isEncrypted()) { try { document.decrypt(""); } catch (Exception e) { System.err.println("Error: Failed to decrypt document."); return pdf; } } PDDocumentInformation info = document.getDocumentInformation(); addProperty(model, pdf, nsPdf+"Title", info.getTitle()); addProperty(model, pdf, nsPdf+"Subject", info.getSubject()); addProperty(model, pdf, nsPdf+"Author", info.getAuthor()); addProperty(model, pdf, nsPdf+"CreationDate", formatDate(info.getCreationDate())); addProperty(model, pdf, nsPdf+"ModDate", formatDate(info.getModificationDate())); addProperty(model, pdf, nsPdf+"Creator", info.getCreator()); addProperty(model, pdf, nsPdf+"Producer", info.getProducer()); addProperty(model, pdf, nsPdf+"Keywords", info.getKeywords()); if ( info.getKeywords() != null ) { String[] k = info.getKeywords().split(" "); for (int i = 0; i < k.length; i++) { pdf.addProperty(DC.subject, k[i]); } } } finally { if (file != null) { file.close(); } if (document != null) { document.close(); } } return pdf; } private void addProperty(Model m, Resource r, String p, String s) { if ( s != null ) r.addProperty(m.createProperty(p), s); } private String formatDate(Calendar date) { String retval = null; if (date != null) { SimpleDateFormat formatter = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss"); retval = formatter.format(date.getTime()); } return retval; } }