#!/usr/bin/env bash

# PDF2ARCHIVE 0.3.2
# (C) 2018 Matteo Seclì
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


#=====# INITIALIZE VARIABLES #=====#

VERSION="0.3.2"
INPUT=""
OUTPUT=""
QUALITYOPTS=""
DEBUG=false
VALIDATE=false
# Ghostscript options that silence its normal output; cleared by --debug.
MSGOPTS="-dQUIET -sstdout=/dev/null"
# Extra veraPDF options; set to "--verbose" by --debug.
VERAMSGOPTS=""
#ERROPTS="2>/dev/null"


#=====# HELP FUNCTION #=====#

# Print the banner and the usage text on stdout.
# Globals: VERSION (read)
help() {
    local totlen=38 # Total width of the banner box; adjust this
    local title="PDF2ARCHIVE, version $VERSION"
    # Centre the title inside the box; the "% 2" term gives the extra
    # column to the left side when the free space is odd.
    local pad_left=$(( (totlen - ${#title}) / 2 + (totlen - ${#title}) % 2 - 1 ))
    local pad_right=$(( (totlen - ${#title}) / 2 - 1 ))
    local title_line dashes
    printf -v title_line '|%-*s%s%-*s|' "$pad_left" "" "$title" "$pad_right" ""
    # Build a run of $totlen dashes without eval/brace expansion.
    printf -v dashes '%*s' "$totlen" ''
    dashes=${dashes// /-}

    cat <<EOF
$dashes
$title_line
$dashes

OVERVIEW:
    A simple Ghostscript-based PDF to PDF/A-1B converter.

USAGE:
    $0 [options] input.pdf [output.pdf]

EXAMPLES:
    Convert 'input.pdf' in PDF/A-1B format; the output is 'input-PDFA.pdf':
        $0 input.pdf
    Convert 'input.pdf' in PDF/A-1B format; the output is 'output.pdf':
        $0 input.pdf output.pdf
    Convert 'input.pdf' in PDF/A-1B format and perform a high-quality compression:
        $0 --quality=high input.pdf
    Convert 'input.pdf' in PDF/A-1B format and specify the document title:
        $0 --title="Title of your nice document" input.pdf
    Convert 'input.pdf' in PDF/A-1B format and validate the result:
        $0 --validate input.pdf

OPTIONS:
    -h, --help          Show the help
    --quality=<value>   Set the quality of the output when downsampling.
                        The possible values are 'high', 'medium' and 'low',
                        where 'high' gives the highest output quality.
                        By specifying no option, no additional downsampling
                        is done.
    --title=<value>     Title of the resulting PDF/A file
    --author=<value>    Author of the resulting PDF/A file
    --subject=<value>   Subject of the resulting PDF/A file
    --keywords=<value>  Comma-separated keywords of the resulting PDF/A file
    --cleanmetadata     Clean all the standard metadata fields, except the
                        ones specified via the command line options.
    --validate          Validate the resulting file. The validation is done
                        with VeraPDF, you need a working Java installation.
    --validate-only     Perform only the validation on the input file,
                        again using VeraPDF
    --debug             Write additional debug information on screen
    -v, --version       Show the program version

LICENSE:
    GPLv3

AUTHORS:
    (C) 2017-2018 Matteo Seclì
EOF
}


#=====# RUN HELPER FUNCTION #=====#

# Run the given command line. In debug mode its stderr is shown,
# otherwise stderr is discarded.
# Globals:   DEBUG (read)
# Arguments: the command and its arguments
# Returns:   the exit status of the command
run() {
    if [ "$DEBUG" = true ]; then
        #v=$(exec 2>&1 && set -x && set -- "$@")
        #echo "#${v#*--}"
        "$@"
    else
        "$@" 2>/dev/null #>/dev/null 2>&1
    fi
}


#=====# CHECKS #=====#

if ! command -v gs >/dev/null 2>&1; then
    echo " ERROR: Ghostscript is not installed or it's not in the path"
    exit 1
fi


#=====# VALIDATION #=====#

# Abort with status 1 when no 'java' executable is found in the path.
javaCheck() {
    if ! command -v java >/dev/null 2>&1; then
        echo " ERROR: Java is not installed or it's not in the path"
        echo " Cannot perform validation"
        exit 1
    fi
}

# Validate a file against PDF/A-1B with veraPDF and print its report.
# Arguments: optional extra veraPDF options, followed by the file to check.
# NOTE: the veraPDF launcher path is relative to the current working
# directory, not to the location of this script.
validate() {
    echo " Validating..."
    # All caller arguments are forwarded after the fixed options, so an
    # empty option list no longer turns into an empty "" file argument.
    echo " $(./verapdf/verapdf --extract --flavour 1b --format text "$@")"
}


#=====# INPUT PARSER #=====#

if [ -z "${1-}" ]; then
    help
    exit
fi

while [ $# -gt 0 ]; do
    # Split "--option=value" at the FIRST '=' only, so that values may
    # themselves contain '='. Arguments that do not start with '-' (the
    # PDF file names) are taken verbatim.
    case "$1" in
        -*=*)
            PARAM=${1%%=*}
            VALUE=${1#*=}
            ;;
        *)
            PARAM=$1
            VALUE=""
            ;;
    esac

    case "$PARAM" in
        -h | --help)
            help
            exit
            ;;
        -v | --version)
            echo "$VERSION"
            exit
            ;;
        --debug)
            DEBUG=true
            MSGOPTS=""
            VERAMSGOPTS="--verbose"
            #ERROPTS=""
            ;;
        --quality)
            case "$VALUE" in
                high)   QUALITYOPTS="-dPDFSETTINGS=/printer" ;;
                medium) QUALITYOPTS="-dPDFSETTINGS=/ebook" ;;
                low)    QUALITYOPTS="-dPDFSETTINGS=/screen" ;;
                *)
                    echo " ERROR: unknown quality option '$VALUE'"
                    help
                    exit 1
                    ;;
            esac
            ;;
        --cleanmetadata)
            # Define every still-unset field as empty; a field that is set
            # (even to "") is not read from the input file later on.
            PDFTITLE=${PDFTITLE-}
            PDFAUTHOR=${PDFAUTHOR-}
            PDFSUBJECT=${PDFSUBJECT-}
            PDFKEYWORDS=${PDFKEYWORDS-}
            PDFCREATOR=${PDFCREATOR-}
            PDFPRODUCER=${PDFPRODUCER-}
            PDFCREATIONDATE=${PDFCREATIONDATE-}
            PDFMODDATE=${PDFMODDATE-}
            PDFTRAPPED=${PDFTRAPPED-}
            ;;
        --title)
            PDFTITLE=$VALUE
            ;;
        --author)
            PDFAUTHOR=$VALUE
            ;;
        --subject)
            PDFSUBJECT=$VALUE
            ;;
        --keywords)
            PDFKEYWORDS=$VALUE
            ;;
        --validate)
            javaCheck
            VALIDATE=true
            ;;
        --validate-only)
            javaCheck
            if [ -z "${2-}" ]; then
                echo " ERROR: --validate-only needs a PDF file to validate"
                exit 1
            fi
            # VERAMSGOPTS is left unquoted on purpose: it is either empty
            # or a single option.
            # shellcheck disable=SC2086
            validate $VERAMSGOPTS "$2"
            exit
            ;;
        *.pdf)
            if [ "$INPUT" == "" ]; then
                INPUT=$PARAM
            elif [ "$OUTPUT" == "" ]; then
                OUTPUT=$PARAM
            else
                echo " ERROR: too many PDF files as input!"
                help
                exit 1
            fi
            ;;
        *)
            echo " ERROR: unknown parameter \"$PARAM\""
            help
            exit 1
            ;;
    esac
    shift
done

# Options alone are not enough: an existing input file is required.
if [ -z "$INPUT" ]; then
    echo " ERROR: no input PDF file specified"
    help
    exit 1
fi
if [ ! -f "$INPUT" ]; then
    echo " ERROR: input file '$INPUT' not found"
    exit 1
fi


#=====# SET UP ALL THE STUFF #=====#

echo "=== Welcome to PDF2ARCHIVE ==="

if [ "$OUTPUT" == "" ]; then
    OUTPUT="${INPUT%.pdf}-PDFA.pdf"
fi

TMPFILE=$(mktemp) || {
    echo " ERROR: cannot create a temporary file"
    exit 1
}
TMPDIR=$(mktemp -d) || {
    rm -f -- "$TMPFILE"
    echo " ERROR: cannot create a temporary directory"
    exit 1
}

# Remove the scratch file and directory on every exit path. The paths are
# frozen in read-only copies so the handler can only ever delete what the
# two mktemp calls above created.
readonly P2A_SCRATCH_FILE=$TMPFILE
readonly P2A_SCRATCH_DIR=$TMPDIR
cleanup() {
    rm -rf -- "$P2A_SCRATCH_FILE" "$P2A_SCRATCH_DIR"
}
trap cleanup EXIT

PSTMPFILE=$TMPDIR/PDFA_def.ps
ICCTMPFILE=$TMPDIR/AdobeRGB1998.icc
INFOTMPFILE=$TMPDIR/pdf_minimal_info.ps

# PostScript program that prints each known entry of the input's Info
# dictionary as a "__knowninfo<Key>: <value>" line.
cat > "$INFOTMPFILE" <<'EOF'
%!PS
% Extract PDF info in a minimal way.
% Inspired by 'toolbin/pdf_info.ps'.

/QUIET true def
File dup (r) file runpdfbegin
Trailer /Info knownoget {
    dup /Title knownoget { (__knowninfoTitle: ) print = flush } if
    dup /Author knownoget { (__knowninfoAuthor: ) print = flush } if
    dup /Subject knownoget { (__knowninfoSubject: ) print = flush } if
    dup /Keywords knownoget { (__knowninfoKeywords: ) print = flush } if
    dup /Creator knownoget { (__knowninfoCreator: ) print = flush } if
    dup /Producer knownoget { (__knowninfoProducer: ) print = flush } if
    dup /CreationDate knownoget { (__knowninfoCreationDate: ) print = flush } if
    dup /ModDate knownoget { (__knowninfoModDate: ) print = flush } if
    dup /Trapped knownoget { (__knowninfoTrapped: ) print = flush } if
} if
quit
EOF


#=====# PRESERVE UNSPECIFIED KNOWN STANDARD METADATA #=====#

# Notes:
#     'iconv' is necessary to filter out all the invalid bytes.
#     If it's not used, sed (unless it's GNU sed) will fail with
#     'RE error: illegal byte sequence'. A solution to this is to
#     use 'LC_CTYPE=C && LANG=C && echo "$METADUMP" ...' in the
#     variable assignments; however, this produces bad PDF files.
#

# Print the value of the "__knowninfo<Field>: " line found in $METADUMP,
# or nothing when the field is absent.
# Globals:   METADUMP (read)
# Arguments: $1 - Info key as written by the probe (Title, Author, ...)
meta_field() {
    # The marker is matched at the start of the line only, so a value
    # that merely contains the marker text is not picked up.
    printf '%s\n' "$METADUMP" | sed -n "s/^__knowninfo$1: //p"
}

# Dump the Info dictionary of the input file; iconv drops invalid bytes.
METADUMP=$(gs -dNOSAFER -dNODISPLAY -q -sFile="$INPUT" "$INFOTMPFILE" | iconv -f utf-8 -t utf-8 -c)

# A field that is already set (even to the empty string) is kept as it is;
# only unset fields are filled from the dump.
[ -z "${PDFTITLE+x}" ] && PDFTITLE=$(meta_field Title)
[ -z "${PDFAUTHOR+x}" ] && PDFAUTHOR=$(meta_field Author)
[ -z "${PDFSUBJECT+x}" ] && PDFSUBJECT=$(meta_field Subject)
[ -z "${PDFKEYWORDS+x}" ] && PDFKEYWORDS=$(meta_field Keywords)
[ -z "${PDFCREATOR+x}" ] && PDFCREATOR=$(meta_field Creator)
[ -z "${PDFPRODUCER+x}" ] && PDFPRODUCER=$(meta_field Producer)
[ -z "${PDFCREATIONDATE+x}" ] && PDFCREATIONDATE=$(meta_field CreationDate)
[ -z "${PDFMODDATE+x}" ] && PDFMODDATE=$(meta_field ModDate)
[ -z "${PDFTRAPPED+x}" ] && PDFTRAPPED=$(meta_field Trapped)

# Replace "Trapped" string, if not empty, with an operator. Fixes 3Heights.
# The value becomes "/" + upper-cased first character + lower-cased rest.
if [ "$PDFTRAPPED" != "" ]; then
    PDFTRAPPED="/$(tr '[:lower:]' '[:upper:]' <<< "${PDFTRAPPED:0:1}")$(tr '[:upper:]' '[:lower:]' <<< "${PDFTRAPPED:1}")"
fi

# Check if the operator is allowed, otherwise empty variable.
if [ "$PDFTRAPPED" != "/True" ] && [ "$PDFTRAPPED" != "/False" ]; then
    # Only /True and /False are written out; anything else is dropped.
    PDFTRAPPED=""
fi


#=====# HELPERS FOR THE GENERATED FILES #=====#

# Escape a string so it can be placed between "(" and ")" in PostScript.
# Backslashes and parentheses get a leading backslash; without this a
# title such as "Report (draft" yields an unparsable definition file.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout, without a trailing newline
ps_escape() {
    local raw=$1 escaped="" ch i
    for (( i = 0; i < ${#raw}; i++ )); do
        ch=${raw:i:1}
        case "$ch" in
            '\' | '(' | ')') escaped+="\\$ch" ;;
            *) escaped+=$ch ;;
        esac
    done
    printf '%s' "$escaped"
}

# Write the embedded 560-byte ICC profile (description string
# "Adobe RGB (1998)") to a file. The data is laid out as 35 rows of
# 16 bytes, like a hex dump, so that it can be audited.
# Arguments: $1 - destination path
write_icc_profile() {
    {
        printf '\x00\x00\x02\x30\x41\x44\x42\x45\x02\x10\x00\x00\x6d\x6e\x74\x72'
        printf '\x52\x47\x42\x20\x58\x59\x5a\x20\x07\xd0\x00\x08\x00\x0b\x00\x13'
        printf '\x00\x33\x00\x3b\x61\x63\x73\x70\x41\x50\x50\x4c\x00\x00\x00\x00'
        printf '\x6e\x6f\x6e\x65\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3\x2d'
        printf '\x41\x44\x42\x45\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x0a\x63\x70\x72\x74\x00\x00\x00\xfc\x00\x00\x00\x32'
        printf '\x64\x65\x73\x63\x00\x00\x01\x30\x00\x00\x00\x6b\x77\x74\x70\x74'
        printf '\x00\x00\x01\x9c\x00\x00\x00\x14\x62\x6b\x70\x74\x00\x00\x01\xb0'
        printf '\x00\x00\x00\x14\x72\x54\x52\x43\x00\x00\x01\xc4\x00\x00\x00\x0e'
        printf '\x67\x54\x52\x43\x00\x00\x01\xd4\x00\x00\x00\x0e\x62\x54\x52\x43'
        printf '\x00\x00\x01\xe4\x00\x00\x00\x0e\x72\x58\x59\x5a\x00\x00\x01\xf4'
        printf '\x00\x00\x00\x14\x67\x58\x59\x5a\x00\x00\x02\x08\x00\x00\x00\x14'
        printf '\x62\x58\x59\x5a\x00\x00\x02\x1c\x00\x00\x00\x14\x74\x65\x78\x74'
        printf '\x00\x00\x00\x00\x43\x6f\x70\x79\x72\x69\x67\x68\x74\x20\x32\x30'
        printf '\x30\x30\x20\x41\x64\x6f\x62\x65\x20\x53\x79\x73\x74\x65\x6d\x73'
        printf '\x20\x49\x6e\x63\x6f\x72\x70\x6f\x72\x61\x74\x65\x64\x00\x00\x00'
        printf '\x64\x65\x73\x63\x00\x00\x00\x00\x00\x00\x00\x11\x41\x64\x6f\x62'
        printf '\x65\x20\x52\x47\x42\x20\x28\x31\x39\x39\x38\x29\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x58\x59\x5a\x20'
        printf '\x00\x00\x00\x00\x00\x00\xf3\x51\x00\x01\x00\x00\x00\x01\x16\xcc'
        printf '\x58\x59\x5a\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        printf '\x00\x00\x00\x00\x63\x75\x72\x76\x00\x00\x00\x00\x00\x00\x00\x01'
        printf '\x02\x33\x00\x00\x63\x75\x72\x76\x00\x00\x00\x00\x00\x00\x00\x01'
        printf '\x02\x33\x00\x00\x63\x75\x72\x76\x00\x00\x00\x00\x00\x00\x00\x01'
        printf '\x02\x33\x00\x00\x58\x59\x5a\x20\x00\x00\x00\x00\x00\x00\x9c\x18'
        printf '\x00\x00\x4f\xa5\x00\x00\x04\xfc\x58\x59\x5a\x20\x00\x00\x00\x00'
        printf '\x00\x00\x34\x8d\x00\x00\xa0\x2c\x00\x00\x0f\x95\x58\x59\x5a\x20'
        printf '\x00\x00\x00\x00\x00\x00\x26\x31\x00\x00\x10\x2f\x00\x00\xbe\x9c'
    } > "$1"
}


#=====# PRINT DEBUG INFO #=====#

if [ "$DEBUG" = true ]; then
    echo " DEBUG: running PDF2ARCHIVE, version $VERSION"
    echo " DEBUG: using Ghostscript binary at $(command -v gs), version $(gs --version)"
    echo " DEBUG: the input file is '$INPUT'"
    echo " DEBUG: the output file is '$OUTPUT'"
    echo " DEBUG: the intermediate processing file is $TMPFILE"
    echo " DEBUG: the temporary directory is $TMPDIR"
    echo " DEBUG: the current quality options are '$QUALITYOPTS'"
    echo " DEBUG: PDF title '$PDFTITLE'"
    echo " DEBUG: PDF author '$PDFAUTHOR'"
    echo " DEBUG: PDF subject '$PDFSUBJECT'"
    echo " DEBUG: PDF keywords '$PDFKEYWORDS'"
    echo " DEBUG: PDF creator '$PDFCREATOR'"
    echo " DEBUG: PDF producer '$PDFPRODUCER'"
    echo " DEBUG: PDF creation date '$PDFCREATIONDATE'"
    echo " DEBUG: PDF modification date '$PDFMODDATE'"
    echo " DEBUG: PDF trapping '$PDFTRAPPED'"
fi


#=====# CREATE THE PS DEFINITION FILE #=====#

echo " Creating the definition file..."
{
    # Every value placed inside "( ... )" goes through ps_escape.
    cat <<EOF
%!
% This prefix file for creating a PDF/A document is derived from
% the sample included with Ghostscript 9.07, released under the
% GNU Affero General Public License.
% Modified 4/15/2013 by MCB Systems.

% Feel free to modify entries marked with "Customize".

% This assumes an ICC profile to reside in the file (AdobeRGB1998.icc),
% unless the user modifies the corresponding line below.

% The color space described by the ICC profile must correspond to the
% ProcessColorModel specified when using this prefix file (GRAY with
% DeviceGray, RGB with DeviceRGB, and CMYK with DeviceCMYK).

% Define entries in the document Info dictionary :

/ICCProfile ($(ps_escape "$ICCTMPFILE")) % Customize.
def

[ /Title ($(ps_escape "$PDFTITLE")) % Customize.
EOF
    # Optional entries are written only when they have a value.
    if [ "$PDFAUTHOR" != "" ]; then
        printf ' /Author (%s)\n' "$(ps_escape "$PDFAUTHOR")"
    fi
    if [ "$PDFSUBJECT" != "" ]; then
        printf ' /Subject (%s)\n' "$(ps_escape "$PDFSUBJECT")"
    fi
    if [ "$PDFKEYWORDS" != "" ]; then
        printf ' /Keywords (%s)\n' "$(ps_escape "$PDFKEYWORDS")"
    fi
    if [ "$PDFCREATOR" != "" ]; then
        printf ' /Creator (%s)\n' "$(ps_escape "$PDFCREATOR")"
    fi
    cat <<'EOF'
% /Producer % Reserved to GS
% /CreationDate % Reserved to GS
% /ModDate % Reserved to GS
EOF
    if [ "$PDFTRAPPED" != "" ]; then
        # PDFTRAPPED is a name operator (/True or /False), not a string.
        printf ' /Trapped %s\n' "$PDFTRAPPED"
    fi
    # "/N 3" is the component count of the RGB profile; the Catalog entry
    # attaches the output intent defined just above it to the document.
    cat <<'EOF'
 /DOCINFO pdfmark

% Define an ICC profile :

[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
[{icc_PDFA} <</N 3>> /PUT pdfmark
[{icc_PDFA} ICCProfile (r) file /PUT pdfmark

% Define the output intent dictionary :

[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
[{OutputIntent_PDFA} <<
  /Type /OutputIntent % Must be so (the standard requires).
  /S /GTS_PDFA1 % Must be so (the standard requires).
  /DestOutputProfile {icc_PDFA} % Must be so (see above).
  /OutputConditionIdentifier (AdobeRGB1998) % Customize
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
EOF
} > "$PSTMPFILE"


#=====# CREATE THE COLOR PROFILE FILE #=====#

write_icc_profile "$ICCTMPFILE"


#=====# DO THE ACTUAL CONVERSION #=====#

# 0 while both Ghostscript passes succeed, 1 as soon as one of them fails.
PDFA_STATUS=0

echo " Compressing PDF & embedding fonts..."
# MSGOPTS holds several options and must be word-split, hence no quotes.
# shellcheck disable=SC2086
run gs $MSGOPTS \
    -dBATCH -dNOPAUSE -dNOOUTERSAVE \
    -dCompatibilityLevel=1.4 \
    -dEmbedAllFonts=true -dSubsetFonts=true \
    -dCompressFonts=true -dCompressPages=true \
    -sColorConversionStrategy=RGB \
    -dDownsampleMonoImages=false -dDownsampleGrayImages=false -dDownsampleColorImages=false \
    -dAutoFilterColorImages=false -dAutoFilterGrayImages=false \
    -sDEVICE=pdfwrite \
    -sOutputFile="$TMPFILE" "$INPUT" || PDFA_STATUS=1

if [ "$PDFA_STATUS" -eq 0 ]; then
    echo " Converting to PDF/A-1B..."
    # MSGOPTS and QUALITYOPTS are intentionally unquoted (may be empty).
    # shellcheck disable=SC2086
    run gs $MSGOPTS \
        -dPDFA=1 -dBATCH -dNOPAUSE -dNOOUTERSAVE \
        $QUALITYOPTS \
        -dCompatibilityLevel=1.4 -dPDFACompatibilityPolicy=1 \
        -sProcessColorModel=DeviceRGB -sColorConversionStrategy=RGB \
        -sOutputICCProfile="$ICCTMPFILE" \
        -sDEVICE=pdfwrite \
        -sOutputFile="$OUTPUT" "$TMPFILE" "$PSTMPFILE" || PDFA_STATUS=1
fi

echo " Removing temporary files..."
rm -f -- "$TMPFILE" "$PSTMPFILE" "$ICCTMPFILE" "$INFOTMPFILE"
# The three generated files live in the scratch directory; rmdir removes
# it only if nothing else is left inside.
if [ -n "$PSTMPFILE" ]; then
    rmdir -- "${PSTMPFILE%/*}" 2>/dev/null
fi

if [ "$PDFA_STATUS" -eq 0 ]; then
    echo " Done, now ESSE3 is happy! ;)"

    #=====# VALIDATE THE RESULT #=====#

    if [ "$VALIDATE" = true ]; then
        # VERAMSGOPTS is either empty or a single option.
        # shellcheck disable=SC2086
        validate $VERAMSGOPTS "$OUTPUT"
    else
        echo " Suggestion: validate the resulting PDF to be sure it's PDF/A-1B compliant."
    fi
else
    echo " ERROR: Ghostscript failed, '$OUTPUT' was not created correctly."
    echo " Run again with --debug to see the Ghostscript messages."
fi

# Keep this as the last command: its status becomes the exit status of
# the script (0 on success, 1 when a Ghostscript pass failed).
[ "$PDFA_STATUS" -eq 0 ]