diff --git a/DeDRM_plugin/ignoblepdf.py b/DeDRM_plugin/ignoblepdf.py new file mode 100644 index 0000000..02fa24d --- /dev/null +++ b/DeDRM_plugin/ignoblepdf.py @@ -0,0 +1,2179 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +from __future__ import with_statement + +# ignoblepdf.pyw, version 0.1 +# Copyright © 2009-2010 by i♥cabbages + +# Released under the terms of the GNU General Public Licence, version 3 +# + +# Modified 2010–2012 by some_updates, DiapDealer and Apprentice Alf +# Modified 2015-2017 by Apprentice Harper +# Modified 2020 by Pu D. Pud + +# Windows users: Before running this program, you must first install Python 2.7 +# from and PyCrypto from +# (make sure to +# install the version for Python 2.7). Save this script file as +# ineptpdf.pyw and double-click on it to run it. +# +# Mac OS X users: Save this script file as ineptpdf.pyw. You can run this +# program from the command line (pythonw ineptpdf.pyw) or by double-clicking +# it when it has been associated with PythonLauncher. + +# Revision history: +# 0.1 - Alpha testing release + + +""" +Decrypts Barnes & Noble encrypted PDF files. +""" + +__license__ = 'GPL v3' +__version__ = "0.1" + +import sys +import os +import re +import zlib +import struct +import hashlib +from decimal import * +from itertools import chain, islice +import xml.etree.ElementTree as etree + +# Wrap a stream so that output gets flushed immediately +# and also make sure that any unicode strings get +# encoded using "replace" before writing them. +class SafeUnbuffered: + def __init__(self, stream): + self.stream = stream + self.encoding = stream.encoding + if self.encoding == None: + self.encoding = "utf-8" + def write(self, data): + if isinstance(data,unicode): + data = data.encode(self.encoding,"replace") + self.stream.write(data) + self.stream.flush() + def __getattr__(self, attr): + return getattr(self.stream, attr) + +iswindows = sys.platform.startswith('win') +isosx = sys.platform.startswith('darwin') + +def unicode_argv(): + if iswindows: + # Uses shell32.GetCommandLineArgvW to get sys.argv as a list of Unicode + # strings. + + # Versions 2.x of Python don't support Unicode in sys.argv on + # Windows, with the underlying Windows API instead replacing multi-byte + # characters with '?'. + + + from ctypes import POINTER, byref, cdll, c_int, windll + from ctypes.wintypes import LPCWSTR, LPWSTR + + GetCommandLineW = cdll.kernel32.GetCommandLineW + GetCommandLineW.argtypes = [] + GetCommandLineW.restype = LPCWSTR + + CommandLineToArgvW = windll.shell32.CommandLineToArgvW + CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] + CommandLineToArgvW.restype = POINTER(LPWSTR) + + cmd = GetCommandLineW() + argc = c_int(0) + argv = CommandLineToArgvW(cmd, byref(argc)) + if argc.value > 0: + # Remove Python executable and commands if present + start = argc.value - len(sys.argv) + return [argv[i] for i in + xrange(start, argc.value)] + return [u"ignoblepdf.py"] + else: + argvencoding = sys.stdin.encoding + if argvencoding == None: + argvencoding = "utf-8" + return [arg if (type(arg) == unicode) else unicode(arg,argvencoding) for arg in sys.argv] + + +class IGNOBLEError(Exception): + pass + + +import hashlib + +def SHA256(message): + ctx = hashlib.sha256() + ctx.update(message) + return ctx.digest() + + +def _load_crypto_libcrypto(): + from ctypes import CDLL, POINTER, c_void_p, c_char_p, c_int, c_long, \ + Structure, c_ulong, create_string_buffer, cast + from ctypes.util import find_library + + if sys.platform.startswith('win'): + libcrypto = find_library('libeay32') + else: + libcrypto = find_library('crypto') + + if libcrypto is None: + raise IGNOBLEError('libcrypto not found') + libcrypto = CDLL(libcrypto) + + AES_MAXNR = 14 + + c_char_pp = POINTER(c_char_p) + c_int_p = POINTER(c_int) + + class AES_KEY(Structure): + _fields_ = [('rd_key', c_long * (4 * (AES_MAXNR + 1))), + ('rounds', c_int)] + AES_KEY_p = POINTER(AES_KEY) + + class RC4_KEY(Structure): + _fields_ = [('x', c_int), ('y', c_int), ('box', c_int * 256)] + RC4_KEY_p = POINTER(RC4_KEY) + + def F(restype, name, argtypes): + func = getattr(libcrypto, name) + func.restype = restype + func.argtypes = argtypes + return func + + AES_cbc_encrypt = F(None, 'AES_cbc_encrypt', + [c_char_p, c_char_p, c_ulong, AES_KEY_p, c_char_p, + c_int]) + AES_set_decrypt_key = F(c_int, 'AES_set_decrypt_key', + [c_char_p, c_int, AES_KEY_p]) + + RC4_set_key = F(None,'RC4_set_key',[RC4_KEY_p, c_int, c_char_p]) + RC4_crypt = F(None,'RC4',[RC4_KEY_p, c_int, c_char_p, c_char_p]) + + class ARC4(object): + @classmethod + def new(cls, userkey): + self = ARC4() + self._blocksize = len(userkey) + key = self._key = RC4_KEY() + RC4_set_key(key, self._blocksize, userkey) + return self + def __init__(self): + self._blocksize = 0 + self._key = None + def decrypt(self, data): + out = create_string_buffer(len(data)) + RC4_crypt(self._key, len(data), data, out) + return out.raw + + class AES(object): + MODE_CBC = 0 + @classmethod + def new(cls, userkey, mode, iv): + self = AES() + self._blocksize = len(userkey) + # mode is ignored since CBCMODE is only thing supported/used so far + self._mode = mode + if (self._blocksize != 16) and (self._blocksize != 24) and (self._blocksize != 32) : + raise IGNOBLEError('AES improper key used') + return + keyctx = self._keyctx = AES_KEY() + self._iv = iv + rv = AES_set_decrypt_key(userkey, len(userkey) * 8, keyctx) + if rv < 0: + raise IGNOBLEError('Failed to initialize AES key') + return self + def __init__(self): + self._blocksize = 0 + self._keyctx = None + self._iv = 0 + self._mode = 0 + def decrypt(self, data): + out = create_string_buffer(len(data)) + rv = AES_cbc_encrypt(data, out, len(data), self._keyctx, self._iv, 0) + if rv == 0: + raise IGNOBLEError('AES decryption failed') + return out.raw + + return (ARC4, AES) + + +def _load_crypto_pycrypto(): + from Crypto.Cipher import ARC4 as _ARC4 + from Crypto.Cipher import AES as _AES + + class ARC4(object): + @classmethod + def new(cls, userkey): + self = ARC4() + self._arc4 = _ARC4.new(userkey) + return self + def __init__(self): + self._arc4 = None + def decrypt(self, data): + return self._arc4.decrypt(data) + + class AES(object): + MODE_CBC = _AES.MODE_CBC + @classmethod + def new(cls, userkey, mode, iv): + self = AES() + self._aes = _AES.new(userkey, mode, iv) + return self + def __init__(self): + self._aes = None + def decrypt(self, data): + return self._aes.decrypt(data) + + return (ARC4, AES) + +def _load_crypto(): + ARC4 = AES = None + cryptolist = (_load_crypto_libcrypto, _load_crypto_pycrypto) + if sys.platform.startswith('win'): + cryptolist = (_load_crypto_pycrypto, _load_crypto_libcrypto) + for loader in cryptolist: + try: + ARC4, AES = loader() + break + except (ImportError, IGNOBLEError): + pass + return (ARC4, AES) +ARC4, AES = _load_crypto() + + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + + +# Do we generate cross reference streams on output? +# 0 = never +# 1 = only if present in input +# 2 = always + +GEN_XREF_STM = 1 + +# This is the value for the current document +gen_xref_stm = False # will be set in PDFSerializer + +# PDF parsing routines from pdfminer, with changes for EBX_HANDLER + +# Utilities + +def choplist(n, seq): + '''Groups every n elements of the list.''' + r = [] + for x in seq: + r.append(x) + if len(r) == n: + yield tuple(r) + r = [] + return + +def nunpack(s, default=0): + '''Unpacks up to 4 bytes big endian.''' + l = len(s) + if not l: + return default + elif l == 1: + return ord(s) + elif l == 2: + return struct.unpack('>H', s)[0] + elif l == 3: + return struct.unpack('>L', '\x00'+s)[0] + elif l == 4: + return struct.unpack('>L', s)[0] + else: + return TypeError('invalid length: %d' % l) + + +STRICT = 0 + + +# PS Exceptions + +class PSException(Exception): pass +class PSEOF(PSException): pass +class PSSyntaxError(PSException): pass +class PSTypeError(PSException): pass +class PSValueError(PSException): pass + + +# Basic PostScript Types + + +# PSLiteral +class PSObject(object): pass + +class PSLiteral(PSObject): + ''' + PS literals (e.g. "/Name"). + Caution: Never create these objects directly. + Use PSLiteralTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + + def __repr__(self): + name = [] + for char in self.name: + if not char.isalnum(): + char = '#%02x' % ord(char) + name.append(char) + return '/%s' % ''.join(name) + +# PSKeyword +class PSKeyword(PSObject): + ''' + PS keywords (e.g. "showpage"). + Caution: Never create these objects directly. + Use PSKeywordTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + + def __repr__(self): + return self.name + +# PSSymbolTable +class PSSymbolTable(object): + + ''' + Symbol table that stores PSLiteral or PSKeyword. + ''' + + def __init__(self, classe): + self.dic = {} + self.classe = classe + return + + def intern(self, name): + if name in self.dic: + lit = self.dic[name] + else: + lit = self.classe(name) + self.dic[name] = lit + return lit + +PSLiteralTable = PSSymbolTable(PSLiteral) +PSKeywordTable = PSSymbolTable(PSKeyword) +LIT = PSLiteralTable.intern +KWD = PSKeywordTable.intern +KEYWORD_BRACE_BEGIN = KWD('{') +KEYWORD_BRACE_END = KWD('}') +KEYWORD_ARRAY_BEGIN = KWD('[') +KEYWORD_ARRAY_END = KWD(']') +KEYWORD_DICT_BEGIN = KWD('<<') +KEYWORD_DICT_END = KWD('>>') + + +def literal_name(x): + if not isinstance(x, PSLiteral): + if STRICT: + raise PSTypeError('Literal required: %r' % x) + else: + return str(x) + return x.name + +def keyword_name(x): + if not isinstance(x, PSKeyword): + if STRICT: + raise PSTypeError('Keyword required: %r' % x) + else: + return str(x) + return x.name + + +## PSBaseParser +## +EOL = re.compile(r'[\r\n]') +SPC = re.compile(r'\s') +NONSPC = re.compile(r'\S') +HEX = re.compile(r'[0-9a-fA-F]') +END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]') +END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]') +HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.') +END_NUMBER = re.compile(r'[^0-9]') +END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') +END_STRING = re.compile(r'[()\134]') +OCT_STRING = re.compile(r'[0-7]') +ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } + +class PSBaseParser(object): + + ''' + Most basic PostScript parser that performs only basic tokenization. + ''' + BUFSIZ = 4096 + + def __init__(self, fp): + self.fp = fp + self.seek(0) + return + + def __repr__(self): + return '' % (self.fp, self.bufpos) + + def flush(self): + return + + def close(self): + self.flush() + return + + def tell(self): + return self.bufpos+self.charpos + + def poll(self, pos=None, n=80): + pos0 = self.fp.tell() + if not pos: + pos = self.bufpos+self.charpos + self.fp.seek(pos) + ##print >>sys.stderr, 'poll(%d): %r' % (pos, self.fp.read(n)) + self.fp.seek(pos0) + return + + def seek(self, pos): + ''' + Seeks the parser to the given position. + ''' + self.fp.seek(pos) + # reset the status for nextline() + self.bufpos = pos + self.buf = '' + self.charpos = 0 + # reset the status for nexttoken() + self.parse1 = self.parse_main + self.tokens = [] + return + + def fillbuf(self): + if self.charpos < len(self.buf): return + # fetch next chunk. + self.bufpos = self.fp.tell() + self.buf = self.fp.read(self.BUFSIZ) + if not self.buf: + raise PSEOF('Unexpected EOF') + self.charpos = 0 + return + + def parse_main(self, s, i): + m = NONSPC.search(s, i) + if not m: + return (self.parse_main, len(s)) + j = m.start(0) + c = s[j] + self.tokenstart = self.bufpos+j + if c == '%': + self.token = '%' + return (self.parse_comment, j+1) + if c == '/': + self.token = '' + return (self.parse_literal, j+1) + if c in '-+' or c.isdigit(): + self.token = c + return (self.parse_number, j+1) + if c == '.': + self.token = c + return (self.parse_decimal, j+1) + if c.isalpha(): + self.token = c + return (self.parse_keyword, j+1) + if c == '(': + self.token = '' + self.paren = 1 + return (self.parse_string, j+1) + if c == '<': + self.token = '' + return (self.parse_wopen, j+1) + if c == '>': + self.token = '' + return (self.parse_wclose, j+1) + self.add_token(KWD(c)) + return (self.parse_main, j+1) + + def add_token(self, obj): + self.tokens.append((self.tokenstart, obj)) + return + + def parse_comment(self, s, i): + m = EOL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_comment, len(s)) + j = m.start(0) + self.token += s[i:j] + # We ignore comments. + #self.tokens.append(self.token) + return (self.parse_main, j) + + def parse_literal(self, s, i): + m = END_LITERAL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_literal, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '#': + self.hex = '' + return (self.parse_literal_hex, j+1) + self.add_token(LIT(self.token)) + return (self.parse_main, j) + + def parse_literal_hex(self, s, i): + c = s[i] + if HEX.match(c) and len(self.hex) < 2: + self.hex += c + return (self.parse_literal_hex, i+1) + if self.hex: + self.token += chr(int(self.hex, 16)) + return (self.parse_literal, i) + + def parse_number(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_number, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '.': + self.token += c + return (self.parse_decimal, j+1) + try: + self.add_token(int(self.token)) + except ValueError: + pass + return (self.parse_main, j) + + def parse_decimal(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_decimal, len(s)) + j = m.start(0) + self.token += s[i:j] + self.add_token(Decimal(self.token)) + return (self.parse_main, j) + + def parse_keyword(self, s, i): + m = END_KEYWORD.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_keyword, len(s)) + j = m.start(0) + self.token += s[i:j] + if self.token == 'true': + token = True + elif self.token == 'false': + token = False + else: + token = KWD(self.token) + self.add_token(token) + return (self.parse_main, j) + + def parse_string(self, s, i): + m = END_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_string, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '\\': + self.oct = '' + return (self.parse_string_1, j+1) + if c == '(': + self.paren += 1 + self.token += c + return (self.parse_string, j+1) + if c == ')': + self.paren -= 1 + if self.paren: + self.token += c + return (self.parse_string, j+1) + self.add_token(self.token) + return (self.parse_main, j+1) + def parse_string_1(self, s, i): + c = s[i] + if OCT_STRING.match(c) and len(self.oct) < 3: + self.oct += c + return (self.parse_string_1, i+1) + if self.oct: + self.token += chr(int(self.oct, 8)) + return (self.parse_string, i) + if c in ESC_STRING: + self.token += chr(ESC_STRING[c]) + return (self.parse_string, i+1) + + def parse_wopen(self, s, i): + c = s[i] + if c.isspace() or HEX.match(c): + return (self.parse_hexstring, i) + if c == '<': + self.add_token(KEYWORD_DICT_BEGIN) + i += 1 + return (self.parse_main, i) + + def parse_wclose(self, s, i): + c = s[i] + if c == '>': + self.add_token(KEYWORD_DICT_END) + i += 1 + return (self.parse_main, i) + + def parse_hexstring(self, s, i): + m = END_HEX_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_hexstring, len(s)) + j = m.start(0) + self.token += s[i:j] + token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), + SPC.sub('', self.token)) + self.add_token(token) + return (self.parse_main, j) + + def nexttoken(self): + while not self.tokens: + self.fillbuf() + (self.parse1, self.charpos) = self.parse1(self.buf, self.charpos) + token = self.tokens.pop(0) + return token + + def nextline(self): + ''' + Fetches a next line that ends either with \\r or \\n. + ''' + linebuf = '' + linepos = self.bufpos + self.charpos + eol = False + while 1: + self.fillbuf() + if eol: + c = self.buf[self.charpos] + # handle '\r\n' + if c == '\n': + linebuf += c + self.charpos += 1 + break + m = EOL.search(self.buf, self.charpos) + if m: + linebuf += self.buf[self.charpos:m.end(0)] + self.charpos = m.end(0) + if linebuf[-1] == '\r': + eol = True + else: + break + else: + linebuf += self.buf[self.charpos:] + self.charpos = len(self.buf) + return (linepos, linebuf) + + def revreadlines(self): + ''' + Fetches a next line backword. This is used to locate + the trailers at the end of a file. + ''' + self.fp.seek(0, 2) + pos = self.fp.tell() + buf = '' + while 0 < pos: + prevpos = pos + pos = max(0, pos-self.BUFSIZ) + self.fp.seek(pos) + s = self.fp.read(prevpos-pos) + if not s: break + while 1: + n = max(s.rfind('\r'), s.rfind('\n')) + if n == -1: + buf = s + buf + break + yield s[n:]+buf + s = s[:n] + buf = '' + return + + +## PSStackParser +## +class PSStackParser(PSBaseParser): + + def __init__(self, fp): + PSBaseParser.__init__(self, fp) + self.reset() + return + + def reset(self): + self.context = [] + self.curtype = None + self.curstack = [] + self.results = [] + return + + def seek(self, pos): + PSBaseParser.seek(self, pos) + self.reset() + return + + def push(self, *objs): + self.curstack.extend(objs) + return + def pop(self, n): + objs = self.curstack[-n:] + self.curstack[-n:] = [] + return objs + def popall(self): + objs = self.curstack + self.curstack = [] + return objs + def add_results(self, *objs): + self.results.extend(objs) + return + + def start_type(self, pos, type): + self.context.append((pos, self.curtype, self.curstack)) + (self.curtype, self.curstack) = (type, []) + return + def end_type(self, type): + if self.curtype != type: + raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) + objs = [ obj for (_,obj) in self.curstack ] + (pos, self.curtype, self.curstack) = self.context.pop() + return (pos, objs) + + def do_keyword(self, pos, token): + return + + def nextobject(self, direct=False): + ''' + Yields a list of objects: keywords, literals, strings, + numbers, arrays and dictionaries. Arrays and dictionaries + are represented as Python sequence and dictionaries. + ''' + while not self.results: + (pos, token) = self.nexttoken() + ##print (pos,token), (self.curtype, self.curstack) + if (isinstance(token, int) or + isinstance(token, Decimal) or + isinstance(token, bool) or + isinstance(token, str) or + isinstance(token, PSLiteral)): + # normal token + self.push((pos, token)) + elif token == KEYWORD_ARRAY_BEGIN: + # begin array + self.start_type(pos, 'a') + elif token == KEYWORD_ARRAY_END: + # end array + try: + self.push(self.end_type('a')) + except PSTypeError: + if STRICT: raise + elif token == KEYWORD_DICT_BEGIN: + # begin dictionary + self.start_type(pos, 'd') + elif token == KEYWORD_DICT_END: + # end dictionary + try: + (pos, objs) = self.end_type('d') + if len(objs) % 2 != 0: + print "Incomplete dictionary construct" + objs.append("") # this isn't necessary. + # temporary fix. is this due to rental books? + # raise PSSyntaxError( + # 'Invalid dictionary construct: %r' % objs) + d = dict((literal_name(k), v) \ + for (k,v) in choplist(2, objs)) + self.push((pos, d)) + except PSTypeError: + if STRICT: raise + else: + self.do_keyword(pos, token) + if self.context: + continue + else: + if direct: + return self.pop(1)[0] + self.flush() + obj = self.results.pop(0) + return obj + + +LITERAL_CRYPT = PSLiteralTable.intern('Crypt') +LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) +LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) +LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) + + +## PDF Objects +## +class PDFObject(PSObject): pass + +class PDFException(PSException): pass +class PDFTypeError(PDFException): pass +class PDFValueError(PDFException): pass +class PDFNotImplementedError(PSException): pass + + +## PDFObjRef +## +class PDFObjRef(PDFObject): + + def __init__(self, doc, objid, genno): + if objid == 0: + if STRICT: + raise PDFValueError('PDF object id cannot be 0.') + self.doc = doc + self.objid = objid + self.genno = genno + return + + def __repr__(self): + return '' % (self.objid, self.genno) + + def resolve(self): + return self.doc.getobj(self.objid) + + +# resolve +def resolve1(x): + ''' + Resolve an object. If this is an array or dictionary, + it may still contains some indirect objects inside. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + return x + +def resolve_all(x): + ''' + Recursively resolve X and all the internals. + Make sure there is no indirect reference within the nested object. + This procedure might be slow. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + if isinstance(x, list): + x = [ resolve_all(v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = resolve_all(v) + return x + +def decipher_all(decipher, objid, genno, x): + ''' + Recursively decipher X. + ''' + if isinstance(x, str): + return decipher(objid, genno, x) + decf = lambda v: decipher_all(decipher, objid, genno, v) + if isinstance(x, list): + x = [decf(v) for v in x] + elif isinstance(x, dict): + x = dict((k, decf(v)) for (k, v) in x.iteritems()) + return x + + +# Type cheking +def int_value(x): + x = resolve1(x) + if not isinstance(x, int): + if STRICT: + raise PDFTypeError('Integer required: %r' % x) + return 0 + return x + +def decimal_value(x): + x = resolve1(x) + if not isinstance(x, Decimal): + if STRICT: + raise PDFTypeError('Decimal required: %r' % x) + return 0.0 + return x + +def num_value(x): + x = resolve1(x) + if not (isinstance(x, int) or isinstance(x, Decimal)): + if STRICT: + raise PDFTypeError('Int or Float required: %r' % x) + return 0 + return x + +def str_value(x): + x = resolve1(x) + if not isinstance(x, str): + if STRICT: + raise PDFTypeError('String required: %r' % x) + return '' + return x + +def list_value(x): + x = resolve1(x) + if not (isinstance(x, list) or isinstance(x, tuple)): + if STRICT: + raise PDFTypeError('List required: %r' % x) + return [] + return x + +def dict_value(x): + x = resolve1(x) + if not isinstance(x, dict): + if STRICT: + raise PDFTypeError('Dict required: %r' % x) + return {} + return x + +def stream_value(x): + x = resolve1(x) + if not isinstance(x, PDFStream): + if STRICT: + raise PDFTypeError('PDFStream required: %r' % x) + return PDFStream({}, '') + return x + +# ascii85decode(data) +def ascii85decode(data): + n = b = 0 + out = '' + for c in data: + if '!' <= c and c <= 'u': + n += 1 + b = b*85+(ord(c)-33) + if n == 5: + out += struct.pack('>L',b) + n = b = 0 + elif c == 'z': + assert n == 0 + out += '\0\0\0\0' + elif c == '~': + if n: + for _ in range(5-n): + b = b*85+84 + out += struct.pack('>L',b)[:n-1] + break + return out + + +## PDFStream type +class PDFStream(PDFObject): + def __init__(self, dic, rawdata, decipher=None): + length = int_value(dic.get('Length', 0)) + eol = rawdata[length:] + # quick and dirty fix for false length attribute, + # might not work if the pdf stream parser has a problem + if decipher != None and decipher.__name__ == 'decrypt_aes': + if (len(rawdata) % 16) != 0: + cutdiv = len(rawdata) // 16 + rawdata = rawdata[:16*cutdiv] + else: + if eol in ('\r', '\n', '\r\n'): + rawdata = rawdata[:length] + + self.dic = dic + self.rawdata = rawdata + self.decipher = decipher + self.data = None + self.decdata = None + self.objid = None + self.genno = None + return + + def set_objid(self, objid, genno): + self.objid = objid + self.genno = genno + return + + def __repr__(self): + if self.rawdata: + return '' % \ + (self.objid, len(self.rawdata), self.dic) + else: + return '' % \ + (self.objid, len(self.data), self.dic) + + def decode(self): + assert self.data is None and self.rawdata is not None + data = self.rawdata + if self.decipher: + # Handle encryption + data = self.decipher(self.objid, self.genno, data) + if gen_xref_stm: + self.decdata = data # keep decrypted data + if 'Filter' not in self.dic: + self.data = data + self.rawdata = None + ##print self.dict + return + filters = self.dic['Filter'] + if not isinstance(filters, list): + filters = [ filters ] + for f in filters: + if f in LITERALS_FLATE_DECODE: + # will get errors if the document is encrypted. + data = zlib.decompress(data) + elif f in LITERALS_LZW_DECODE: + data = ''.join(LZWDecoder(StringIO(data)).run()) + elif f in LITERALS_ASCII85_DECODE: + data = ascii85decode(data) + elif f == LITERAL_CRYPT: + raise PDFNotImplementedError('/Crypt filter is unsupported') + else: + raise PDFNotImplementedError('Unsupported filter: %r' % f) + # apply predictors + if 'DP' in self.dic: + params = self.dic['DP'] + else: + params = self.dic.get('DecodeParms', {}) + if 'Predictor' in params: + pred = int_value(params['Predictor']) + if pred: + if pred != 12: + raise PDFNotImplementedError( + 'Unsupported predictor: %r' % pred) + if 'Columns' not in params: + raise PDFValueError( + 'Columns undefined for predictor=12') + columns = int_value(params['Columns']) + buf = '' + ent0 = '\x00' * columns + for i in xrange(0, len(data), columns+1): + pred = data[i] + ent1 = data[i+1:i+1+columns] + if pred == '\x02': + ent1 = ''.join(chr((ord(a)+ord(b)) & 255) \ + for (a,b) in zip(ent0,ent1)) + buf += ent1 + ent0 = ent1 + data = buf + self.data = data + self.rawdata = None + return + + def get_data(self): + if self.data is None: + self.decode() + return self.data + + def get_rawdata(self): + return self.rawdata + + def get_decdata(self): + if self.decdata is not None: + return self.decdata + data = self.rawdata + if self.decipher and data: + # Handle encryption + data = self.decipher(self.objid, self.genno, data) + return data + + +## PDF Exceptions +## +class PDFSyntaxError(PDFException): pass +class PDFNoValidXRef(PDFSyntaxError): pass +class PDFEncryptionError(PDFException): pass +class PDFPasswordIncorrect(PDFEncryptionError): pass + +# some predefined literals and keywords. +LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') +LITERAL_XREF = PSLiteralTable.intern('XRef') +LITERAL_PAGE = PSLiteralTable.intern('Page') +LITERAL_PAGES = PSLiteralTable.intern('Pages') +LITERAL_CATALOG = PSLiteralTable.intern('Catalog') + + +## XRefs +## + +## PDFXRef +## +class PDFXRef(object): + + def __init__(self): + self.offsets = None + return + + def __repr__(self): + return '' % len(self.offsets) + + def objids(self): + return self.offsets.iterkeys() + + def load(self, parser): + self.offsets = {} + while 1: + try: + (pos, line) = parser.nextline() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + if not line: + raise PDFNoValidXRef('Premature eof: %r' % parser) + if line.startswith('trailer'): + parser.seek(pos) + break + f = line.strip().split(' ') + if len(f) != 2: + raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) + try: + (start, nobjs) = map(int, f) + except ValueError: + raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) + for objid in xrange(start, start+nobjs): + try: + (_, line) = parser.nextline() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + f = line.strip().split(' ') + if len(f) != 3: + raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) + (pos, genno, use) = f + if use != 'n': continue + self.offsets[objid] = (int(genno), int(pos)) + self.load_trailer(parser) + return + + KEYWORD_TRAILER = PSKeywordTable.intern('trailer') + def load_trailer(self, parser): + try: + (_,kwd) = parser.nexttoken() + assert kwd is self.KEYWORD_TRAILER + (_,dic) = parser.nextobject(direct=True) + except PSEOF: + x = parser.pop(1) + if not x: + raise PDFNoValidXRef('Unexpected EOF - file corrupted') + (_,dic) = x[0] + self.trailer = dict_value(dic) + return + + def getpos(self, objid): + try: + (genno, pos) = self.offsets[objid] + except KeyError: + raise + return (None, pos) + + +## PDFXRefStream +## +class PDFXRefStream(object): + + def __init__(self): + self.index = None + self.data = None + self.entlen = None + self.fl1 = self.fl2 = self.fl3 = None + return + + def __repr__(self): + return '' % self.index + + def objids(self): + for first, size in self.index: + for objid in xrange(first, first + size): + yield objid + + def load(self, parser, debug=0): + (_,objid) = parser.nexttoken() # ignored + (_,genno) = parser.nexttoken() # ignored + (_,kwd) = parser.nexttoken() + (_,stream) = parser.nextobject() + if not isinstance(stream, PDFStream) or \ + stream.dic['Type'] is not LITERAL_XREF: + raise PDFNoValidXRef('Invalid PDF stream spec.') + size = stream.dic['Size'] + index = stream.dic.get('Index', (0,size)) + self.index = zip(islice(index, 0, None, 2), + islice(index, 1, None, 2)) + (self.fl1, self.fl2, self.fl3) = stream.dic['W'] + self.data = stream.get_data() + self.entlen = self.fl1+self.fl2+self.fl3 + self.trailer = stream.dic + return + + def getpos(self, objid): + offset = 0 + for first, size in self.index: + if first <= objid and objid < (first + size): + break + offset += size + else: + raise KeyError(objid) + i = self.entlen * ((objid - first) + offset) + ent = self.data[i:i+self.entlen] + f1 = nunpack(ent[:self.fl1], 1) + if f1 == 1: + pos = nunpack(ent[self.fl1:self.fl1+self.fl2]) + genno = nunpack(ent[self.fl1+self.fl2:]) + return (None, pos) + elif f1 == 2: + objid = nunpack(ent[self.fl1:self.fl1+self.fl2]) + index = nunpack(ent[self.fl1+self.fl2:]) + return (objid, index) + # this is a free object + raise KeyError(objid) + + +## PDFDocument +## +## A PDFDocument object represents a PDF document. +## Since a PDF file is usually pretty big, normally it is not loaded +## at once. Rather it is parsed dynamically as processing goes. +## A PDF parser is associated with the document. +## +class PDFDocument(object): + + def __init__(self): + self.xrefs = [] + self.objs = {} + self.parsed_objs = {} + self.root = None + self.catalog = None + self.parser = None + self.encryption = None + self.decipher = None + return + + # set_parser(parser) + # Associates the document with an (already initialized) parser object. + def set_parser(self, parser): + if self.parser: return + self.parser = parser + # The document is set to be temporarily ready during collecting + # all the basic information about the document, e.g. + # the header, the encryption information, and the access rights + # for the document. + self.ready = True + # Retrieve the information of each header that was appended + # (maybe multiple times) at the end of the document. + self.xrefs = parser.read_xref() + for xref in self.xrefs: + trailer = xref.trailer + if not trailer: continue + + # If there's an encryption info, remember it. + if 'Encrypt' in trailer: + #assert not self.encryption + try: + self.encryption = (list_value(trailer['ID']), + dict_value(trailer['Encrypt'])) + # fix for bad files + except: + self.encryption = ('ffffffffffffffffffffffffffffffffffff', + dict_value(trailer['Encrypt'])) + if 'Root' in trailer: + self.set_root(dict_value(trailer['Root'])) + break + else: + raise PDFSyntaxError('No /Root object! - Is this really a PDF?') + # The document is set to be non-ready again, until all the + # proper initialization (asking the password key and + # verifying the access permission, so on) is finished. + self.ready = False + return + + # set_root(root) + # Set the Root dictionary of the document. + # Each PDF file must have exactly one /Root dictionary. + def set_root(self, root): + self.root = root + self.catalog = dict_value(self.root) + if self.catalog.get('Type') is not LITERAL_CATALOG: + if STRICT: + raise PDFSyntaxError('Catalog not found!') + return + # initialize(password='') + # Perform the initialization with a given password. + # This step is mandatory even if there's no password associated + # with the document. + def initialize(self, password=''): + if not self.encryption: + self.is_printable = self.is_modifiable = self.is_extractable = True + self.ready = True + raise PDFEncryptionError('Document is not encrypted.') + return + (docid, param) = self.encryption + type = literal_name(param['Filter']) + if type == 'Adobe.APS': + return self.initialize_adobe_ps(password, docid, param) + if type == 'Standard': + return self.initialize_standard(password, docid, param) + if type == 'EBX_HANDLER': + return self.initialize_ebx(password, docid, param) + raise PDFEncryptionError('Unknown filter: param=%r' % param) + + def initialize_adobe_ps(self, password, docid, param): + global KEYFILEPATH + self.decrypt_key = self.genkey_adobe_ps(param) + self.genkey = self.genkey_v4 + self.decipher = self.decrypt_aes + self.ready = True + return + + def genkey_adobe_ps(self, param): + # nice little offline principal keys dictionary + # global static principal key for German Onleihe / Bibliothek Digital + principalkeys = { 'bibliothek-digital.de': 'rRwGv2tbpKov1krvv7PO0ws9S436/lArPlfipz5Pqhw='.decode('base64')} + self.is_printable = self.is_modifiable = self.is_extractable = True + length = int_value(param.get('Length', 0)) / 8 + edcdata = str_value(param.get('EDCData')).decode('base64') + pdrllic = str_value(param.get('PDRLLic')).decode('base64') + pdrlpol = str_value(param.get('PDRLPol')).decode('base64') + edclist = [] + for pair in edcdata.split('\n'): + edclist.append(pair) + # principal key request + for key in principalkeys: + if key in pdrllic: + principalkey = principalkeys[key] + else: + raise IGNOBLEError('Cannot find principal key for this pdf') + shakey = SHA256(principalkey) + ivector = 16 * chr(0) + plaintext = AES.new(shakey,AES.MODE_CBC,ivector).decrypt(edclist[9].decode('base64')) + if plaintext[-16:] != 16 * chr(16): + raise IGNOBLEError('Offlinekey cannot be decrypted, aborting ...') + pdrlpol = AES.new(plaintext[16:32],AES.MODE_CBC,edclist[2].decode('base64')).decrypt(pdrlpol) + if ord(pdrlpol[-1]) < 1 or ord(pdrlpol[-1]) > 16: + raise IGNOBLEError('Could not decrypt PDRLPol, aborting ...') + else: + cutter = -1 * ord(pdrlpol[-1]) + pdrlpol = pdrlpol[:cutter] + return plaintext[:16] + + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..' \ + '\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' + # experimental aes pw support + def initialize_standard(self, password, docid, param): + # copy from a global variable + V = int_value(param.get('V', 0)) + if (V <=0 or V > 4): + raise PDFEncryptionError('Unknown algorithm: param=%r' % param) + length = int_value(param.get('Length', 40)) # Key length (bits) + O = str_value(param['O']) + R = int_value(param['R']) # Revision + if 5 <= R: + raise PDFEncryptionError('Unknown revision: %r' % R) + U = str_value(param['U']) + P = int_value(param['P']) + try: + EncMetadata = str_value(param['EncryptMetadata']) + except: + EncMetadata = 'True' + self.is_printable = bool(P & 4) + self.is_modifiable = bool(P & 8) + self.is_extractable = bool(P & 16) + self.is_annotationable = bool(P & 32) + self.is_formsenabled = bool(P & 256) + self.is_textextractable = bool(P & 512) + self.is_assemblable = bool(P & 1024) + self.is_formprintable = bool(P & 2048) + # Algorithm 3.2 + password = (password+self.PASSWORD_PADDING)[:32] # 1 + hash = hashlib.md5(password) # 2 + hash.update(O) # 3 + hash.update(struct.pack('= 3: + # Algorithm 3.5 + hash = hashlib.md5(self.PASSWORD_PADDING) # 2 + hash.update(docid[0]) # 3 + x = ARC4.new(key).decrypt(hash.digest()[:16]) # 4 + for i in xrange(1,19+1): + k = ''.join( chr(ord(c) ^ i) for c in key ) + x = ARC4.new(k).decrypt(x) + u1 = x+x # 32bytes total + if R == 2: + is_authenticated = (u1 == U) + else: + is_authenticated = (u1[:16] == U[:16]) + if not is_authenticated: + raise IGNOBLEError('Password is not correct.') + self.decrypt_key = key + # genkey method + if V == 1 or V == 2: + self.genkey = self.genkey_v2 + elif V == 3: + self.genkey = self.genkey_v3 + elif V == 4: + self.genkey = self.genkey_v2 + #self.genkey = self.genkey_v3 if V == 3 else self.genkey_v2 + # rc4 + if V != 4: + self.decipher = self.decipher_rc4 # XXX may be AES + # aes + elif V == 4 and Length == 128: + elf.decipher = self.decipher_aes + elif V == 4 and Length == 256: + raise PDFNotImplementedError('AES256 encryption is currently unsupported') + self.ready = True + return + + def initialize_ebx(self, keyb64, docid, param): + self.is_printable = self.is_modifiable = self.is_extractable = True + key = keyb64.decode('base64')[:16] + aes = AES.new(key,AES.MODE_CBC,"\x00" * len(key)) + length = int_value(param.get('Length', 0)) / 8 + rights = str_value(param.get('ADEPT_LICENSE')).decode('base64') + rights = zlib.decompress(rights, -15) + rights = etree.fromstring(rights) + expr = './/{http://ns.adobe.com/adept}encryptedKey' + bookkey = ''.join(rights.findtext(expr)).decode('base64') + bookkey = aes.decrypt(bookkey) + bookkey = bookkey[:-ord(bookkey[-1])] + bookkey = bookkey[-16:] + ebx_V = int_value(param.get('V', 4)) + ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6)) + # added because of improper booktype / decryption book session key errors + if length > 0: + if len(bookkey) == length: + if ebx_V == 3: + V = 3 + else: + V = 2 + elif len(bookkey) == length + 1: + V = ord(bookkey[0]) + bookkey = bookkey[1:] + else: + print "ebx_V is %d and ebx_type is %d" % (ebx_V, ebx_type) + print "length is %d and len(bookkey) is %d" % (length, len(bookkey)) + print "bookkey[0] is %d" % ord(bookkey[0]) + raise IGNOBLEError('error decrypting book session key - mismatched length') + else: + # proper length unknown try with whatever you have + print "ebx_V is %d and ebx_type is %d" % (ebx_V, ebx_type) + print "length is %d and len(bookkey) is %d" % (length, len(bookkey)) + print "bookkey[0] is %d" % ord(bookkey[0]) + if ebx_V == 3: + V = 3 + else: + V = 2 + self.decrypt_key = bookkey + self.genkey = self.genkey_v3 if V == 3 else self.genkey_v2 + self.decipher = self.decrypt_rc4 + self.ready = True + return + + # genkey functions + def genkey_v2(self, objid, genno): + objid = struct.pack(' PDFObjStmRef.maxindex: + PDFObjStmRef.maxindex = index + + +## PDFParser +## +class PDFParser(PSStackParser): + + def __init__(self, doc, fp): + PSStackParser.__init__(self, fp) + self.doc = doc + self.doc.set_parser(self) + return + + def __repr__(self): + return '' + + KEYWORD_R = PSKeywordTable.intern('R') + KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') + KEYWORD_STREAM = PSKeywordTable.intern('stream') + KEYWORD_XREF = PSKeywordTable.intern('xref') + KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') + def do_keyword(self, pos, token): + if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): + self.add_results(*self.pop(1)) + return + if token is self.KEYWORD_ENDOBJ: + self.add_results(*self.pop(4)) + return + + if token is self.KEYWORD_R: + # reference to indirect object + try: + ((_,objid), (_,genno)) = self.pop(2) + (objid, genno) = (int(objid), int(genno)) + obj = PDFObjRef(self.doc, objid, genno) + self.push((pos, obj)) + except PSSyntaxError: + pass + return + + if token is self.KEYWORD_STREAM: + # stream object + ((_,dic),) = self.pop(1) + dic = dict_value(dic) + try: + objlen = int_value(dic['Length']) + except KeyError: + if STRICT: + raise PDFSyntaxError('/Length is undefined: %r' % dic) + objlen = 0 + self.seek(pos) + try: + (_, line) = self.nextline() # 'stream' + except PSEOF: + if STRICT: + raise PDFSyntaxError('Unexpected EOF') + return + pos += len(line) + self.fp.seek(pos) + data = self.fp.read(objlen) + self.seek(pos+objlen) + while 1: + try: + (linepos, line) = self.nextline() + except PSEOF: + if STRICT: + raise PDFSyntaxError('Unexpected EOF') + break + if 'endstream' in line: + i = line.index('endstream') + objlen += i + data += line[:i] + break + objlen += len(line) + data += line + self.seek(pos+objlen) + obj = PDFStream(dic, data, self.doc.decipher) + self.push((pos, obj)) + return + + # others + self.push((pos, token)) + return + + def find_xref(self): + # search the last xref table by scanning the file backwards. + prev = None + for line in self.revreadlines(): + line = line.strip() + if line == 'startxref': break + if line: + prev = line + else: + raise PDFNoValidXRef('Unexpected EOF') + return int(prev) + + # read xref table + def read_xref_from(self, start, xrefs): + self.seek(start) + self.reset() + try: + (pos, token) = self.nexttoken() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF') + if isinstance(token, int): + # XRefStream: PDF-1.5 + if GEN_XREF_STM == 1: + global gen_xref_stm + gen_xref_stm = True + self.seek(pos) + self.reset() + xref = PDFXRefStream() + xref.load(self) + else: + if token is not self.KEYWORD_XREF: + raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % + (pos, token)) + self.nextline() + xref = PDFXRef() + xref.load(self) + xrefs.append(xref) + trailer = xref.trailer + if 'XRefStm' in trailer: + pos = int_value(trailer['XRefStm']) + self.read_xref_from(pos, xrefs) + if 'Prev' in trailer: + # find previous xref + pos = int_value(trailer['Prev']) + self.read_xref_from(pos, xrefs) + return + + # read xref tables and trailers + def read_xref(self): + xrefs = [] + trailerpos = None + try: + pos = self.find_xref() + self.read_xref_from(pos, xrefs) + except PDFNoValidXRef: + # fallback + self.seek(0) + pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') + offsets = {} + xref = PDFXRef() + while 1: + try: + (pos, line) = self.nextline() + except PSEOF: + break + if line.startswith('trailer'): + trailerpos = pos # remember last trailer + m = pat.match(line) + if not m: continue + (objid, genno) = m.groups() + offsets[int(objid)] = (0, pos) + if not offsets: raise + xref.offsets = offsets + if trailerpos: + self.seek(trailerpos) + xref.load_trailer(self) + xrefs.append(xref) + return xrefs + +## PDFObjStrmParser +## +class PDFObjStrmParser(PDFParser): + + def __init__(self, data, doc): + PSStackParser.__init__(self, StringIO(data)) + self.doc = doc + return + + def flush(self): + self.add_results(*self.popall()) + return + + KEYWORD_R = KWD('R') + def do_keyword(self, pos, token): + if token is self.KEYWORD_R: + # reference to indirect object + try: + ((_,objid), (_,genno)) = self.pop(2) + (objid, genno) = (int(objid), int(genno)) + obj = PDFObjRef(self.doc, objid, genno) + self.push((pos, obj)) + except PSSyntaxError: + pass + return + # others + self.push((pos, token)) + return + +### +### My own code, for which there is none else to blame + +class PDFSerializer(object): + def __init__(self, inf, userkey): + global GEN_XREF_STM, gen_xref_stm + gen_xref_stm = GEN_XREF_STM > 1 + self.version = inf.read(8) + inf.seek(0) + self.doc = doc = PDFDocument() + parser = PDFParser(doc, inf) + doc.initialize(userkey) + self.objids = objids = set() + for xref in reversed(doc.xrefs): + trailer = xref.trailer + for objid in xref.objids(): + objids.add(objid) + trailer = dict(trailer) + trailer.pop('Prev', None) + trailer.pop('XRefStm', None) + if 'Encrypt' in trailer: + objids.remove(trailer.pop('Encrypt').objid) + self.trailer = trailer + + def dump(self, outf): + self.outf = outf + self.write(self.version) + self.write('\n%\xe2\xe3\xcf\xd3\n') + doc = self.doc + objids = self.objids + xrefs = {} + maxobj = max(objids) + trailer = dict(self.trailer) + trailer['Size'] = maxobj + 1 + for objid in objids: + obj = doc.getobj(objid) + if isinstance(obj, PDFObjStmRef): + xrefs[objid] = obj + continue + if obj is not None: + try: + genno = obj.genno + except AttributeError: + genno = 0 + xrefs[objid] = (self.tell(), genno) + self.serialize_indirect(objid, obj) + startxref = self.tell() + + if not gen_xref_stm: + self.write('xref\n') + self.write('0 %d\n' % (maxobj + 1,)) + for objid in xrange(0, maxobj + 1): + if objid in xrefs: + # force the genno to be 0 + self.write("%010d 00000 n \n" % xrefs[objid][0]) + else: + self.write("%010d %05d f \n" % (0, 65535)) + + self.write('trailer\n') + self.serialize_object(trailer) + self.write('\nstartxref\n%d\n%%%%EOF' % startxref) + + else: # Generate crossref stream. + + # Calculate size of entries + maxoffset = max(startxref, maxobj) + maxindex = PDFObjStmRef.maxindex + fl2 = 2 + power = 65536 + while maxoffset >= power: + fl2 += 1 + power *= 256 + fl3 = 1 + power = 256 + while maxindex >= power: + fl3 += 1 + power *= 256 + + index = [] + first = None + prev = None + data = [] + # Put the xrefstream's reference in itself + startxref = self.tell() + maxobj += 1 + xrefs[maxobj] = (startxref, 0) + for objid in sorted(xrefs): + if first is None: + first = objid + elif objid != prev + 1: + index.extend((first, prev - first + 1)) + first = objid + prev = objid + objref = xrefs[objid] + if isinstance(objref, PDFObjStmRef): + f1 = 2 + f2 = objref.stmid + f3 = objref.index + else: + f1 = 1 + f2 = objref[0] + # we force all generation numbers to be 0 + # f3 = objref[1] + f3 = 0 + + data.append(struct.pack('>B', f1)) + data.append(struct.pack('>L', f2)[-fl2:]) + data.append(struct.pack('>L', f3)[-fl3:]) + index.extend((first, prev - first + 1)) + data = zlib.compress(''.join(data)) + dic = {'Type': LITERAL_XREF, 'Size': prev + 1, 'Index': index, + 'W': [1, fl2, fl3], 'Length': len(data), + 'Filter': LITERALS_FLATE_DECODE[0], + 'Root': trailer['Root'],} + if 'Info' in trailer: + dic['Info'] = trailer['Info'] + xrefstm = PDFStream(dic, data) + self.serialize_indirect(maxobj, xrefstm) + self.write('startxref\n%d\n%%%%EOF' % startxref) + def write(self, data): + self.outf.write(data) + self.last = data[-1:] + + def tell(self): + return self.outf.tell() + + def escape_string(self, string): + string = string.replace('\\', '\\\\') + string = string.replace('\n', r'\n') + string = string.replace('(', r'\(') + string = string.replace(')', r'\)') + # get rid of ciando id + regularexp = re.compile(r'http://www.ciando.com/index.cfm/intRefererID/\d{5}') + if regularexp.match(string): return ('http://www.ciando.com') + return string + + def serialize_object(self, obj): + if isinstance(obj, dict): + # Correct malformed Mac OS resource forks for Stanza + if 'ResFork' in obj and 'Type' in obj and 'Subtype' not in obj \ + and isinstance(obj['Type'], int): + obj['Subtype'] = obj['Type'] + del obj['Type'] + # end - hope this doesn't have bad effects + self.write('<<') + for key, val in obj.items(): + self.write('/%s' % key) + self.serialize_object(val) + self.write('>>') + elif isinstance(obj, list): + self.write('[') + for val in obj: + self.serialize_object(val) + self.write(']') + elif isinstance(obj, str): + self.write('(%s)' % self.escape_string(obj)) + elif isinstance(obj, bool): + if self.last.isalnum(): + self.write(' ') + self.write(str(obj).lower()) + elif isinstance(obj, (int, long)): + if self.last.isalnum(): + self.write(' ') + self.write(str(obj)) + elif isinstance(obj, Decimal): + if self.last.isalnum(): + self.write(' ') + self.write(str(obj)) + elif isinstance(obj, PDFObjRef): + if self.last.isalnum(): + self.write(' ') + self.write('%d %d R' % (obj.objid, 0)) + elif isinstance(obj, PDFStream): + ### If we don't generate cross ref streams the object streams + ### are no longer useful, as we have extracted all objects from + ### them. Therefore leave them out from the output. + if obj.dic.get('Type') == LITERAL_OBJSTM and not gen_xref_stm: + self.write('(deleted)') + else: + data = obj.get_decdata() + self.serialize_object(obj.dic) + self.write('stream\n') + self.write(data) + self.write('\nendstream') + else: + data = str(obj) + if data[0].isalnum() and self.last.isalnum(): + self.write(' ') + self.write(data) + + def serialize_indirect(self, objid, obj): + self.write('%d 0 obj' % (objid,)) + self.serialize_object(obj) + if self.last.isalnum(): + self.write('\n') + self.write('endobj\n') + + + + +def decryptBook(userkey, inpath, outpath): + if AES is None: + raise IGNOBLEError(u"PyCrypto or OpenSSL must be installed.") + with open(inpath, 'rb') as inf: + #try: + serializer = PDFSerializer(inf, userkey) + #except: + # print u"Error serializing pdf {0}. Probably wrong key.".format(os.path.basename(inpath)) + # return 2 + # hope this will fix the 'bad file descriptor' problem + with open(outpath, 'wb') as outf: + # help construct to make sure the method runs to the end + try: + serializer.dump(outf) + except Exception, e: + print u"error writing pdf: {0}".format(e.args[0]) + return 2 + return 0 + + +def cli_main(): + sys.stdout=SafeUnbuffered(sys.stdout) + sys.stderr=SafeUnbuffered(sys.stderr) + argv=unicode_argv() + progname = os.path.basename(argv[0]) + if len(argv) != 4: + print u"usage: {0} ".format(progname) + return 1 + keypath, inpath, outpath = argv[1:] + userkey = open(keypath,'rb').read() + result = decryptBook(userkey, inpath, outpath) + if result == 0: + print u"Successfully decrypted {0:s} as {1:s}".format(os.path.basename(inpath),os.path.basename(outpath)) + return result + + +def gui_main(): + try: + import Tkinter + import Tkconstants + import tkFileDialog + import tkMessageBox + import traceback + except: + return cli_main() + + class DecryptionDialog(Tkinter.Frame): + def __init__(self, root): + Tkinter.Frame.__init__(self, root, border=5) + self.status = Tkinter.Label(self, text=u"Select files for decryption") + self.status.pack(fill=Tkconstants.X, expand=1) + body = Tkinter.Frame(self) + body.pack(fill=Tkconstants.X, expand=1) + sticky = Tkconstants.E + Tkconstants.W + body.grid_columnconfigure(1, weight=2) + Tkinter.Label(body, text=u"Key file").grid(row=0) + self.keypath = Tkinter.Entry(body, width=30) + self.keypath.grid(row=0, column=1, sticky=sticky) + if os.path.exists(u"bnpdfkey.b64"): + self.keypath.insert(0, u"bnpdfkey.b64") + button = Tkinter.Button(body, text=u"...", command=self.get_keypath) + button.grid(row=0, column=2) + Tkinter.Label(body, text=u"Input file").grid(row=1) + self.inpath = Tkinter.Entry(body, width=30) + self.inpath.grid(row=1, column=1, sticky=sticky) + button = Tkinter.Button(body, text=u"...", command=self.get_inpath) + button.grid(row=1, column=2) + Tkinter.Label(body, text=u"Output file").grid(row=2) + self.outpath = Tkinter.Entry(body, width=30) + self.outpath.grid(row=2, column=1, sticky=sticky) + button = Tkinter.Button(body, text=u"...", command=self.get_outpath) + button.grid(row=2, column=2) + buttons = Tkinter.Frame(self) + buttons.pack() + botton = Tkinter.Button( + buttons, text=u"Decrypt", width=10, command=self.decrypt) + botton.pack(side=Tkconstants.LEFT) + Tkinter.Frame(buttons, width=10).pack(side=Tkconstants.LEFT) + button = Tkinter.Button( + buttons, text=u"Quit", width=10, command=self.quit) + button.pack(side=Tkconstants.RIGHT) + + def get_keypath(self): + keypath = tkFileDialog.askopenfilename( + parent=None, title=u"Select Barnes & Noble \'.b64\' key file", + defaultextension=u".b64", + filetypes=[('base64-encoded files', '.b64'), + ('All Files', '.*')]) + if keypath: + keypath = os.path.normpath(keypath) + self.keypath.delete(0, Tkconstants.END) + self.keypath.insert(0, keypath) + return + + def get_inpath(self): + inpath = tkFileDialog.askopenfilename( + parent=None, title=u"Select B&N-encrypted PDF file to decrypt", + defaultextension=u".pdf", filetypes=[('PDF files', '.pdf')]) + if inpath: + inpath = os.path.normpath(inpath) + self.inpath.delete(0, Tkconstants.END) + self.inpath.insert(0, inpath) + return + + def get_outpath(self): + outpath = tkFileDialog.asksaveasfilename( + parent=None, title=u"Select unencrypted PDF file to produce", + defaultextension=u".pdf", filetypes=[('PDF files', '.pdf')]) + if outpath: + outpath = os.path.normpath(outpath) + self.outpath.delete(0, Tkconstants.END) + self.outpath.insert(0, outpath) + return + + def decrypt(self): + keypath = self.keypath.get() + inpath = self.inpath.get() + outpath = self.outpath.get() + if not keypath or not os.path.exists(keypath): + self.status['text'] = u"Specified key file does not exist" + return + if not inpath or not os.path.exists(inpath): + self.status['text'] = u"Specified input file does not exist" + return + if not outpath: + self.status['text'] = u"Output file not specified" + return + if inpath == outpath: + self.status['text'] = u"Must have different input and output files" + return + userkey = open(keypath,'rb').read() + self.status['text'] = u"Decrypting..." + try: + decrypt_status = decryptBook(userkey, inpath, outpath) + except Exception, e: + self.status['text'] = u"Error; {0}".format(e.args[0]) + return + if decrypt_status == 0: + self.status['text'] = u"File successfully decrypted" + else: + self.status['text'] = u"The was an error decrypting the file." + + + root = Tkinter.Tk() + if AES is None: + root.withdraw() + tkMessageBox.showerror( + "IGNOBLE PDF", + "This script requires OpenSSL or PyCrypto, which must be installed " + "separately. Read the top-of-script comment for details.") + return 1 + root.title(u"Barnes & Noble PDF Decrypter v.{0}".format(__version__)) + root.resizable(True, False) + root.minsize(370, 0) + DecryptionDialog(root).pack(fill=Tkconstants.X, expand=1) + root.mainloop() + return 0 + + +if __name__ == '__main__': + if len(sys.argv) > 1: + sys.exit(cli_main()) + sys.exit(gui_main())