DeDRM_tools/DeDRM_plugin/ineptpdf.py

2500 lines
83 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2009-02-26 16:42:22 +06:00
# ineptpdf.py
# Copyright © 2009-2020 by i♥cabbages, Apprentice Harper et al.
# Copyright © 2021-2022 by noDRM et al.
2010-12-01 13:16:54 +06:00
2013-10-03 00:59:40 +06:00
# Released under the terms of the GNU General Public Licence, version 3
# <http://www.gnu.org/licenses/>
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
# Revision history:
# 1 - Initial release
# 2 - Improved determination of key-generation algorithm
# 3 - Correctly handle PDF >=1.5 cross-reference streams
# 4 - Removal of ciando's personal ID
# 5 - Automated decryption of a complete directory
# 6.1 - backward compatibility for 1.7.1 and old adeptkey.der
# 7 - Get cross reference streams and object streams working for input.
# Not yet supported on output but this only affects file size,
# not functionality. (anon2)
# 7.1 - Correct a problem when an old trailer is not followed by startxref
# 7.2 - Correct malformed Mac OS resource forks for Stanza (anon2)
# - Support for cross ref streams on output (decreases file size)
# 7.3 - Correct bug in trailer with cross ref stream that caused the error
# "The root object is missing or invalid" in Adobe Reader. (anon2)
# 7.4 - Force all generation numbers in output file to be 0, like in v6.
# Fallback code for wrong xref improved (search till last trailer
# instead of first) (anon2)
# 7.5 - allow support for OpenSSL to replace pycrypto on all platforms
# implemented ARC4 interface to OpenSSL
# fixed minor typos
# 7.6 - backported AES and other fixes from version 8.4.48
# 7.7 - On Windows try PyCrypto first and OpenSSL next
# 7.8 - Modify interface to allow use of import
# 7.9 - Bug fix for some session key errors when len(bookkey) > length required
# 7.10 - Various tweaks to fix minor problems.
# 7.11 - More tweaks to fix minor problems.
# 7.12 - Revised to allow use in calibre plugins to eliminate need for duplicate code
# 7.13 - Fixed erroneous mentions of ineptepub
# 7.14 - moved unicode_argv call inside main for Windows DeDRM compatibility
# 8.0 - Work if TkInter is missing
# 8.0.1 - Broken Metadata fix.
# 8.0.2 - Add additional check on DER file sanity
2016-01-14 23:15:43 +06:00
# 8.0.3 - Remove erroneous check on DER file sanity
2016-01-15 12:30:54 +06:00
# 8.0.4 - Completely remove erroneous check on DER file sanity
# 8.0.5 - Do not process DRM-free documents
# 8.0.6 - Replace use of float by Decimal for greater precision, and import tkFileDialog
# 9.0.0 - Add Python 3 compatibility for calibre 5
# 9.1.0 - Support for decrypting with owner password, support for V=5, R=5 and R=6 PDF files, support for AES256-encrypted PDFs.
# 9.1.1 - Only support PyCryptodome; clean up the code
# 10.0.0 - Add support for "hardened" Adobe DRM (RMSDK >= 10)
# 10.0.2 - Fix some Python2 stuff
2022-08-06 19:29:05 +06:00
# 10.0.4 - Fix more Python2 stuff
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
"""
Decrypts Adobe ADEPT-encrypted PDF files.
"""
__license__ = 'GPL v3'
2022-08-06 19:29:05 +06:00
__version__ = "10.0.4"
2013-10-03 00:59:40 +06:00
import codecs
import hashlib
2013-10-03 00:59:40 +06:00
import sys
import os
import re
import zlib
import struct
import binascii
import base64
from io import BytesIO
from decimal import Decimal
import itertools
2013-10-03 00:59:40 +06:00
import xml.etree.ElementTree as etree
import traceback
from uuid import UUID
try:
from Cryptodome.Cipher import AES, ARC4, PKCS1_v1_5
from Cryptodome.PublicKey import RSA
except ImportError:
from Crypto.Cipher import AES, ARC4, PKCS1_v1_5
from Crypto.PublicKey import RSA
2022-03-19 15:14:45 +06:00
def unpad(data, padding=16):
    '''
    Strip trailing padding from data.

    The value of the last byte is taken as the number of bytes to remove
    from the end.  The padding argument is accepted but not used.
    '''
    last = data[-1]
    # Indexing gives a 1-char str on Python 2 and an int on Python 3.
    pad_len = ord(last) if sys.version_info[0] == 2 else last
    return data[:-pad_len]
2023-08-06 19:49:52 +06:00
#@@CALIBRE_COMPAT_CODE@@
2023-08-04 00:45:06 +06:00
from .utilities import SafeUnbuffered
2023-08-06 19:49:52 +06:00
from .argv_utils import unicode_argv
# Platform flags derived from sys.platform.
iswindows = sys.platform.startswith('win')
isosx = sys.platform.startswith('darwin')
2013-10-03 00:59:40 +06:00
class ADEPTError(Exception):
    '''Error raised by the ADEPT handling code in this module.'''
class ADEPTInvalidPasswordError(Exception):
    '''Exception type for an invalid password (raise sites are outside this part of the file).'''
class ADEPTNewVersionError(Exception):
    '''Exception type for an unsupported newer version (raise sites are outside this part of the file).'''
2013-10-03 00:59:40 +06:00
def SHA256(message):
    '''Return the raw (binary) SHA-256 digest of message.'''
    hasher = hashlib.sha256()
    hasher.update(message)
    return hasher.digest()
2013-10-03 00:59:40 +06:00
# Do we generate cross reference streams on output?
# 0 = never
# 1 = only if present in input
# 2 = always
2022-08-07 19:58:01 +06:00
GEN_XREF_STM = 1
2013-10-03 00:59:40 +06:00
# This is the value for the current document
gen_xref_stm = False # will be set in PDFSerializer
# PDF parsing routines from pdfminer, with changes for EBX_HANDLER
# Utilities
def choplist(n, seq):
    '''
    Yield consecutive n-element tuples taken from seq.

    A trailing group with fewer than n elements is dropped.
    '''
    chunk = ()
    for item in seq:
        chunk += (item,)
        if len(chunk) == n:
            yield chunk
            chunk = ()
def nunpack(s, default=0):
    '''
    Unpacks up to 4 bytes big endian.

    s is a byte string of length 0..4.  An empty string yields default.
    Raises TypeError when s is longer than 4 bytes (the previous code
    returned the exception object instead of raising it).
    '''
    length = len(s)
    if not length:
        return default
    if length > 4:
        raise TypeError('invalid length: %d' % length)
    # Left-pad with zero bytes so every width is handled by one 32-bit unpack.
    return struct.unpack('>L', b'\x00' * (4 - length) + s)[0]
# When truthy, the type-checking helpers below raise on a mismatch instead of
# returning a fallback value.
STRICT = 0
# PS Exceptions
class PSException(Exception):
    '''Base class for the PostScript-level errors defined in this module.'''


class PSEOF(PSException):
    '''Raised when the input ends while more data is required.'''


class PSSyntaxError(PSException):
    '''PostScript syntax error.'''


class PSTypeError(PSException):
    '''Raised when an object does not have the required type.'''


class PSValueError(PSException):
    '''PostScript value error.'''
# Basic PostScript Types
# PSLiteral
class PSObject(object):
    '''Common base class for the PostScript object types.'''


class PSLiteral(PSObject):
    '''
    PS literals (e.g. "/Name").
    Caution: Never create these objects directly.
    Use PSLiteralTable.intern() instead.
    '''

    def __init__(self, name):
        # name arrives as a byte string; it is stored decoded.
        self.name = name.decode('utf-8')

    def __repr__(self):
        # Every non-alphanumeric character is written as '#' + two hex digits.
        escaped = [ch if ch.isalnum() else '#%02x' % ord(ch)
                   for ch in self.name]
        return '/%s' % ''.join(escaped)
2013-10-03 00:59:40 +06:00
# PSKeyword
class PSKeyword(PSObject):
    '''
    PS keywords (e.g. "showpage").
    Caution: Never create these objects directly.
    Use PSKeywordTable.intern() instead.
    '''

    def __init__(self, name):
        # name arrives as a byte string; it is stored decoded.
        self.name = name.decode('utf-8')

    def __repr__(self):
        return self.name
# PSSymbolTable
class PSSymbolTable(object):
    '''
    Symbol table that stores PSLiteral or PSKeyword.

    Each distinct name maps to exactly one instance of the configured
    class, so the interned objects can be compared by identity.
    '''

    def __init__(self, classe):
        self.dic = {}
        self.classe = classe

    def intern(self, name):
        '''Return the object registered for name, creating it on first use.'''
        try:
            return self.dic[name]
        except KeyError:
            created = self.classe(name)
            self.dic[name] = created
            return created
# Module-wide intern tables: one for literals, one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# Shorthands: LIT(b'Name') / KWD(b'name') return the interned object.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Interned keywords for the structural delimiters.
KEYWORD_BRACE_BEGIN = KWD(b'{')
KEYWORD_BRACE_END = KWD(b'}')
KEYWORD_ARRAY_BEGIN = KWD(b'[')
KEYWORD_ARRAY_END = KWD(b']')
KEYWORD_DICT_BEGIN = KWD(b'<<')
KEYWORD_DICT_END = KWD(b'>>')
2013-10-03 00:59:40 +06:00
def literal_name(x):
    '''
    Return the name of a PSLiteral.

    For any other object, raise PSTypeError when STRICT is set, otherwise
    fall back to str(x).
    '''
    if isinstance(x, PSLiteral):
        return x.name
    if STRICT:
        raise PSTypeError('Literal required: %r' % x)
    return str(x)
def keyword_name(x):
    '''
    Return the name of a PSKeyword.

    For any other object, raise PSTypeError when STRICT is set, otherwise
    fall back to str(x).
    '''
    if isinstance(x, PSKeyword):
        return x.name
    if STRICT:
        raise PSTypeError('Keyword required: %r' % x)
    return str(x)
## PSBaseParser
##
2021-11-16 16:09:03 +06:00
# Byte-level patterns used by the PSBaseParser state handlers.
# A carriage return or a line feed.
EOL = re.compile(br'[\r\n]')
# One whitespace byte / one non-whitespace byte.
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')
# One hexadecimal digit.
HEX = re.compile(br'[0-9a-fA-F]')
# First byte that cannot be part of a literal name.
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
# First byte that is neither whitespace nor a hex digit.
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
# Two hex digits, or else any single byte.
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
# First byte that is not a decimal digit.
END_NUMBER = re.compile(br'[^0-9]')
# First byte that cannot be part of a keyword.
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
# Bytes that need special handling inside a (...) string.
END_STRING = re.compile(br'[()\\]')
# One octal digit.
OCT_STRING = re.compile(br'[0-7]')
# Byte following a backslash -> byte value it stands for.
ESC_STRING = { b'b':8, b't':9, b'n':10, b'f':12, b'r':13, b'(':40, b')':41, b'\\':92 }
2013-10-03 00:59:40 +06:00
class EmptyArrayValue(object):
    '''Placeholder token the tokenizer emits for an empty "<>" pair.'''

    def __str__(self):
        return '<>'
2013-10-03 00:59:40 +06:00
class PSBaseParser(object):
    '''
    Most basic PostScript parser that performs only basic tokenization.

    The tokenizer is a state machine: self.parse1 holds the handler for the
    current lexical state.  Every handler takes the current buffer and an
    index into it and returns (next_handler, next_index).  Completed tokens
    are queued in self.tokens as (position, object) pairs.
    '''
    BUFSIZ = 4096

    def __init__(self, fp):
        self.fp = fp
        self.seek(0)

    def __repr__(self):
        return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)

    @staticmethod
    def _byte(value):
        '''Return the one-byte string for integer value (Python 2 and 3).'''
        if sys.version_info[0] == 2:
            return chr(value)
        return bytes([value])

    def flush(self):
        '''Hook for subclasses; does nothing here.'''
        return

    def close(self):
        self.flush()

    def tell(self):
        '''Return the absolute position of the next unread byte.'''
        return self.bufpos+self.charpos

    def poll(self, pos=None, n=80):
        # Seeks to pos and straight back; the file position is left unchanged
        # and nothing is read.  n is unused.
        pos0 = self.fp.tell()
        if not pos:
            pos = self.bufpos+self.charpos
        self.fp.seek(pos)
        self.fp.seek(pos0)

    def seek(self, pos):
        '''
        Seeks the parser to the given position.
        '''
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b''
        self.charpos = 0
        # reset the status for nexttoken()
        self.parse1 = self.parse_main
        self.tokens = []

    def fillbuf(self):
        '''
        Read the next chunk once the current buffer is exhausted.

        Raises PSEOF when no more data can be read.
        '''
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF('Unexpected EOF')
        self.charpos = 0

    def parse_main(self, s, i):
        '''Skip whitespace and dispatch on the first byte of the next token.'''
        m = NONSPC.search(s, i)
        if not m:
            return (self.parse_main, len(s))
        j = m.start(0)
        # Slicing gives a length-1 byte string on both Python 2 and 3.
        c = s[j:j+1]
        self.tokenstart = self.bufpos+j
        if c == b'%':
            self.token = c
            return (self.parse_comment, j+1)
        if c == b'/':
            self.token = b''
            return (self.parse_literal, j+1)
        if c in b'-+' or c.isdigit():
            self.token = c
            return (self.parse_number, j+1)
        if c == b'.':
            self.token = c
            return (self.parse_decimal, j+1)
        if c.isalpha():
            self.token = c
            return (self.parse_keyword, j+1)
        if c == b'(':
            self.token = b''
            self.paren = 1
            return (self.parse_string, j+1)
        if c == b'<':
            self.token = b''
            return (self.parse_wopen, j+1)
        if c == b'>':
            self.token = b''
            return (self.parse_wclose, j+1)
        # Any other byte becomes a one-character keyword.
        self.add_token(KWD(c))
        return (self.parse_main, j+1)

    def add_token(self, obj):
        '''Queue obj together with the position where its token started.'''
        self.tokens.append((self.tokenstart, obj))

    def parse_comment(self, s, i):
        m = EOL.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_comment, len(s))
        j = m.start(0)
        self.token += s[i:j]
        # We ignore comments.
        #self.tokens.append(self.token)
        return (self.parse_main, j)

    def parse_literal(self, s, i):
        m = END_LITERAL.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_literal, len(s))
        j = m.start(0)
        self.token += s[i:j]
        c = s[j:j+1]
        if c == b'#':
            # '#' starts a hex escape inside the name.
            self.hex = b''
            return (self.parse_literal_hex, j+1)
        self.add_token(LIT(self.token))
        return (self.parse_main, j)

    def parse_literal_hex(self, s, i):
        c = s[i:i+1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return (self.parse_literal_hex, i+1)
        if self.hex:
            self.token += self._byte(int(self.hex, 16))
        return (self.parse_literal, i)

    def parse_number(self, s, i):
        m = END_NUMBER.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_number, len(s))
        j = m.start(0)
        self.token += s[i:j]
        c = s[j:j+1]
        if c == b'.':
            self.token += c
            return (self.parse_decimal, j+1)
        try:
            self.add_token(int(self.token))
        except ValueError:
            # e.g. a lone sign: no token is emitted.
            pass
        return (self.parse_main, j)

    def parse_decimal(self, s, i):
        m = END_NUMBER.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_decimal, len(s))
        j = m.start(0)
        self.token += s[i:j]
        self.add_token(Decimal(self.token.decode('utf-8')))
        return (self.parse_main, j)

    def parse_keyword(self, s, i):
        m = END_KEYWORD.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_keyword, len(s))
        j = m.start(0)
        self.token += s[i:j]
        # self.token is a byte string, so it must be compared with byte
        # literals; comparing with str never matches on Python 3.
        if self.token == b'true':
            token = True
        elif self.token == b'false':
            token = False
        else:
            token = KWD(self.token)
        self.add_token(token)
        return (self.parse_main, j)

    def parse_string(self, s, i):
        m = END_STRING.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_string, len(s))
        j = m.start(0)
        self.token += s[i:j]
        c = s[j:j+1]
        if c == b'\\':
            # Must be a byte string: parse_string_1 appends bytes to it.
            self.oct = b''
            return (self.parse_string_1, j+1)
        if c == b'(':
            self.paren += 1
            self.token += c
            return (self.parse_string, j+1)
        if c == b')':
            self.paren -= 1
            if self.paren:
                # Still inside a nested pair: the ')' belongs to the string.
                self.token += c
                return (self.parse_string, j+1)
        self.add_token(self.token)
        return (self.parse_main, j+1)

    def parse_string_1(self, s, i):
        '''Handle the byte(s) following a backslash inside a string.'''
        c = s[i:i+1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return (self.parse_string_1, i+1)
        if self.oct:
            # Keep only the low 8 bits so that \400..\777 cannot overflow
            # a single byte.
            self.token += self._byte(int(self.oct, 8) & 0xFF)
            return (self.parse_string, i)
        if c in ESC_STRING:
            self.token += self._byte(ESC_STRING[c])
        return (self.parse_string, i+1)

    def parse_wopen(self, s, i):
        '''Decide what a "<" introduces: hex string, "<<" or "<>".'''
        c = s[i:i+1]
        if c.isspace() or HEX.match(c):
            return (self.parse_hexstring, i)
        if c == b'<':
            self.add_token(KEYWORD_DICT_BEGIN)
            i += 1
        if c == b'>':
            # Empty array without any contents. Why though?
            # We need to add some dummy python object that will serialize to
            # nothing, otherwise the code removes the whole array.
            self.add_token(EmptyArrayValue())
            i += 1
        return (self.parse_main, i)

    def parse_wclose(self, s, i):
        c = s[i:i+1]
        if c == b'>':
            self.add_token(KEYWORD_DICT_END)
            i += 1
        return (self.parse_main, i)

    def parse_hexstring(self, s, i):
        m = END_HEX_STRING.search(s, i)
        if not m:
            self.token += s[i:]
            return (self.parse_hexstring, len(s))
        j = m.start(0)
        self.token += s[i:j]
        # Drop whitespace, then turn each hex pair into one byte.
        token = HEX_PAIR.sub(lambda m: self._byte(int(m.group(0), 16)),
                             SPC.sub(b'', self.token))
        self.add_token(token)
        return (self.parse_main, j)

    def nexttoken(self):
        '''
        Return the next (position, token) pair.

        Raises PSEOF (from fillbuf) when the input is exhausted first.
        '''
        while not self.tokens:
            self.fillbuf()
            (self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
        token = self.tokens.pop(0)
        return token

    def nextline(self):
        '''
        Fetches a next line that ends either with \\r or \\n.

        Returns (position, line); the line keeps its end-of-line bytes.
        '''
        linebuf = b''
        linepos = self.bufpos + self.charpos
        eol = False
        while 1:
            self.fillbuf()
            if eol:
                c = self.buf[self.charpos:self.charpos+1]
                # handle '\r\n'
                if c == b'\n':
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos:m.end(0)]
                self.charpos = m.end(0)
                if linebuf.endswith(b'\r'):
                    # A '\n' may follow; look at the next byte first.
                    eol = True
                else:
                    break
            else:
                linebuf += self.buf[self.charpos:]
                self.charpos = len(self.buf)
        return (linepos, linebuf)

    def revreadlines(self):
        '''
        Fetches a next line backward. This is used to locate
        the trailers at the end of a file.
        '''
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = b''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos-pos)
            if not s:
                break
            while 1:
                n = max(s.rfind(b'\r'), s.rfind(b'\n'))
                if n == -1:
                    # No line break left in this chunk: carry it over.
                    buf = s + buf
                    break
                yield s[n:]+buf
                s = s[:n]
                buf = b''
## PSStackParser
##
class PSStackParser(PSBaseParser):
    '''
    Tokenizer extended with an operand stack.

    Tokens are collected on self.curstack; arrays and dictionaries are built
    by saving the current stack in self.context while a nested one is filled.
    '''

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        '''Discard all stack state.'''
        self.context = []
        self.curtype = None
        self.curstack = []
        self.results = []

    def seek(self, pos):
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs):
        self.curstack.extend(objs)

    def pop(self, n):
        '''Remove and return the last n entries of the current stack.'''
        taken = self.curstack[-n:]
        del self.curstack[-n:]
        return taken

    def popall(self):
        '''Remove and return every entry of the current stack.'''
        taken, self.curstack = self.curstack, []
        return taken

    def add_results(self, *objs):
        self.results.extend(objs)

    def start_type(self, pos, type):
        # Save the enclosing state, then start an empty stack for the new
        # container.
        self.context.append((pos, self.curtype, self.curstack))
        self.curtype = type
        self.curstack = []

    def end_type(self, type):
        '''
        Close the container currently being built.

        Returns (start position, list of collected objects) and restores the
        enclosing state.  Raises PSTypeError when the open container is not
        of the given type.
        '''
        if self.curtype != type:
            raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
        objs = [entry[1] for entry in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        return (pos, objs)

    def do_keyword(self, pos, token):
        '''Hook for subclasses; does nothing here.'''
        return

    def nextobject(self, direct=False):
        '''
        Yields a list of objects: keywords, literals, strings (byte arrays),
        numbers, arrays and dictionaries. Arrays and dictionaries
        are represented as Python sequence and dictionaries.
        '''
        plain_types = (int, Decimal, bool, bytearray, bytes, str, PSLiteral)
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, plain_types):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        print("Incomplete dictionary construct")
                        objs.append("") # this isn't necessary.
                        # temporary fix. is this due to rental books?
                        # raise PSSyntaxError(
                        #     'Invalid dictionary construct: %r' % objs)
                    d = dict((literal_name(key), value)
                             for (key, value) in choplist(2, objs))
                    self.push((pos, d))
                except PSTypeError:
                    if STRICT:
                        raise
            else:
                self.do_keyword(pos, token)
            if self.context:
                # Still inside an array or dictionary.
                continue
            if direct:
                return self.pop(1)[0]
            self.flush()
        return self.results.pop(0)
# Interned stream-filter names; each tuple holds a name and its short form.
LITERAL_CRYPT = LIT(b'Crypt')
LITERALS_FLATE_DECODE = (LIT(b'FlateDecode'), LIT(b'Fl'))
LITERALS_LZW_DECODE = (LIT(b'LZWDecode'), LIT(b'LZW'))
LITERALS_ASCII85_DECODE = (LIT(b'ASCII85Decode'), LIT(b'A85'))
2013-10-03 00:59:40 +06:00
## PDF Objects
##
class PDFObject(PSObject):
    '''Base class for the PDF object types.'''


class PDFException(PSException):
    '''Base class for the PDF-level errors.'''


class PDFTypeError(PDFException):
    '''Raised when a PDF object does not have the required type.'''


class PDFValueError(PDFException):
    '''Raised when a PDF object has an unacceptable value.'''


class PDFNotImplementedError(PSException):
    '''Raised for PDF features that are not supported (derives from PSException).'''
## PDFObjRef
##
class PDFObjRef(PDFObject):
    '''Indirect reference to object objid (generation genno) of doc.'''

    def __init__(self, doc, objid, genno):
        # Object id 0 is only rejected in STRICT mode.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid
        self.genno = genno

    def __repr__(self):
        return '<PDFObjRef:%d %d>' % (self.objid, self.genno)

    def resolve(self):
        '''Return whatever the owning document's getobj gives for this object id.'''
        return self.doc.getobj(self.objid)
# resolve
def resolve1(x):
    '''
    Follow indirect references until x is no longer a PDFObjRef.

    Only the top level is resolved: a returned array or dictionary may
    still hold indirect references.
    '''
    while True:
        if not isinstance(x, PDFObjRef):
            return x
        x = x.resolve()
def resolve_all(x):
    '''
    Resolve x and, recursively, everything nested inside it.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    This procedure might be slow.
    '''
    x = resolve1(x)
    if isinstance(x, list):
        return [resolve_all(item) for item in x]
    if isinstance(x, dict):
        # Only existing keys are reassigned, so the key set stays unchanged.
        for key in list(x):
            x[key] = resolve_all(x[key])
    return x
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    decipher(objid, genno, value) is applied to every string-like value
    (bytearray, bytes or str).  Lists and dictionaries are rebuilt with
    their members processed; anything else is returned unchanged.
    '''
    if isinstance(x, (bytearray, bytes, str)):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    if isinstance(x, dict):
        return dict((key, decipher_all(decipher, objid, genno, value))
                    for (key, value) in x.items())
    return x
# Type checking
def int_value(x):
    '''
    Resolve x and return it if it is an int.

    Otherwise raise PDFTypeError in STRICT mode, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, int):
        return x
    if STRICT:
        raise PDFTypeError('Integer required: %r' % x)
    return 0
def decimal_value(x):
    '''
    Resolve x and return it if it is a Decimal.

    Otherwise raise PDFTypeError in STRICT mode, or return 0.0.
    '''
    x = resolve1(x)
    if isinstance(x, Decimal):
        return x
    if STRICT:
        raise PDFTypeError('Decimal required: %r' % x)
    # NOTE(review): the fallback is a float, not a Decimal.
    return 0.0
def num_value(x):
    '''
    Resolve x and return it if it is an int or a Decimal.

    Otherwise raise PDFTypeError in STRICT mode, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, (int, Decimal)):
        return x
    if STRICT:
        raise PDFTypeError('Int or Decimal required: %r' % x)
    return 0
def str_value(x):
    '''
    Resolve x and return it if it is a bytearray, bytes or str.

    Otherwise raise PDFTypeError in STRICT mode, or return ''.
    '''
    x = resolve1(x)
    if isinstance(x, (bytearray, bytes, str)):
        return x
    if STRICT:
        raise PDFTypeError('String required: %r' % x)
    # NOTE(review): the fallback is a text string, not a byte string.
    return ''
def list_value(x):
    '''
    Resolve x and return it if it is a list or a tuple.

    Otherwise raise PDFTypeError in STRICT mode, or return a new empty list.
    '''
    x = resolve1(x)
    if isinstance(x, (list, tuple)):
        return x
    if STRICT:
        raise PDFTypeError('List required: %r' % x)
    return []
def dict_value(x):
    '''
    Resolve x and return it if it is a dict.

    Otherwise raise PDFTypeError in STRICT mode, or return a new empty dict.
    '''
    x = resolve1(x)
    if isinstance(x, dict):
        return x
    if STRICT:
        raise PDFTypeError('Dict required: %r' % x)
    return {}
def stream_value(x):
    '''
    Resolve x and return it if it is a PDFStream.

    Otherwise raise PDFTypeError in STRICT mode, or return an empty
    PDFStream.
    '''
    x = resolve1(x)
    if isinstance(x, PDFStream):
        return x
    if STRICT:
        raise PDFTypeError('PDFStream required: %r' % x)
    return PDFStream({}, '')
# ascii85decode(data)
def ascii85decode(data):
    '''
    Decode ASCII85-encoded bytes.

    Characters '!'..'u' are base-85 digits, five of which produce four
    output bytes.  'z' stands for four zero bytes and '~' ends the data,
    flushing a partial final group.  All other bytes are skipped.  Without a
    '~', an incomplete final group is dropped.

    data is a bytes-like object; returns bytes.
    '''
    chunks = []
    count = acc = 0
    # bytearray() iterates as integers on both Python 2 and 3.  Iterating
    # bytes directly yields ints on Python 3, and comparing those with byte
    # literals raised TypeError.
    for c in bytearray(data):
        if 33 <= c <= 117:              # '!' .. 'u'
            count += 1
            acc = acc*85+(c-33)
            if count == 5:
                chunks.append(struct.pack('>L', acc))
                count = acc = 0
        elif c == 122:                  # 'z'
            assert count == 0
            chunks.append(b'\0\0\0\0')
        elif c == 126:                  # '~'
            if count:
                # Pad the partial group with the highest digit, then keep
                # only the bytes that were really encoded.
                for _ in range(5-count):
                    acc = acc*85+84
                chunks.append(struct.pack('>L', acc)[:count-1])
            break
    return b''.join(chunks)
## PDFStream type
class PDFStream(PDFObject):
    '''
    A PDF stream: its dictionary plus the (possibly encrypted) payload.

    rawdata holds the bytes as read from the file until decode() has run;
    afterwards data holds the decoded bytes and rawdata is None.  decipher,
    when given, is called as decipher(objid, genno, data).
    '''

    def __init__(self, dic, rawdata, decipher=None):
        length = int_value(dic.get('Length', 0))
        eol = rawdata[length:]
        # quick and dirty fix for false length attribute,
        # might not work if the pdf stream parser has a problem
        if decipher is not None and decipher.__name__ == 'decrypt_aes':
            # Trim the payload to a whole number of 16-byte blocks.
            if (len(rawdata) % 16) != 0:
                cutdiv = len(rawdata) // 16
                rawdata = rawdata[:16*cutdiv]
        else:
            # Only a trailing line break is cut off beyond /Length.
            if eol in (b'\r', b'\n', b'\r\n'):
                rawdata = rawdata[:length]
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.decdata = None
        self.objid = None
        self.genno = None

    def set_objid(self, objid, genno):
        self.objid = objid
        self.genno = genno

    def __repr__(self):
        # Test for None rather than truthiness: an undecoded stream with
        # empty raw data has data == None, and len(None) would fail.
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % \
                   (self.objid, len(self.rawdata), self.dic)
        return '<PDFStream(%r): data=%d, %r>' % \
               (self.objid, len(self.data), self.dic)

    def decode(self):
        '''
        Decrypt and decode rawdata into data.

        Raises PDFNotImplementedError for the /Crypt filter, unknown filters
        and predictors other than 12, and PDFValueError when predictor 12 is
        requested without /Columns.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
            if gen_xref_stm:
                self.decdata = data # keep decrypted data
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
            elif f in LITERALS_LZW_DECODE:
                data = b''.join(LZWDecoder(BytesIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f == LITERAL_CRYPT:
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if 'DP' in self.dic:
                params = self.dic['DP']
            else:
                params = self.dic.get('DecodeParms', {})
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred:
                    if pred != 12:
                        raise PDFNotImplementedError(
                            'Unsupported predictor: %r' % pred)
                    if 'Columns' not in params:
                        raise PDFValueError(
                            'Columns undefined for predictor=12')
                    columns = int_value(params['Columns'])
                    buf = b''
                    ent0 = b'\x00' * columns
                    # Each row is one tag byte followed by `columns` bytes.
                    for i in range(0, len(data), columns+1):
                        pred = data[i]
                        ent1 = data[i+1:i+1+columns]
                        if sys.version_info[0] == 2:
                            if pred == '\x02':
                                ent1 = ''.join(chr((ord(a)+ord(b)) & 255) \
                                               for (a,b) in zip(ent0,ent1))
                        else:
                            if pred == 2:
                                # Tag 2: add the previous row byte-wise.
                                ent1 = b''.join(bytes([(a+b) & 255]) \
                                                for (a,b) in zip(ent0,ent1))
                        buf += ent1
                        ent0 = ent1
                    data = buf
        self.data = data
        self.rawdata = None

    def get_data(self):
        '''Return the decoded data, decoding on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        return self.rawdata

    def get_decdata(self):
        '''Return the decrypted but still filter-encoded data.'''
        if self.decdata is not None:
            return self.decdata
        data = self.rawdata
        if self.decipher and data:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        return data
## PDF Exceptions
##
class PDFSyntaxError(PDFException):
    '''Raised for structural problems in the PDF file.'''


class PDFNoValidXRef(PDFSyntaxError):
    '''Raised when a cross-reference section cannot be read.'''


class PDFEncryptionError(PDFException):
    '''Raised for problems with the document's encryption setup.'''


class PDFPasswordIncorrect(PDFEncryptionError):
    '''Encryption error subtype for an incorrect password.'''
# some predefined literals and keywords.
# Interned names compared against /Type entries.
LITERAL_OBJSTM = LIT(b'ObjStm')
LITERAL_XREF = LIT(b'XRef')
LITERAL_PAGE = LIT(b'Page')
LITERAL_PAGES = LIT(b'Pages')
LITERAL_CATALOG = LIT(b'Catalog')
2013-10-03 00:59:40 +06:00
## XRefs
##
## PDFXRef
##
class PDFXRef(object):
    '''
    Plain-text cross-reference table together with its trailer.

    After load(), offsets maps object id -> (generation number, offset)
    for every in-use entry and trailer holds the trailer dictionary.
    '''

    KEYWORD_TRAILER = KWD(b'trailer')

    def __init__(self):
        self.offsets = None

    def __repr__(self):
        return '<PDFXRef: objs=%d>' % len(self.offsets)

    def objids(self):
        return iter(self.offsets)

    def load(self, parser):
        '''
        Read subsections line by line until the 'trailer' line, then load
        the trailer.  Raises PDFNoValidXRef on any malformed input.
        '''
        self.offsets = {}
        while True:
            try:
                (linepos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith(b'trailer'):
                # Rewind so that load_trailer sees the keyword itself.
                parser.seek(linepos)
                break
            header = line.strip().split(b' ')
            if len(header) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                (start, nobjs) = map(int, header)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                entry = line.strip().split(b' ')
                if len(entry) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (offset, genno, use) = entry
                # Only entries marked 'n' are recorded.
                if use == b'n':
                    self.offsets[objid] = (int(genno.decode('utf-8')),
                                           int(offset.decode('utf-8')))
        self.load_trailer(parser)

    def load_trailer(self, parser):
        '''Parse the trailer dictionary that follows the 'trailer' keyword.'''
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is self.KEYWORD_TRAILER
            (_, dic) = parser.nextobject(direct=True)
        except PSEOF:
            # Input ended early: use what is on the parser stack, if anything.
            leftover = parser.pop(1)
            if not leftover:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = leftover[0]
        self.trailer = dict_value(dic)

    def getpos(self, objid):
        '''
        Return (None, offset) for objid.

        Raises KeyError when the object id is not in the table.
        '''
        (genno, pos) = self.offsets[objid]
        return (None, pos)
2022-09-10 15:44:55 +06:00
## PDFXRefStream
##
2013-10-03 00:59:40 +06:00
class PDFXRefStream(object):
    '''
    Cross-reference information stored in an /XRef stream.

    index is a list of (first object id, count) pairs, data the decoded
    stream, fl1/fl2/fl3 the widths of the three fields of one entry and
    entlen their sum.
    '''

    def __init__(self):
        self.index = None
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None

    def __repr__(self):
        return '<PDFXRef: objids=%s>' % self.index

    def objids(self):
        '''Yield every object id covered by the index ranges.'''
        for start, count in self.index:
            for objid in range(start, start + count):
                yield objid

    def load(self, parser, debug=0):
        '''
        Read the xref stream object at the parser's position.

        Raises PDFNoValidXRef when the object is not an /XRef stream.
        '''
        # Object id, generation number and keyword tokens are ignored.
        parser.nexttoken()
        parser.nexttoken()
        parser.nexttoken()
        (_, stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or \
           stream.dic['Type'] is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream.dic['Size']
        index = stream.dic.get('Index', (0,size))
        # Pair up the flat sequence: (first, count), (first, count), ...
        firsts = itertools.islice(index, 0, None, 2)
        counts = itertools.islice(index, 1, None, 2)
        self.index = list(zip(firsts, counts))
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic

    def getpos(self, objid):
        '''
        Look up objid.

        Returns (None, offset) for an entry of type 1 and
        (object id, index) taken from fields 2 and 3 for an entry of type 2.
        Raises KeyError for ids outside the index or any other entry type.
        '''
        preceding = 0
        for first, size in self.index:
            if first <= objid < first + size:
                break
            preceding += size
        else:
            raise KeyError(objid)
        start = self.entlen * ((objid - first) + preceding)
        ent = self.data[start:start+self.entlen]
        field2 = ent[self.fl1:self.fl1+self.fl2]
        field3 = ent[self.fl1+self.fl2:]
        # A missing type field defaults to 1.
        kind = nunpack(ent[:self.fl1], 1)
        if kind == 1:
            pos = nunpack(field2)
            genno = nunpack(field3)
            return (None, pos)
        if kind == 2:
            objid = nunpack(field2)
            index = nunpack(field3)
            return (objid, index)
        # this is a free object
        raise KeyError(objid)
## PDFDocument
##
## A PDFDocument object represents a PDF document.
## Since a PDF file is usually pretty big, normally it is not loaded
## at once. Rather it is parsed dynamically as processing goes.
## A PDF parser is associated with the document.
##
class PDFDocument(object):
def __init__(self):
self.xrefs = []
self.objs = {}
self.parsed_objs = {}
self.root = None
self.catalog = None
self.parser = None
self.encryption = None
self.decipher = None
return
# set_parser(parser)
# Associates the document with an (already initialized) parser object.
def set_parser(self, parser):
if self.parser:
return
2013-10-03 00:59:40 +06:00
self.parser = parser
# The document is set to be temporarily ready during collecting
# all the basic information about the document, e.g.
# the header, the encryption information, and the access rights
# for the document.
self.ready = True
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
self.xrefs = parser.read_xref()
for xref in self.xrefs:
trailer = xref.trailer
if not trailer: continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
try:
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
# fix for bad files
except:
self.encryption = (b'ffffffffffffffffffffffffffffffffffff',
2013-10-03 00:59:40 +06:00
dict_value(trailer['Encrypt']))
if 'Root' in trailer:
self.set_root(dict_value(trailer['Root']))
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
2013-10-03 00:59:40 +06:00
# The document is set to be non-ready again, until all the
# proper initialization (asking the password key and
# verifying the access permission, so on) is finished.
self.ready = False
return
# set_root(root)
# Set the Root dictionary of the document.
# Each PDF file must have exactly one /Root dictionary.
def set_root(self, root):
    '''
    Set the Root dictionary of the document.

    Stores root and its dictionary form as the catalog.  In STRICT mode a
    catalog whose /Type is not Catalog raises PDFSyntaxError.
    '''
    self.root = root
    self.catalog = dict_value(self.root)
    is_catalog = self.catalog.get('Type') is LITERAL_CATALOG
    if not is_catalog and STRICT:
        raise PDFSyntaxError('Catalog not found!')
# initialize(password='')
# Perform the initialization with a given password.
# This step is mandatory even if there's no password associated
# with the document.
def initialize(self, password=b'', inept=True):
    """Select and run the security handler named by the /Filter entry.

    *password* is handed to the chosen handler unchanged.  For the
    'EBX_HANDLER' filter, *inept* chooses between the inept (``True``)
    and the ignoble (``False``) initializer.

    Raises:
        PDFEncryptionError: when the document carries no encryption
            information, or when no handler matches the filter.
    """
    if not self.encryption:
        self.is_printable = self.is_modifiable = self.is_extractable = True
        self.ready = True
        raise PDFEncryptionError('Document is not encrypted.')

    docid, param = self.encryption
    handler = literal_name(param['Filter'])
    if handler == 'Adobe.APS':
        return self.initialize_adobe_ps(password, docid, param)
    if handler == 'Standard':
        return self.initialize_standard(password, docid, param)
    if handler == 'EBX_HANDLER':
        # Identity checks on purpose: any value of *inept* other than the
        # two booleans falls through to the error below.
        if inept is True:
            return self.initialize_ebx_inept(password, docid, param)
        if inept is False:
            return self.initialize_ebx_ignoble(password, docid, param)
    raise PDFEncryptionError('Unknown filter: param=%r' % param)
def initialize_and_return_filter(self):
    """Return the name of the encryption filter, or None if unencrypted.

    For an unencrypted document the permission flags are set to True and
    the document is marked ready before None is returned.
    """
    if self.encryption:
        _docid, param = self.encryption
        return literal_name(param['Filter'])
    self.is_printable = self.is_modifiable = self.is_extractable = True
    self.ready = True
    return None
2013-10-03 00:59:40 +06:00
def initialize_adobe_ps(self, password, docid, param):
    """Set up decryption for the 'Adobe.APS' filter.

    The key comes from genkey_adobe_ps(param); *password* and *docid* are
    not used.  Objects are then deciphered with decrypt_aes using the
    genkey_v4 per-object key derivation.
    """
    self.decrypt_key = self.genkey_adobe_ps(param)
    self.genkey, self.decipher = self.genkey_v4, self.decrypt_aes
    self.ready = True
    return
def genkey_adobe_ps(self, param):
    """Recover the 16-byte document key for the 'Adobe.APS' filter.

    The principal key is looked up by searching the decoded PDRLLic entry
    for a known name.  Its SHA-256 digest decrypts the offline key stored
    in the tenth line of EDCData; the first 16 bytes of that plaintext are
    returned.  PDRLPol is decrypted only to validate its padding.

    Raises:
        ADEPTError: when no principal key matches or when either
            decryption yields invalid padding.
    """
    def b64decode(value):
        # bytes.decode('base64') only exists on Python 2; codecs.decode
        # performs the same binary transform on Python 3.
        if isinstance(value, str):
            value = value.encode('ascii')
        return codecs.decode(value, 'base64')

    # nice little offline principal keys dictionary
    # global static principal key for German Onleihe / Bibliothek Digital
    principalkeys = {b'bibliothek-digital.de': codecs.decode(b'rRwGv2tbpKov1krvv7PO0ws9S436/lArPlfipz5Pqhw=','base64')}
    self.is_printable = self.is_modifiable = self.is_extractable = True

    edcdata = b64decode(str_value(param.get('EDCData')))
    pdrllic = b64decode(str_value(param.get('PDRLLic')))
    pdrlpol = b64decode(str_value(param.get('PDRLPol')))
    edclist = edcdata.split(b'\n')

    # principal key request: take the first known name found in the
    # licence, and fail only when none of them matches.
    for name, candidate in principalkeys.items():
        if name in pdrllic:
            principalkey = candidate
            break
    else:
        raise ADEPTError('Cannot find principal key for this pdf')

    shakey = SHA256(principalkey)
    ivector = b'\x00' * 16  # 16 zero bytes
    plaintext = AES.new(shakey, AES.MODE_CBC, ivector).decrypt(b64decode(edclist[9]))
    # A correctly decrypted offline key ends in one full block of padding,
    # i.e. sixteen bytes of value 0x10.
    if plaintext[-16:] != b'\x10' * 16:
        raise ADEPTError('Offlinekey cannot be decrypted, aborting ...')

    pdrlpol = AES.new(plaintext[16:32], AES.MODE_CBC, b64decode(edclist[2])).decrypt(pdrlpol)
    # Only the padding length byte is inspected; the policy itself is unused.
    if pdrlpol[-1] < 1 or pdrlpol[-1] > 16:
        raise ADEPTError('Could not decrypt PDRLPol, aborting ...')
    return plaintext[:16]
# 32-byte padding string.  check_owner_password_V4 and
# check_user_password_V4 append it to the supplied password and keep the
# first 32 bytes of the result before hashing.
PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..' \
b'\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
2013-10-03 00:59:40 +06:00
# experimental aes pw support
def check_user_password(self, password, docid, param):
    """Check *password* as the user password.

    Dispatches on the /V entry (default 0): values below 5 go to the V4
    check, everything else to the V5 check.  The result of the chosen
    check is returned unchanged.
    """
    if int_value(param.get('V', 0)) >= 5:
        return self.check_user_password_V5(password, param)
    return self.check_user_password_V4(password, docid, param)
def check_owner_password(self, password, docid, param):
    """Check *password* as the owner password.

    Dispatches on the /V entry (default 0): values below 5 go to the V4
    check, everything else to the V5 check.  The result of the chosen
    check is returned unchanged.
    """
    if int_value(param.get('V', 0)) >= 5:
        return self.check_owner_password_V5(password, param)
    return self.check_owner_password_V4(password, docid, param)
def check_user_password_V5(self, password, param):
    """Return True when *password* hashes to the stored /U value, else None.

    The first 32 bytes of /U are the expected hash and the following
    8 bytes are the salt.  The password is cut to at most 127 bytes.
    """
    u_entry = str_value(param['U'])
    expected = u_entry[:32]
    salt = u_entry[32:40]
    # Truncate password:
    password = password[:127]
    matches = self.hash_V5(password, salt, b"", param) == expected
    return True if matches else None
def check_owner_password_V5(self, password, param):
    """Return True when *password* hashes to the stored /O value, else None.

    The first 32 bytes of /O are the expected hash and the following
    8 bytes are the salt; the first 48 bytes of /U are mixed into the
    hash.  The password is cut to at most 127 bytes.
    """
    u_entry = str_value(param['U'])
    o_entry = str_value(param['O'])
    user_part = u_entry[:48]
    expected = o_entry[:32]
    salt = o_entry[32:40]
    # Truncate password:
    password = password[:127]
    matches = self.hash_V5(password, salt, user_part, param) == expected
    return True if matches else None
def recover_encryption_key_with_password(self, password, docid, param):
    """Unwrap the file key using a verified V5 owner or user password.

    The owner password is tried first, then the user password.  The
    matching entry supplies the key salt (bytes 40..47) and the wrapped
    file key (first 32 bytes of /OE or /UE).  *docid* is not used.

    Raises:
        Exception: when *password* is neither the owner nor the user
            password.
    """
    # Truncate password:
    key_password = password[:127]
    if self.check_owner_password_V5(key_password, param):
        owner_entry = str_value(param['O'])
        user_entry = str_value(param['U'])
        wrapped_key = str_value(param['OE'])[:32]
        key_salt = owner_entry[40:48]
        user_data = user_entry[:48]
    elif self.check_user_password_V5(key_password, param):
        user_entry = str_value(param['U'])
        wrapped_key = str_value(param['UE'])[:32]
        key_salt = user_entry[40:48]
        user_data = b""
    else:
        raise Exception("Trying to recover key, but neither user nor owner pass is correct.")
    intermediate_key = self.hash_V5(key_password, key_salt, user_data, param)
    return self.process_with_aes(intermediate_key, False, wrapped_key)
def process_with_aes(self, key, encrypt, data, repetitions = 1, iv = None):
    """Run AES-CBC over *data* with *key*.

    When *encrypt* is false, *data* is decrypted as-is and *repetitions*
    is ignored.  When it is true, *data* repeated *repetitions* times is
    encrypted.  A missing *iv* means sixteen zero bytes.
    """
    cipher = AES.new(key, AES.MODE_CBC, bytes(bytearray(16)) if iv is None else iv)
    if encrypt:
        return cipher.encrypt(bytes(data * repetitions))
    return cipher.decrypt(data)
def hash_V5(self, password, salt, userdata, param):
    """Compute the password hash used by the V5 password checks.

    For revisions below 6 this is SHA256(password + salt + userdata).
    Revision 6 continues with rounds of AES-CBC encryption and a
    SHA-256/384/512 digest chosen per round, and returns the first
    32 bytes of the final digest.

    Raises:
        NotImplementedError: for revisions above 6.
        Exception: when a round's input is shorter than 32 bytes.
    """
    revision = int_value(param['R'])
    digest = SHA256(password + salt + userdata)
    if revision < 6:
        return digest
    if revision != 6:
        raise NotImplementedError("Revision > 6 not supported.")

    hashers = (hashlib.sha256, hashlib.sha384, hashlib.sha512)
    rounds = 0
    while True:
        rounds += 1
        round_input = password + digest + userdata
        if len(round_input) < 32:
            raise Exception("K1 < 32 ...")
        # Encrypt 64 copies of the input; key and IV both come from the
        # previous digest.
        encrypted = bytearray(
            self.process_with_aes(digest[:16], True, round_input, 64, digest[16:32]))
        # The sum of the first 16 bytes modulo 3 picks the hash function.
        digest = hashers[sum(encrypted[:16]) % 3](encrypted).digest()
        # At least 64 rounds; after that stop once the last encrypted
        # byte is small enough for the current round number.
        if rounds >= 64 and encrypted[-1] <= rounds - 32:
            break
    return digest[0:32]
def check_owner_password_V4(self, password, docid, param):
    """Check *password* as the owner password for V < 5.

    An RC4 key derived from the padded password decrypts the /O entry,
    which yields the padded user password.  That value is passed to
    check_user_password, whose result (the encryption key) is returned.
    Returns False when that check yields None.

    Raises:
        Exception: when the /V entry is 5 or greater.
    """
    # compute_O_rc4_key:
    if int_value(param.get('V', 0)) >= 5:
        raise Exception("compute_O_rc4_key not possible with V>= 5")
    revision = int_value(param.get('R', 0))
    key_bits = int_value(param.get('Length', 40))  # Key length (bits)
    padded = (password + self.PASSWORD_PADDING)[:32]
    digest = hashlib.md5(padded).digest()
    if revision >= 3:
        for _ in range(50):
            digest = hashlib.md5(digest[:key_bits // 8]).digest()
    rc4_key = digest[:key_bits // 8]

    # now call iterate_rc4 ...
    owner_entry = str_value(param.get('O'))
    recovered = ARC4.new(rc4_key).decrypt(owner_entry)
    if revision >= 3:
        for step in range(1, 20):
            # Each pass uses the base key with every byte XORed by the
            # pass number.
            step_key = bytes(bytearray(b ^ step for b in bytearray(rc4_key)))
            recovered = ARC4.new(step_key).decrypt(recovered)

    # "recovered" is now the padded user password.  It can be handed to
    # check_user_password as it is, because that function pads and
    # truncates to 32 bytes anyway.  This trick only works with V4 and
    # lower.
    enc_key = self.check_user_password(recovered, docid, param)
    return False if enc_key is None else enc_key
def check_user_password_V4(self, password, docid, param):
    """Check *password* as the user password for V < 5.

    Derives the encryption key from the padded password, /O, /P and the
    first document ID, computes the expected /U value from it and
    compares that with the stored /U entry.

    Returns:
        The derived encryption key when the password matches, else None.
    """
    V = int_value(param.get('V', 0))
    key_len = int_value(param.get('Length', 40)) // 8  # Key length (bits) -> bytes
    O = str_value(param['O'])
    R = int_value(param['R'])  # Revision
    U = str_value(param['U'])
    P = int_value(param['P'])

    # Algorithm 3.2
    padded = (password + self.PASSWORD_PADDING)[:32]  # 1
    md5 = hashlib.md5(padded)                         # 2
    md5.update(O)                                     # 3
    md5.update(struct.pack('<l', P))                  # 4
    md5.update(docid[0])                              # 5
    # aes special handling if metadata isn't encrypted
    try:
        enc_metadata = str_value(param['EncryptMetadata'])
    except Exception:
        enc_metadata = b'True'
    # The previous test, ``== ('False' or 'false')``, only ever compared
    # against 'False'.  Both spellings are accepted now, as text or bytes.
    metadata_in_clear = enc_metadata in ('False', 'false', b'False', b'false')
    if (metadata_in_clear or V < 4) and R >= 4:
        md5.update(codecs.decode(b'ffffffff', 'hex'))
    # Finish hash:
    digest = md5.digest()
    if R >= 3:
        # 8
        for _ in range(50):
            digest = hashlib.md5(digest[:key_len]).digest()
    # R=2 only uses first five bytes.
    key = digest[:5] if R == 2 else digest[:key_len]

    if R == 2:
        # Algorithm 3.4
        u1 = ARC4.new(key).decrypt(self.PASSWORD_PADDING)
    elif R >= 3:
        # Algorithm 3.5
        md5 = hashlib.md5(self.PASSWORD_PADDING)  # 2
        md5.update(docid[0])                      # 3
        x = ARC4.new(key).decrypt(md5.digest()[:16])  # 4
        for step in range(1, 20):
            # Each pass uses the key with every byte XORed by the pass
            # number.
            step_key = bytes(bytearray(b ^ step for b in bytearray(key)))
            x = ARC4.new(step_key).decrypt(x)
        u1 = x + x  # 32bytes total
    else:
        # Revisions below 2 define no /U computation here; report a failed
        # check instead of using an unbound value.
        return None

    if R == 2:
        is_authenticated = (u1 == U)
    else:
        is_authenticated = (u1[:16] == U[:16])
    if is_authenticated:
        return key
    return None
def initialize_standard(self, password, docid, param):
    """Set up decryption for the 'Standard' security handler.

    Tries *password* as the owner password and then as the user password,
    stores the resulting key, decodes the permission bits of /P and
    selects the per-object key derivation and the cipher.

    Raises:
        PDFEncryptionError: for an unsupported /V (outside 1..5) or
            /R (7 or greater).
        ADEPTInvalidPasswordError: when neither password check yields
            a key.
    """
    self.decrypt_key = None
    V = int_value(param.get('V', 0))
    if V <= 0 or V > 5:
        raise PDFEncryptionError('Unknown algorithm: %r' % V)
    R = int_value(param['R'])  # Revision
    if R >= 7:
        raise PDFEncryptionError('Unknown revision: %r' % R)

    def key_missing():
        # True while no usable key has been stored yet.
        return (self.decrypt_key is None or self.decrypt_key is True
                or self.decrypt_key is False)

    def accept(retval):
        # A check returns either the key itself (V4 and lower) or True
        # (V5), in which case the key still has to be recovered.
        # False/None mean "wrong password" and leave the key untouched.
        if retval is True:
            self.decrypt_key = self.recover_encryption_key_with_password(password, docid, param)
        elif retval is not False and retval is not None:
            self.decrypt_key = retval

    # check owner pass:
    accept(self.check_owner_password(password, docid, param))
    if key_missing():
        # That's not the owner password. Check if it's the user password.
        accept(self.check_user_password(password, docid, param))
    if key_missing():
        raise ADEPTInvalidPasswordError("Password invalid.")

    P = int_value(param['P'])
    self.is_printable = bool(P & 4)
    self.is_modifiable = bool(P & 8)
    self.is_extractable = bool(P & 16)
    self.is_annotationable = bool(P & 32)
    self.is_formsenabled = bool(P & 256)
    self.is_textextractable = bool(P & 512)
    self.is_assemblable = bool(P & 1024)
    self.is_formprintable = bool(P & 2048)

    # genkey method
    if V == 1 or V == 2 or V == 4:
        self.genkey = self.genkey_v2
    elif V == 3:
        self.genkey = self.genkey_v3
    elif V >= 5:
        self.genkey = self.genkey_v5

    set_decipher = False
    if V >= 4:
        # Check if we need new genkey_v4 - only if we're using AES.
        try:
            for key in param['CF']:
                algo = str(param["CF"][key]["CFM"])
                if algo in ("/AESV2", "/AESV3"):
                    if V == 4:
                        self.genkey = self.genkey_v4
                    set_decipher = True
                    self.decipher = self.decrypt_aes
                elif algo == "/V2":
                    set_decipher = True
                    self.decipher = self.decrypt_rc4
        except Exception:
            # Best effort: a missing or malformed /CF entry falls back to
            # the defaults chosen below.
            pass

    # rc4
    if V < 4:
        self.decipher = self.decrypt_rc4  # XXX may be AES
    # aes
    if not set_decipher and V >= 4:
        # This should usually already be set by now.
        # If it's not, assume that V4 and newer are using AES
        self.decipher = self.decrypt_aes
    self.ready = True
    return
2021-11-15 16:59:56 +06:00
def initialize_ebx_ignoble(self, keyb64, docid, param):
    """Set up RC4 decryption for an EBX_HANDLER file from a base64 key.

    The first 16 bytes of the decoded *keyb64* (text or bytes) decrypt the
    encryptedKey element of the ADEPT licence; the result is the book key.
    *docid* is not used.

    Raises:
        ADEPTError: when /Length is given and the book key length
            matches neither that length nor that length plus one.
    """
    self.is_printable = self.is_modifiable = self.is_extractable = True
    # Accept both text and bytes.  bytes.decode('base64') does not exist
    # on Python 3, so the binary transform goes through codecs.decode.
    if isinstance(keyb64, str):
        keyb64 = keyb64.encode("ascii")
    key = codecs.decode(keyb64, 'base64')[:16]

    # Integer division, as in initialize_ebx_inept: the result is compared
    # with len(bookkey) and printed with %d.
    length = int_value(param.get('Length', 0)) // 8
    rights = codecs.decode(str_value(param.get('ADEPT_LICENSE')), "base64")
    rights = zlib.decompress(rights, -15)
    rights = etree.fromstring(rights)
    expr = './/{http://ns.adobe.com/adept}encryptedKey'
    bookkey = ''.join(rights.findtext(expr))
    bookkey = base64.b64decode(bookkey)
    bookkey = AES.new(key, AES.MODE_CBC, b'\x00'*16).decrypt(bookkey)
    bookkey = unpad(bookkey, 16)  # PKCS#7
    if len(bookkey) > 16:
        bookkey = bookkey[-16:]
    ebx_V = int_value(param.get('V', 4))
    ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6))
    # added because of improper booktype / decryption book session key errors
    if length > 0:
        if len(bookkey) == length:
            V = 3 if ebx_V == 3 else 2
        elif len(bookkey) == length + 1:
            # A leading extra byte carries the algorithm version.
            V = bookkey[0]
            bookkey = bookkey[1:]
        else:
            print("ebx_V is %d  and ebx_type is %d" % (ebx_V, ebx_type))
            print("length is %d and len(bookkey) is %d" % (length, len(bookkey)))
            if len(bookkey) > 0:
                print("bookkey[0] is %d" % bookkey[0])
            raise ADEPTError('error decrypting book session key - mismatched length')
    else:
        # proper length unknown try with whatever you have
        print("ebx_V is %d  and ebx_type is %d" % (ebx_V, ebx_type))
        print("length is %d and len(bookkey) is %d" % (length, len(bookkey)))
        if len(bookkey) > 0:
            # Indexing bytes already yields an int on Python 3; wrapping it
            # in ord() raised TypeError.
            print("bookkey[0] is %d" % bookkey[0])
        V = 3 if ebx_V == 3 else 2
    self.decrypt_key = bookkey
    self.genkey = self.genkey_v3 if V == 3 else self.genkey_v2
    self.decipher = self.decrypt_rc4
    self.ready = True
    return
@staticmethod
def removeHardening(rights, keytype, keydata):
    """Decrypt the hardened outer layer of an encrypted book key.

    The IV is the XOR of the resource, device and fulfillment UUIDs found
    in *rights*.  The key-encryption key is cut out of
    SHA256(*keytype*) at offsets that depend on int(keytype) % 16.
    Returns the AES-CBC decrypted and unpadded *keydata*.
    """
    def adept_text(name):
        # Text of the first descendant element with this ADEPT tag.
        path = './/%s' % ('{%s}%s' % ('http://ns.adobe.com/adept', name),)
        return ''.join(rights.findtext(path))

    # Gather what we need, and generate the IV
    resource = UUID(adept_text("resource"))
    device = UUID(adept_text("device"))
    fulfillment = UUID(adept_text("fulfillment")[:36])
    kekiv = UUID(int=resource.int ^ device.int ^ fulfillment.int).bytes

    # Derive kek from just "keytype"
    rem = int(keytype, 10) % 16
    digest = SHA256(keytype.encode("ascii"))
    kek = digest[2*rem : 16 + rem] + digest[rem : 2*rem]

    cipher = AES.new(kek, AES.MODE_CBC, kekiv)
    return unpad(cipher.decrypt(keydata), 16)
def initialize_ebx_inept(self, password, docid, param):
    """Set up RC4 decryption for an EBX_HANDLER file from an RSA key.

    *password* is the RSA private key; it decrypts the encryptedKey
    element of the ADEPT licence (after removing the hardening layer for
    key types above 2).  *docid* is not used.

    Raises:
        ADEPTError: when the book key cannot be decrypted, or when
            /Length is given and the key length matches neither that
            length nor that length plus one.
    """
    self.is_printable = self.is_modifiable = self.is_extractable = True
    rsakey = RSA.importKey(password)  # parses the ASN1 structure
    length = int_value(param.get('Length', 0)) // 8
    license_xml = zlib.decompress(
        codecs.decode(param.get('ADEPT_LICENSE'), 'base64'), -15)
    rights = etree.fromstring(license_xml)
    bookkeyelem = rights.find('.//{http://ns.adobe.com/adept}encryptedKey')
    bookkey = codecs.decode(bookkeyelem.text.encode('utf-8'), 'base64')
    keytype = bookkeyelem.attrib.get('keyType', '0')
    if int(keytype, 10) > 2:
        bookkey = PDFDocument.removeHardening(rights, keytype, bookkey)
    try:
        # automatically unpads
        bookkey = PKCS1_v1_5.new(rsakey).decrypt(bookkey, None)
    except ValueError:
        bookkey = None
    if bookkey is None:
        raise ADEPTError('error decrypting book session key')

    ebx_V = int_value(param.get('V', 4))
    ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6))

    def report():
        # Diagnostic output for the "unexpected key length" situations.
        print("ebx_V is %d  and ebx_type is %d" % (ebx_V, ebx_type))
        print("length is %d and len(bookkey) is %d" % (length, len(bookkey)))
        if len(bookkey) > 0:
            print("bookkey[0] is %d" % bookkey[0])

    default_V = 3 if ebx_V == 3 else 2
    # added because of improper booktype / decryption book session key errors
    if length <= 0:
        # proper length unknown try with whatever you have
        report()
        V = default_V
    elif len(bookkey) == length:
        V = default_V
    elif len(bookkey) == length + 1:
        # A leading extra byte carries the algorithm version.
        V = bookkey[0]
        bookkey = bookkey[1:]
    else:
        report()
        raise ADEPTError('error decrypting book session key - mismatched length')

    self.decrypt_key = bookkey
    self.genkey = self.genkey_v3 if V == 3 else self.genkey_v2
    self.decipher = self.decrypt_rc4
    self.ready = True
    return
# genkey functions
def genkey_v2(self, objid, genno):
    """Derive the per-object key: MD5 over key, object id and generation.

    The low three bytes of *objid* and the low two bytes of *genno*
    (little-endian) are appended to the document key.  The digest is cut
    to len(key) + 5 bytes, at most 16.
    """
    base = self.decrypt_key
    suffix = struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
    return hashlib.md5(base + suffix).digest()[:min(len(base) + 5, 16)]
def genkey_v3(self, objid, genno):
    """Derive the per-object key from obfuscated object/generation numbers.

    *objid* and *genno* are XORed with fixed masks, their low bytes are
    interleaved and appended to the document key together with b'sAlT'.
    The MD5 digest is cut to len(key) + 5 bytes, at most 16.
    """
    obj_bytes = struct.pack('<L', objid ^ 0x3569ac)
    gen_bytes = struct.pack('<L', genno ^ 0xca96)
    interleaved = bytes([obj_bytes[0], gen_bytes[0],
                         obj_bytes[1], gen_bytes[1],
                         obj_bytes[2]])
    material = self.decrypt_key + interleaved + b'sAlT'
    keep = min(len(self.decrypt_key) + 5, 16)
    return hashlib.md5(material).digest()[:keep]
# aes v2 and v4 algorithm
def genkey_v4(self, objid, genno):
    """Derive the per-object key used with AES.

    Same as genkey_v2, with b'sAlT' appended to the hashed material.
    """
    base = self.decrypt_key
    suffix = struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + b'sAlT'
    return hashlib.md5(base + suffix).digest()[:min(len(base) + 5, 16)]
def genkey_v5(self, objid, genno):
    """Return the document key unchanged; *objid* and *genno* are ignored."""
    # No per-object derivation here: every object uses the same key.
    document_key = self.decrypt_key
    return document_key
2013-10-03 00:59:40 +06:00
def decrypt_aes(self, objid, genno, data):
    """Decrypt *data* with AES-CBC using the key for this object.

    The first 16 bytes of *data* are the IV, the rest is the ciphertext.
    The value of the last plaintext byte is taken as the padding length
    and that many bytes are cut off the end.
    """
    cipher = AES.new(self.genkey(objid, genno), AES.MODE_CBC, data[:16])
    plaintext = cipher.decrypt(data[16:])
    # remove pkcs#5 aes padding; going through bytearray yields an int
    # for the last byte on every Python version.
    pad_len = bytearray(plaintext[-1:])[0]
    return plaintext[:-1 * pad_len]
def decrypt_rc4(self, objid, genno, data):
    """Decrypt *data* with RC4 using the key for this object."""
    cipher = ARC4.new(self.genkey(objid, genno))
    return cipher.decrypt(data)
# Keyword token for b'obj'.  getobj() compares the third token of an object
# header against it and rejects the object when it differs.
KEYWORD_OBJ = KWD(b'obj')
2013-10-03 00:59:40 +06:00
def getobj(self, objid):
    """Return the object with id *objid*, loading and caching it if needed.

    Returns None when no cross-reference section knows the object, or
    (outside STRICT mode) when its index inside an object stream is out
    of range.  When cross-reference streams are generated on output, an
    object that lives in an object stream is returned as a PDFObjStmRef
    instead of being extracted.

    Raises:
        PDFException: when the document has not been initialized.
        PDFSyntaxError: for a malformed object header, and in STRICT
            mode for malformed object streams.
    """
    if not self.ready:
        raise PDFException('PDFDocument not initialized')
    #assert self.xrefs
    if objid in self.objs:
        return self.objs[objid]

    # Ask each cross-reference section in turn where the object lives.
    for xref in self.xrefs:
        try:
            stmid, index = xref.getpos(objid)
        except KeyError:
            continue
        break
    else:
        #if STRICT:
        #    raise PDFSyntaxError('Cannot locate objid=%r' % objid)
        return None

    if stmid:
        if gen_xref_stm:
            return PDFObjStmRef(objid, stmid, index)
        # Stuff from pdfminer: extract objects from object stream
        stream = stream_value(self.getobj(stmid))
        if stream.dic.get('Type') is not LITERAL_OBJSTM and STRICT:
            raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream.dic['N']
        except KeyError:
            if STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        if stmid not in self.parsed_objs:
            # Parse the whole object stream once and keep the result.
            parser = PDFObjStrmParser(stream.get_data(), self)
            parsed = []
            try:
                while True:
                    _, item = parser.nextobject()
                    parsed.append(item)
            except PSEOF:
                pass
            self.parsed_objs[stmid] = parsed
        objs = self.parsed_objs[stmid]
        genno = 0
        try:
            obj = objs[n*2+index]
        except IndexError:
            # This IndexError used to just raise an exception.
            # Unfortunately that seems to break some PDFs, see this issue:
            # https://github.com/noDRM/DeDRM_tools/issues/233
            # So the exception is only raised in STRICT mode and is a
            # warning otherwise.
            if STRICT:
                raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
            print('Invalid object number: objid=%r' % (objid))
            print("Continuing anyways?")
            print("If the resulting PDF is corrupted, please open a bug report.")
            return None
        if isinstance(obj, PDFStream):
            obj.set_objid(objid, 0)
    else:
        # The object is stored directly in the file at offset *index*.
        self.parser.seek(index)
        _, objid1 = self.parser.nexttoken()  # objid
        _, genno = self.parser.nexttoken()   # genno
        #assert objid1 == objid, (objid, objid1)
        _, kwd = self.parser.nexttoken()
        if kwd is not self.KEYWORD_OBJ:
            raise PDFSyntaxError(
                'Invalid object spec: offset=%r' % index)
        _, obj = self.parser.nextobject()
        if isinstance(obj, PDFStream):
            obj.set_objid(objid, genno)
        if self.decipher:
            obj = decipher_all(self.decipher, objid, genno, obj)

    self.objs[objid] = obj
    return obj
class PDFObjStmRef(object):
    """Reference to an object stored at *index* inside object stream *stmid*."""

    # Largest index passed to any instance so far (shared by all instances).
    maxindex = 0

    def __init__(self, objid, stmid, index):
        self.objid, self.stmid, self.index = objid, stmid, index
        PDFObjStmRef.maxindex = max(PDFObjStmRef.maxindex, index)
## PDFParser
##
class PDFParser(PSStackParser):
    """Parser for a PDF file that feeds a PDFDocument.

    Creating the parser registers it with *doc* through
    ``doc.set_parser(self)``.
    """

    def __init__(self, doc, fp):
        PSStackParser.__init__(self, fp)
        self.doc = doc
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser>'

    KEYWORD_R = KWD(b'R')
    KEYWORD_ENDOBJ = KWD(b'endobj')
    KEYWORD_STREAM = KWD(b'stream')
    KEYWORD_XREF = KWD(b'xref')
    KEYWORD_STARTXREF = KWD(b'startxref')

    def do_keyword(self, pos, token):
        """Handle keyword *token* found at offset *pos*.

        'R' turns the two preceding values into an indirect reference,
        'stream' reads the stream data up to 'endstream' and pushes a
        PDFStream; any other keyword is pushed as-is.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return
        if token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
                objlen = 0
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos+objlen)
            # /Length may be missing or too small: keep reading lines
            # until 'endstream' shows up.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if b'endstream' in line:
                    i = line.index(b'endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos+objlen)
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))
            return

        # others
        self.push((pos, token))
        return

    def find_xref(self):
        """Return the offset that follows the last 'startxref' keyword.

        Raises:
            PDFNoValidXRef: when no 'startxref' line exists, or when the
                line after it is missing or not a number.
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if line == b'startxref':
                break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        try:
            return int(prev)
        except (TypeError, ValueError):
            # A missing or garbled offset is reported like any other
            # unusable xref, so read_xref() can fall back to scanning
            # the file.
            raise PDFNoValidXRef('Invalid startxref offset: %r' % (prev,))

    # read xref table
    def read_xref_from(self, start, xrefs):
        """Load the xref section at offset *start* and those it points to.

        Every loaded section is appended to *xrefs*; /XRefStm and /Prev
        entries of its trailer are followed recursively.

        Raises:
            PDFNoValidXRef: when no xref section is found at *start*.
        """
        global gen_xref_stm
        self.seek(start)
        self.reset()
        try:
            (pos, token) = self.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            if GEN_XREF_STM == 1:
                gen_xref_stm = True
            self.seek(pos)
            self.reset()
            xref = PDFXRefStream()
            xref.load(self)
        else:
            if token is not self.KEYWORD_XREF:
                raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
                                     (pos, token))
            self.nextline()
            xref = PDFXRef()
            xref.load(self)
        xrefs.append(xref)
        trailer = xref.trailer
        if 'XRefStm' in trailer:
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(pos, xrefs)
        return

    # read xref tables and trailers
    def read_xref(self):
        """Return the list of cross-reference sections of the file.

        When the regular lookup fails with PDFNoValidXRef, the file is
        scanned line by line for object headers and the last trailer.

        Raises:
            PDFNoValidXRef: when the fallback scan finds no object either.
        """
        xrefs = []
        trailerpos = None
        try:
            pos = self.find_xref()
            self.read_xref_from(pos, xrefs)
        except PDFNoValidXRef:
            # fallback
            self.seek(0)
            # Matches "<objid> <genno> obj" at the start of a line.  In a
            # raw literal the regex escapes take a single backslash; the
            # doubled form only matched literal backslash characters.
            pat = re.compile(br'^(\d+)\s+(\d+)\s+obj\b')
            offsets = {}
            xref = PDFXRef()
            while 1:
                try:
                    (pos, line) = self.nextline()
                except PSEOF:
                    break
                if line.startswith(b'trailer'):
                    trailerpos = pos  # remember last trailer
                m = pat.match(line)
                if not m:
                    continue
                (objid, genno) = m.groups()
                offsets[int(objid)] = (0, pos)
            if not offsets:
                raise
            xref.offsets = offsets
            if trailerpos:
                self.seek(trailerpos)
                xref.load_trailer(self)
                xrefs.append(xref)
        return xrefs
2013-04-05 22:44:48 +06:00
2013-10-03 00:59:40 +06:00
## PDFObjStrmParser
##
class PDFObjStrmParser(PDFParser):
    """Parser for the decoded content of an object stream.

    Unlike PDFParser it is not registered with the document, and 'R' is
    the only keyword it treats specially.
    """

    KEYWORD_R = KWD(b'R')

    def __init__(self, data, doc):
        # PDFParser.__init__ is skipped on purpose: it would call
        # doc.set_parser().
        PSStackParser.__init__(self, BytesIO(data))
        self.doc = doc
        return

    def flush(self):
        everything = self.popall()
        self.add_results(*everything)
        return

    def do_keyword(self, pos, token):
        if token is not self.KEYWORD_R:
            # others
            self.push((pos, token))
            return
        # reference to indirect object
        try:
            (_, objid), (_, genno) = self.pop(2)
            reference = PDFObjRef(self.doc, int(objid), int(genno))
            self.push((pos, reference))
        except PSSyntaxError:
            pass
        return
2021-11-15 18:38:39 +06:00
# Takes a PDF file name as input, and if this is an ADE-protected PDF,
# returns the UUID of the user that's licensed to open this file.
def adeptGetUserUUID(inf):
    """Return the licensed user's UUID of the ADE-protected PDF at path *inf*.

    Returns None when the file does not use the EBX_HANDLER filter, when
    the user entry does not start with "urn:uuid:", or when anything goes
    wrong while reading the file.
    """
    try:
        doc = PDFDocument()
        # The context manager closes the file on every path, including
        # errors raised while parsing.
        with open(inf, 'rb') as pdf_file:
            PDFParser(doc, pdf_file)
            (docid, param) = doc.encryption
            if literal_name(param['Filter']) != 'EBX_HANDLER':
                # No EBX_HANDLER, no idea which user key can decrypt this.
                return None
            rights = codecs.decode(param.get('ADEPT_LICENSE'), 'base64')
        rights = zlib.decompress(rights, -15)
        rights = etree.fromstring(rights)
        expr = './/{http://ns.adobe.com/adept}user'
        user_uuid = ''.join(rights.findtext(expr))
        if user_uuid[:9] != "urn:uuid:":
            return None
        return user_uuid[9:]
    except Exception:
        # Best effort: any failure means "unknown user".
        return None
2013-10-03 00:59:40 +06:00
###
### My own code, for which there is none else to blame
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
class PDFSerializer(object):
def __init__(self, inf, userkey, inept=True):
2013-10-03 00:59:40 +06:00
global GEN_XREF_STM, gen_xref_stm
gen_xref_stm = GEN_XREF_STM > 1
self.version = inf.read(8)
inf.seek(0)
self.doc = doc = PDFDocument()
parser = PDFParser(doc, inf)
doc.initialize(userkey, inept)
2013-10-03 00:59:40 +06:00
self.objids = objids = set()
for xref in reversed(doc.xrefs):
trailer = xref.trailer
for objid in xref.objids():
objids.add(objid)
trailer = dict(trailer)
trailer.pop('Prev', None)
trailer.pop('XRefStm', None)
if 'Encrypt' in trailer:
objids.remove(trailer.pop('Encrypt').objid)
self.trailer = trailer
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
def dump(self, outf):
self.outf = outf
self.write(self.version)
self.write(b'\n%\xe2\xe3\xcf\xd3\n')
2013-10-03 00:59:40 +06:00
doc = self.doc
objids = self.objids
xrefs = {}
maxobj = max(objids)
trailer = dict(self.trailer)
trailer['Size'] = maxobj + 1
for objid in objids:
obj = doc.getobj(objid)
if isinstance(obj, PDFObjStmRef):
xrefs[objid] = obj
continue
if obj is not None:
try:
genno = obj.genno
except AttributeError:
genno = 0
xrefs[objid] = (self.tell(), genno)
self.serialize_indirect(objid, obj)
startxref = self.tell()
2013-10-03 00:59:40 +06:00
if not gen_xref_stm:
self.write(b'xref\n')
self.write(b'0 %d\n' % (maxobj + 1,))
for objid in range(0, maxobj + 1):
2013-10-03 00:59:40 +06:00
if objid in xrefs:
# force the genno to be 0
self.write(b"%010d 00000 n \n" % xrefs[objid][0])
2013-10-03 00:59:40 +06:00
else:
self.write(b"%010d %05d f \n" % (0, 65535))
self.write(b'trailer\n')
2013-10-03 00:59:40 +06:00
self.serialize_object(trailer)
self.write(b'\nstartxref\n%d\n%%%%EOF' % startxref)
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
else: # Generate crossref stream.
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
# Calculate size of entries
maxoffset = max(startxref, maxobj)
maxindex = PDFObjStmRef.maxindex
fl2 = 2
power = 65536
while maxoffset >= power:
fl2 += 1
power *= 256
fl3 = 1
power = 256
while maxindex >= power:
fl3 += 1
power *= 256
2009-02-26 16:42:22 +06:00
2013-10-03 00:59:40 +06:00
index = []
first = None
prev = None
data = []
# Put the xrefstream's reference in itself
startxref = self.tell()
maxobj += 1
xrefs[maxobj] = (startxref, 0)
for objid in sorted(xrefs):
if first is None:
first = objid
elif objid != prev + 1:
index.extend((first, prev - first + 1))
first = objid
prev = objid
objref = xrefs[objid]
if isinstance(objref, PDFObjStmRef):
f1 = 2
f2 = objref.stmid
f3 = objref.index
else:
f1 = 1
f2 = objref[0]
# we force all generation numbers to be 0
# f3 = objref[1]
f3 = 0
2013-10-03 00:59:40 +06:00
data.append(struct.pack('>B', f1))
data.append(struct.pack('>L', f2)[-fl2:])
data.append(struct.pack('>L', f3)[-fl3:])
index.extend((first, prev - first + 1))
data = zlib.compress(b''.join(data))
2013-10-03 00:59:40 +06:00
dic = {'Type': LITERAL_XREF, 'Size': prev + 1, 'Index': index,
'W': [1, fl2, fl3], 'Length': len(data),
'Filter': LITERALS_FLATE_DECODE[0],
'Root': trailer['Root'],}
if 'Info' in trailer:
dic['Info'] = trailer['Info']
xrefstm = PDFStream(dic, data)
self.serialize_indirect(maxobj, xrefstm)
self.write(b'startxref\n%d\n%%%%EOF' % startxref)
2013-10-03 00:59:40 +06:00
def write(self, data):
self.outf.write(data)
self.last = data[-1:]
2013-10-03 00:59:40 +06:00
def tell(self):
return self.outf.tell()
def escape_string(self, string):
string = string.replace(b'\\', b'\\\\')
2021-11-16 16:09:03 +06:00
string = string.replace(b'\n', b'\\n')
string = string.replace(b'(', b'\\(')
string = string.replace(b')', b'\\)')
2013-10-03 00:59:40 +06:00
return string
def serialize_object(self, obj):
if isinstance(obj, dict):
# Correct malformed Mac OS resource forks for Stanza
if 'ResFork' in obj and 'Type' in obj and 'Subtype' not in obj \
and isinstance(obj['Type'], int):
obj['Subtype'] = obj['Type']
del obj['Type']
# end - hope this doesn't have bad effects
self.write(b'<<')
2013-10-03 00:59:40 +06:00
for key, val in obj.items():
self.write(str(LIT(key.encode('utf-8'))).encode('utf-8'))
2013-10-03 00:59:40 +06:00
self.serialize_object(val)
self.write(b'>>')
2013-10-03 00:59:40 +06:00
elif isinstance(obj, list):
self.write(b'[')
2013-10-03 00:59:40 +06:00
for val in obj:
self.serialize_object(val)
self.write(b']')
elif isinstance(obj, bytearray):
self.write(b'(%s)' % self.escape_string(obj))
elif isinstance(obj, bytes):
2022-08-07 19:58:01 +06:00
self.write(b'<%s>' % binascii.hexlify(obj).upper())
2013-10-03 00:59:40 +06:00
elif isinstance(obj, str):
self.write(b'(%s)' % self.escape_string(obj.encode('utf-8')))
2013-10-03 00:59:40 +06:00
elif isinstance(obj, bool):
if self.last.isalnum():
self.write(b' ')
self.write(str(obj).lower().encode('utf-8'))
elif isinstance(obj, int):
if self.last.isalnum():
self.write(b' ')
self.write(str(obj).encode('utf-8'))
elif isinstance(obj, Decimal):
2013-10-03 00:59:40 +06:00
if self.last.isalnum():
self.write(b' ')
self.write(str(obj).encode('utf-8'))
2013-10-03 00:59:40 +06:00
elif isinstance(obj, PDFObjRef):
if self.last.isalnum():
self.write(b' ')
self.write(b'%d %d R' % (obj.objid, 0))
2013-10-03 00:59:40 +06:00
elif isinstance(obj, PDFStream):
### If we don't generate cross ref streams the object streams
### are no longer useful, as we have extracted all objects from
### them. Therefore leave them out from the output.
if obj.dic.get('Type') == LITERAL_OBJSTM and not gen_xref_stm:
self.write(b'(deleted)')
2013-10-03 00:59:40 +06:00
else:
data = obj.get_decdata()
2022-08-07 19:58:01 +06:00
# Fix length:
# We've decompressed and then recompressed the PDF stream.
# Depending on the algorithm, the implementation, and the compression level,
# the resulting recompressed stream is unlikely to have the same length as the original.
# So we need to update the PDF object to contain the new proper length.
# Without this change, all PDFs exported by this plugin are slightly corrupted -
# even though most if not all PDF readers can correct that on-the-fly.
if 'Length' in obj.dic:
obj.dic['Length'] = len(data)
2013-10-03 00:59:40 +06:00
self.serialize_object(obj.dic)
self.write(b'stream\n')
2013-10-03 00:59:40 +06:00
self.write(data)
self.write(b'\nendstream')
2013-10-03 00:59:40 +06:00
else:
data = str(obj).encode('utf-8')
if bytes([data[0]]).isalnum() and self.last.isalnum():
self.write(b' ')
2013-10-03 00:59:40 +06:00
self.write(data)
def serialize_indirect(self, objid, obj):
    """Write ``obj`` to the output as indirect object number ``objid``.

    Emits the ``<objid> 0 obj`` header, then the serialized object, then
    the ``endobj`` keyword followed by a newline.  The generation number
    is always written as 0.

    Args:
        objid: Integer object number used in the header.
        obj: The object handed to ``self.serialize_object``.
    """
    header = b'%d 0 obj' % (objid,)
    self.write(header)
    self.serialize_object(obj)
    # Separate "endobj" from the preceding token only when that token ended
    # in an alphanumeric byte; otherwise the two keywords would run together.
    # (self.last is presumably the last byte written -- it is maintained
    # outside this method.)
    previous_is_alnum = self.last.isalnum()
    if previous_is_alnum:
        self.write(b'\n')
    self.write(b'endobj\n')
2013-10-03 00:59:40 +06:00
def decryptBook(userkey, inpath, outpath, inept=True):
    """Decrypt the PDF at ``inpath`` and write the result to ``outpath``.

    Args:
        userkey: Key material handed unchanged to ``PDFSerializer``.
        inpath: Path of the PDF to read (opened in binary mode).
        outpath: Path of the PDF to write (opened in binary mode, truncated).
        inept: Flag handed unchanged to ``PDFSerializer``.

    Returns:
        0 when the serializer finished writing, 2 when ``dump`` raised.

    Errors raised while opening either file or while constructing the
    ``PDFSerializer`` are not caught here and propagate to the caller.
    """
    status = 0
    with open(inpath, 'rb') as inf:
        # The serializer is built before the output file is opened, so a
        # failure here does not create or truncate ``outpath``.
        serializer = PDFSerializer(inf, userkey, inept)
        with open(outpath, 'wb') as outf:
            # Any failure while dumping is reported through the return code
            # instead of an exception, so the function always runs to the end.
            try:
                serializer.dump(outf)
            except Exception as err:
                print("error writing pdf: {0}".format(err))
                traceback.print_exc()
                status = 2
    return status
2013-04-05 22:44:48 +06:00
def getPDFencryptionType(inpath):
    """Return the encryption filter reported for the PDF at ``inpath``.

    Args:
        inpath: Path of the PDF file to inspect (opened in binary mode).

    Returns:
        Whatever ``PDFDocument.initialize_and_return_filter()`` returns for
        this document.
    """
    with open(inpath, 'rb') as inf:
        doc = PDFDocument()
        # NOTE(review): the parser is not used directly below; presumably its
        # constructor attaches it to ``doc`` -- confirm.  The binding is kept
        # so the parser stays alive while the filter is being determined.
        parser = PDFParser(doc, inf)
        # Named ``encryption_filter`` so the ``filter`` builtin is not shadowed.
        encryption_filter = doc.initialize_and_return_filter()
        return encryption_filter
2013-03-26 22:38:18 +06:00
def cli_main():
    """Command-line entry point: decrypt one PDF using a key file.

    Expects exactly three arguments after the program name: the key file,
    the input PDF and the output PDF.

    Returns:
        1 when the argument count is wrong (after printing a usage line),
        otherwise the status code returned by ``decryptBook``.
    """
    sys.stdout = SafeUnbuffered(sys.stdout)
    sys.stderr = SafeUnbuffered(sys.stderr)
    argv = unicode_argv("ineptpdf.py")
    progname = os.path.basename(argv[0])
    if len(argv) != 4:
        print("usage: {0} <keyfile.der> <inbook.pdf> <outbook.pdf>".format(progname))
        return 1
    keypath, inpath, outpath = argv[1:]
    # Read the key through a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(keypath, 'rb') as keyfile:
        userkey = keyfile.read()
    result = decryptBook(userkey, inpath, outpath)
    if result == 0:
        print("Successfully decrypted {0:s} as {1:s}".format(os.path.basename(inpath),os.path.basename(outpath)))
    return result
2013-04-05 22:44:48 +06:00
2013-10-03 00:59:40 +06:00
def gui_main():
    """Run the Tk front end for decrypting a single PDF.

    If tkinter cannot be imported, the call is delegated to ``cli_main()``
    and its result is returned.  Otherwise a dialog with key / input /
    output path entries is shown and the Tk main loop is run.

    Returns:
        0 once the Tk main loop has returned, or the result of
        ``cli_main()`` when tkinter is unavailable.
    """
    try:
        import tkinter
        import tkinter.constants
        import tkinter.filedialog
        import tkinter.messagebox
        import traceback
    except ImportError:
        # Only a missing/broken tkinter should trigger the CLI fallback; a
        # bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return cli_main()

    class DecryptionDialog(tkinter.Frame):
        """Frame holding the three path entries and the action buttons."""

        def __init__(self, root):
            tkinter.Frame.__init__(self, root, border=5)
            self.status = tkinter.Label(self, text="Select files for decryption")
            self.status.pack(fill=tkinter.constants.X, expand=1)
            body = tkinter.Frame(self)
            body.pack(fill=tkinter.constants.X, expand=1)
            sticky = tkinter.constants.E + tkinter.constants.W
            body.grid_columnconfigure(1, weight=2)

            # Row 0: key file, pre-filled when a default key sits in the
            # current working directory.
            tkinter.Label(body, text="Key file").grid(row=0)
            self.keypath = tkinter.Entry(body, width=30)
            self.keypath.grid(row=0, column=1, sticky=sticky)
            if os.path.exists("adeptkey.der"):
                self.keypath.insert(0, "adeptkey.der")
            button = tkinter.Button(body, text="...", command=self.get_keypath)
            button.grid(row=0, column=2)

            # Row 1: encrypted input file.
            tkinter.Label(body, text="Input file").grid(row=1)
            self.inpath = tkinter.Entry(body, width=30)
            self.inpath.grid(row=1, column=1, sticky=sticky)
            button = tkinter.Button(body, text="...", command=self.get_inpath)
            button.grid(row=1, column=2)

            # Row 2: decrypted output file.
            tkinter.Label(body, text="Output file").grid(row=2)
            self.outpath = tkinter.Entry(body, width=30)
            self.outpath.grid(row=2, column=1, sticky=sticky)
            button = tkinter.Button(body, text="...", command=self.get_outpath)
            button.grid(row=2, column=2)

            # Bottom row: Decrypt / spacer / Quit.
            buttons = tkinter.Frame(self)
            buttons.pack()
            button = tkinter.Button(
                buttons, text="Decrypt", width=10, command=self.decrypt)
            button.pack(side=tkinter.constants.LEFT)
            tkinter.Frame(buttons, width=10).pack(side=tkinter.constants.LEFT)
            button = tkinter.Button(
                buttons, text="Quit", width=10, command=self.quit)
            button.pack(side=tkinter.constants.RIGHT)

        def get_keypath(self):
            """Ask for a key file and put the chosen path in the key entry."""
            keypath = tkinter.filedialog.askopenfilename(
                parent=None, title="Select Adobe Adept \'.der\' key file",
                defaultextension=".der",
                filetypes=[('Adobe Adept DER-encoded files', '.der'),
                           ('All Files', '.*')])
            if keypath:
                keypath = os.path.normpath(keypath)
                self.keypath.delete(0, tkinter.constants.END)
                self.keypath.insert(0, keypath)

        def get_inpath(self):
            """Ask for the input PDF and put the chosen path in its entry."""
            inpath = tkinter.filedialog.askopenfilename(
                parent=None, title="Select ADEPT-encrypted PDF file to decrypt",
                defaultextension=".pdf", filetypes=[('PDF files', '.pdf')])
            if inpath:
                inpath = os.path.normpath(inpath)
                self.inpath.delete(0, tkinter.constants.END)
                self.inpath.insert(0, inpath)

        def get_outpath(self):
            """Ask for the output PDF and put the chosen path in its entry."""
            outpath = tkinter.filedialog.asksaveasfilename(
                parent=None, title="Select unencrypted PDF file to produce",
                defaultextension=".pdf", filetypes=[('PDF files', '.pdf')])
            if outpath:
                outpath = os.path.normpath(outpath)
                self.outpath.delete(0, tkinter.constants.END)
                self.outpath.insert(0, outpath)

        def decrypt(self):
            """Validate the entered paths, run decryptBook, show the outcome."""
            keypath = self.keypath.get()
            inpath = self.inpath.get()
            outpath = self.outpath.get()
            if not keypath or not os.path.exists(keypath):
                self.status['text'] = "Specified key file does not exist"
                return
            if not inpath or not os.path.exists(inpath):
                self.status['text'] = "Specified input file does not exist"
                return
            if not outpath:
                self.status['text'] = "Output file not specified"
                return
            if inpath == outpath:
                self.status['text'] = "Must have different input and output files"
                return
            # Context manager closes the key file handle deterministically.
            with open(keypath, 'rb') as keyfile:
                userkey = keyfile.read()
            self.status['text'] = "Decrypting..."
            try:
                decrypt_status = decryptBook(userkey, inpath, outpath)
            except Exception as e:
                # Exceptions raised without arguments have an empty ``args``
                # tuple; fall back to the exception itself so reporting the
                # error cannot raise IndexError.
                detail = e.args[0] if e.args else e
                self.status['text'] = "Error; {0}".format(detail)
                return
            if decrypt_status == 0:
                self.status['text'] = "File successfully decrypted"
            else:
                self.status['text'] = "There was an error decrypting the file."

    root = tkinter.Tk()
    root.title("Adobe Adept PDF Decrypter v.{0}".format(__version__))
    # Horizontally resizable only.
    root.resizable(True, False)
    root.minsize(370, 0)
    DecryptionDialog(root).pack(fill=tkinter.constants.X, expand=1)
    root.mainloop()
    return 0
2009-02-26 16:42:22 +06:00
if __name__ == '__main__':
    # Any command-line argument selects the CLI; with none, start the GUI.
    # The chosen entry point's return value becomes the process exit status.
    entry_point = cli_main if len(sys.argv) > 1 else gui_main
    sys.exit(entry_point())