Unknown date, late 2009/early 2010

This commit is contained in:
i♥cabbages 2009-01-01 14:33:12 +00:00 committed by Apprentice Alf
parent 93c2ccd2c2
commit 58833e7dc5

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# ineptpdf.pyw, version 2 # ineptpdf.pyw, version 3
# To run this program install Python 2.6 from http://www.python.org/download/ # To run this program install Python 2.6 from http://www.python.org/download/
# and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto # and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto
@ -10,6 +10,7 @@
# Revision history: # Revision history:
# 1 - Initial release # 1 - Initial release
# 2 - Improved determination of key-generation algorithm # 2 - Improved determination of key-generation algorithm
# 3 - Correctly handle PDF >=1.5 cross-reference streams
""" """
Decrypt Adobe ADEPT-encrypted PDF files. Decrypt Adobe ADEPT-encrypted PDF files.
@ -25,7 +26,7 @@ import re
import zlib import zlib
import struct import struct
import hashlib import hashlib
from itertools import chain from itertools import chain, islice
import xml.etree.ElementTree as etree import xml.etree.ElementTree as etree
import Tkinter import Tkinter
import Tkconstants import Tkconstants
@ -163,11 +164,11 @@ def nunpack(s, default=0):
elif l == 1: elif l == 1:
return ord(s) return ord(s)
elif l == 2: elif l == 2:
return unpack('>H', s)[0] return struct.unpack('>H', s)[0]
elif l == 3: elif l == 3:
return unpack('>L', '\x00'+s)[0] return struct.unpack('>L', '\x00'+s)[0]
elif l == 4: elif l == 4:
return unpack('>L', s)[0] return struct.unpack('>L', s)[0]
else: else:
return TypeError('invalid length: %d' % l) return TypeError('invalid length: %d' % l)
@ -680,6 +681,12 @@ class PSStackParser(PSBaseParser):
return obj return obj
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
## PDF Objects ## PDF Objects
## ##
class PDFObject(PSObject): pass class PDFObject(PSObject): pass
@ -741,11 +748,11 @@ def decipher_all(decipher, objid, genno, x):
''' '''
if isinstance(x, str): if isinstance(x, str):
return decipher(objid, genno, x) return decipher(objid, genno, x)
decf = lambda v: decipher_all(decipher, objid, genno, v)
if isinstance(x, list): if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ] x = [decf(v) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): x = dict((k, decf(v)) for (k, v) in x.iteritems())
x[k] = decipher_all(decipher, objid, genno, v)
return x return x
# Type cheking # Type cheking
@ -805,6 +812,28 @@ def stream_value(x):
return PDFStream({}, '') return PDFStream({}, '')
return x return x
# ascii85decode(data)
def ascii85decode(data):
n = b = 0
out = ''
for c in data:
if '!' <= c and c <= 'u':
n += 1
b = b*85+(ord(c)-33)
if n == 5:
out += struct.pack('>L',b)
n = b = 0
elif c == 'z':
assert n == 0
out += '\0\0\0\0'
elif c == '~':
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack('>L',b)[:n-1]
break
return out
## PDFStream type ## PDFStream type
## ##
@ -834,12 +863,76 @@ class PDFStream(PDFObject):
return '<PDFStream(%r): raw=%d, %r>' % \ return '<PDFStream(%r): raw=%d, %r>' % \
(self.objid, len(self.rawdata), self.dic) (self.objid, len(self.rawdata), self.dic)
def decode(self):
assert self.data == None and self.rawdata != None
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
return
filters = self.dic['Filter']
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
data = ascii85decode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFNotImplementedError(
'Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError(
'Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join(chr((ord(a)+ord(b)) & 255) \
for (a,b) in zip(ent0,ent1))
buf += ent1
ent0 = ent1
data = buf
self.data = data
self.rawdata = None
return
def get_data(self):
if self.data == None:
self.decode()
return self.data
def get_rawdata(self): def get_rawdata(self):
return self.rawdata return self.rawdata
def get_decdata(self): def get_decdata(self):
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher and data:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
return data return data
@ -932,6 +1025,66 @@ class PDFXRef(object):
return (None, pos) return (None, pos)
## PDFXRefStream
##
class PDFXRefStream(object):
def __init__(self):
self.index = None
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
return
def __repr__(self):
return '<PDFXRef: objid=%d-%d>' % (self.objid_first, self.objid_last)
def objids(self):
for first, size in self.index:
for objid in xrange(first, first + size):
yield objid
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or \
stream.dic['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size']
index = stream.dic.get('Index', (0,size))
self.index = zip(islice(index, 0, None, 2),
islice(index, 1, None, 2))
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic
return
def getpos(self, objid):
offset = 0
for first, size in self.index:
if first <= objid and objid < (first + size):
break
offset += size
else:
raise KeyError(objid)
i = self.entlen * ((objid - first) + offset)
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1:
pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
genno = nunpack(ent[self.fl1+self.fl2:])
return (None, pos)
elif f1 == 2:
objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
index = nunpack(ent[self.fl1+self.fl2:])
return (objid, index)
# this is a free object
raise KeyError(objid)
## PDFDocument ## PDFDocument
## ##
## A PDFDocument object represents a PDF document. ## A PDFDocument object represents a PDF document.
@ -1020,7 +1173,7 @@ class PDFDocument(object):
key = ASN1Parser([ord(x) for x in keyder]) key = ASN1Parser([ord(x) for x in keyder])
key = [bytesToNumber(key.getChild(x).value) for x in xrange(1, 4)] key = [bytesToNumber(key.getChild(x).value) for x in xrange(1, 4)]
rsa = RSA.construct(key) rsa = RSA.construct(key)
length = int_value(param.get('Length')) / 8 length = int_value(param.get('Length', 0)) / 8
rights = str_value(param.get('ADEPT_LICENSE')).decode('base64') rights = str_value(param.get('ADEPT_LICENSE')).decode('base64')
rights = zlib.decompress(rights, -15) rights = zlib.decompress(rights, -15)
rights = etree.fromstring(rights) rights = etree.fromstring(rights)
@ -1031,11 +1184,13 @@ class PDFDocument(object):
raise ADEPTError('error decrypting book session key') raise ADEPTError('error decrypting book session key')
index = bookkey.index('\0') + 1 index = bookkey.index('\0') + 1
bookkey = bookkey[index:] bookkey = bookkey[index:]
V = 2 ebx_V = int_value(param.get('V', 4))
if (length and len(bookkey) == (length + 1)) or \ ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6))
(not length and len(bookkey) & 1 == 1): if ebx_V < 4 or ebx_type < 6:
V = ord(bookkey[0]) V = ord(bookkey[0])
bookkey = bookkey[1:] bookkey = bookkey[1:]
else:
V = 2
if length and len(bookkey) != length: if length and len(bookkey) != length:
raise ADEPTError('error decrypting book session key') raise ADEPTError('error decrypting book session key')
self.decrypt_key = bookkey self.decrypt_key = bookkey
@ -1131,7 +1286,7 @@ class PDFDocument(object):
else: else:
for xref in self.xrefs: for xref in self.xrefs:
try: try:
(strmid, index) = xref.getpos(objid) (stmid, index) = xref.getpos(objid)
break break
except KeyError: except KeyError:
pass pass
@ -1139,38 +1294,8 @@ class PDFDocument(object):
if STRICT: if STRICT:
raise PDFSyntaxError('Cannot locate objid=%r' % objid) raise PDFSyntaxError('Cannot locate objid=%r' % objid)
return None return None
if strmid: if stmid:
stream = stream_value(self.getobj(strmid)) return PDFObjStmRef(objid, stmid, index)
if stream.dic.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream.dic['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
else:
parser = PDFObjStrmParser(self, stream.get_data())
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError(
'Invalid object number: objid=%r' % (objid))
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else: else:
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid (_,objid1) = self.parser.nexttoken() # objid
@ -1184,11 +1309,17 @@ class PDFDocument(object):
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, genno) obj.set_objid(objid, genno)
self.objs[objid] = obj self.objs[objid] = obj
if self.decipher: if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
class PDFObjStmRef(object):
def __init__(self, objid, stmid, index):
self.objid = objid
self.stmid = stmid
self.index = index
## PDFParser ## PDFParser
## ##
class PDFParser(PSStackParser): class PDFParser(PSStackParser):
@ -1290,14 +1421,24 @@ class PDFParser(PSStackParser):
(pos, token) = self.nexttoken() (pos, token) = self.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if token is not self.KEYWORD_XREF: if isinstance(token, int):
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % # XRefStream: PDF-1.5
(pos, token)) self.seek(pos)
self.nextline() self.reset()
xref = PDFXRef() xref = PDFXRefStream()
xref.load(self) xref.load(self)
else:
if token is not self.KEYWORD_XREF:
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
(pos, token))
self.nextline()
xref = PDFXRef()
xref.load(self)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.trailer trailer = xref.trailer
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(pos, xrefs)
if 'Prev' in trailer: if 'Prev' in trailer:
# find previous xref # find previous xref
pos = int_value(trailer['Prev']) pos = int_value(trailer['Prev'])
@ -1345,10 +1486,13 @@ class PDFSerializer(object):
parser = PDFParser(doc, inf) parser = PDFParser(doc, inf)
doc.initialize(keypath) doc.initialize(keypath)
self.objids = objids = set() self.objids = objids = set()
for xref in doc.xrefs: for xref in reversed(doc.xrefs):
trailer = xref.trailer trailer = xref.trailer
for objid in xref.objids(): for objid in xref.objids():
objids.add(objid) objids.add(objid)
trailer = dict(trailer)
trailer.pop('Prev', None)
trailer.pop('XRefStm', None)
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
objids.remove(trailer.pop('Encrypt').objid) objids.remove(trailer.pop('Encrypt').objid)
self.trailer = trailer self.trailer = trailer
@ -1360,26 +1504,64 @@ class PDFSerializer(object):
doc = self.doc doc = self.doc
objids = self.objids objids = self.objids
xrefs = {} xrefs = {}
xrefstm = {}
maxobj = max(objids) maxobj = max(objids)
trailer = dict(self.trailer)
trailer['Size'] = maxobj + 1
for objid in objids: for objid in objids:
obj = doc.getobj(objid)
if isinstance(obj, PDFObjStmRef):
xrefstm[objid] = obj
continue
xrefs[objid] = self.tell() xrefs[objid] = self.tell()
self.serialize_indirect(objid, doc.getobj(objid)) self.serialize_indirect(objid, obj)
startxref = self.tell() startxref = self.tell()
self.write('xref\n') self.write('xref\n')
self.write('0 %d\n' % (maxobj + 1,)) self.write('0 %d\n' % (maxobj + 1,))
for objid in xrange(0, maxobj + 1): for objid in xrange(0, maxobj + 1):
if objid in objids: if objid in xrefs:
self.write("%010d %05d n \n" % (xrefs[objid], 0)) self.write("%010d %05d n \n" % (xrefs[objid], 0))
else: else:
self.write("%010d %05d f \n" % (0, 65535)) self.write("%010d %05d f \n" % (0, 65535))
self.write('trailer\n') self.write('trailer\n')
self.serialize_object(self.trailer) self.serialize_object(trailer)
self.write('\nstartxref\n%d\n%%%%EOF' % startxref) self.write('\nstartxref\n%d\n%%%%EOF' % startxref)
if not xrefstm:
def write(self, *data): return
for datum in data: index = []
self.outf.write(datum) first = None
self.last = data[-1][-1:] prev = None
data = []
for objid in sorted(xrefstm):
if first is None:
first = objid
elif objid != prev + 1:
index.extend((first, prev - first + 1))
first = objid
prev = objid
stmid = xrefstm[objid].stmid
data.append(struct.pack('>BHB', 2, stmid, 0))
index.extend((first, prev - first + 1))
data = zlib.compress(''.join(data))
dic = {'Type': LITERAL_XREF, 'Size': prev + 1, 'Index': index,
'W': [1, 2, 1], 'Length': len(data), 'Prev': startxref,
'Filter': LITERALS_FLATE_DECODE[0],}
obj = PDFStream(dic, data)
self.write('\n')
trailer['XRefStm'] = startxrefstm = self.tell()
self.serialize_indirect(maxobj + 1, obj)
trailer['Prev'] = startxref
startxref = self.tell()
self.write('xref\n')
self.write('%d 1\n' % (maxobj + 1,))
self.write("%010d %05d n \n" % (startxrefstm, 0))
self.write('trailer\n')
self.serialize_object(trailer)
self.write('\nstartxref\n%d\n%%%%EOF' % startxref)
def write(self, data):
self.outf.write(data)
self.last = data[-1:]
def tell(self): def tell(self):
return self.outf.tell() return self.outf.tell()
@ -1566,5 +1748,6 @@ def gui_main():
if __name__ == '__main__': if __name__ == '__main__':
# sys.exit(cli_main()) if len(sys.argv) > 1:
sys.exit(cli_main())
sys.exit(gui_main()) sys.exit(gui_main())