diff --git a/ineptpdf.pyw b/ineptpdf.pyw index abaf497..53aad40 100644 --- a/ineptpdf.pyw +++ b/ineptpdf.pyw @@ -1,6 +1,7 @@ #! /usr/bin/python -# ineptpdf.pyw, version 6.1 +# ineptpdf7.pyw +# ineptpdf, version 7 # To run this program install Python 2.6 from http://www.python.org/download/ # and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto @@ -15,6 +16,10 @@ # 5 - removing small bug with V3 ebooks (anon) # 6 - changed to adeptkey4.der format for 1.7.2 support (anon) # 6.1 - backward compatibility for 1.7.1 and old adeptkey.der +# 7 - Get cross reference streams and object streams working for input. +# Not yet supported on output but this only affects file size, +# not functionality. (by anon2) + """ Decrypt Adobe ADEPT-encrypted PDF files. """ @@ -42,6 +47,10 @@ try: except ImportError: ARC4 = None RSA = None +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO class ADEPTError(Exception): @@ -569,16 +578,17 @@ class PSBaseParser(object): pos = self.fp.tell() buf = '' while 0 < pos: + prevpos = pos pos = max(0, pos-self.BUFSIZ) self.fp.seek(pos) - s = self.fp.read(self.BUFSIZ) + s = self.fp.read(prevpos-pos) if not s: break while 1: n = max(s.rfind('\r'), s.rfind('\n')) if n == -1: buf = s + buf break - yield buf+s[n:] + yield s[n:]+buf s = s[:n] buf = '' return @@ -867,7 +877,7 @@ class PDFStream(PDFObject): (self.objid, len(self.rawdata), self.dic) def decode(self): - assert self.data == None and self.rawdata != None + assert self.data is None and self.rawdata is not None data = self.rawdata if self.decipher: # Handle encryption @@ -884,10 +894,6 @@ class PDFStream(PDFObject): # will get errors if the document is encrypted. data = zlib.decompress(data) elif f in LITERALS_LZW_DECODE: - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO data = ''.join(LZWDecoder(StringIO(data)).run()) elif f in LITERALS_ASCII85_DECODE: data = ascii85decode(data) @@ -926,7 +932,7 @@ class PDFStream(PDFObject): return def get_data(self): - if self.data == None: + if self.data is None: self.decode() return self.data @@ -934,6 +940,13 @@ class PDFStream(PDFObject): return self.rawdata def get_decdata(self): + if self.data is not None: + # Data has already been decrypted and decoded. This is the case + # for object streams. Note: this data is wrong to put in the + # output because it should be stored decrypted but + # uncompressed. This can be done by storing the intermediate + # data. For now object streams are useless in the output. + return self.data data = self.rawdata if self.decipher and data: # Handle encryption @@ -989,7 +1002,7 @@ class PDFXRef(object): if len(f) != 2: raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) try: - (start, nobjs) = map(long, f) + (start, nobjs) = map(int, f) except ValueError: raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) for objid in xrange(start, start+nobjs): @@ -1002,7 +1015,7 @@ class PDFXRef(object): raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) (pos, genno, use) = f if use != 'n': continue - self.offsets[objid] = (int(genno), long(pos)) + self.offsets[objid] = (int(genno), int(pos)) self.load_trailer(parser) return @@ -1040,7 +1053,7 @@ class PDFXRefStream(object): return def __repr__(self): - return '' % (self.objid_first, self.objid_last) + return '' % self.index def objids(self): for first, size in self.index: @@ -1298,12 +1311,45 @@ class PDFDocument(object): except KeyError: pass else: - return #if STRICT: # raise PDFSyntaxError('Cannot locate objid=%r' % objid) return None if stmid: - return PDFObjStmRef(objid, stmid, index) +# Later try to introduce PDFObjStmRef's +# return PDFObjStmRef(objid, stmid, index) +# Stuff from pdfminer + stream = stream_value(self.getobj(stmid)) + if stream.dic.get('Type') is not LITERAL_OBJSTM: + if STRICT: + raise PDFSyntaxError('Not a stream object: %r' % stream) + try: + n = stream.dic['N'] + except KeyError: + if STRICT: + raise PDFSyntaxError('N is not defined: %r' % stream) + n = 0 + + if stmid in self.parsed_objs: + objs = self.parsed_objs[stmid] + else: + parser = PDFObjStrmParser(stream.get_data(), self) + objs = [] + try: + while 1: + (_,obj) = parser.nextobject() + objs.append(obj) + except PSEOF: + pass + self.parsed_objs[stmid] = objs + genno = 0 + i = n*2+index + try: + obj = objs[i] + except IndexError: + raise PDFSyntaxError('Invalid object number: objid=%r' % (objid)) + if isinstance(obj, PDFStream): + obj.set_objid(objid, 0) +### else: self.parser.seek(index) (_,objid1) = self.parser.nexttoken() # objid @@ -1316,9 +1362,9 @@ class PDFDocument(object): (_,obj) = self.parser.nextobject() if isinstance(obj, PDFStream): obj.set_objid(objid, genno) + if self.decipher: + obj = decipher_all(self.decipher, objid, genno, obj) self.objs[objid] = obj - if self.decipher: - obj = decipher_all(self.decipher, objid, genno, obj) return obj class PDFObjStmRef(object): @@ -1419,7 +1465,7 @@ class PDFParser(PSStackParser): prev = line else: raise PDFNoValidXRef('Unexpected EOF') - return long(prev) + return int(prev) # read xref table def read_xref_from(self, start, xrefs): @@ -1482,6 +1528,34 @@ class PDFParser(PSStackParser): xrefs.append(xref) return xrefs +## PDFObjStrmParser +## +class PDFObjStrmParser(PDFParser): + + def __init__(self, data, doc): + PSStackParser.__init__(self, StringIO(data)) + self.doc = doc + return + + def flush(self): + self.add_results(*self.popall()) + return + + KEYWORD_R = KWD('R') + def do_keyword(self, pos, token): + if token is self.KEYWORD_R: + # reference to indirect object + try: + ((_,objid), (_,genno)) = self.pop(2) + (objid, genno) = (int(objid), int(genno)) + obj = PDFObjRef(self.doc, objid, genno) + self.push((pos, obj)) + except PSSyntaxError: + pass + return + # others + self.push((pos, token)) + return ### ### My own code, for which there is none else to blame @@ -1521,8 +1595,9 @@ class PDFSerializer(object): if isinstance(obj, PDFObjStmRef): xrefstm[objid] = obj continue - xrefs[objid] = self.tell() - self.serialize_indirect(objid, obj) + if obj is not None: + xrefs[objid] = self.tell() + self.serialize_indirect(objid, obj) startxref = self.tell() self.write('xref\n') self.write('0 %d\n' % (maxobj + 1,)) @@ -1611,11 +1686,18 @@ class PDFSerializer(object): self.write(' ') self.write('%d %d R' % (obj.objid, 0)) elif isinstance(obj, PDFStream): - data = obj.get_decdata() - self.serialize_object(obj.dic) - self.write('stream\n') - self.write(data) - self.write('\nendstream') + ### For now, we have extracted all objects from an Object Stream, + ### so we don't need these any more. Therefore leave them out + ### of the output. Later we could try to use object streams in + ### the output again to get smaller output. + if obj.dic.get('Type') == LITERAL_OBJSTM: + self.write('(deleted)') + else: + data = obj.get_decdata() + self.serialize_object(obj.dic) + self.write('stream\n') + self.write(data) + self.write('\nendstream') else: data = str(obj) if data[0].isalnum() and self.last.isalnum(): @@ -1697,7 +1779,7 @@ class DecryptionDialog(Tkinter.Frame): def get_inpath(self): inpath = tkFileDialog.askopenfilename( parent=None, title='Select ADEPT-encrypted PDF file to decrypt', - defaultextension='.epub', filetypes=[('PDF files', '.pdf'), + defaultextension='.pdf', filetypes=[('PDF files', '.pdf'), ('All files', '.*')]) if inpath: inpath = os.path.normpath(inpath) @@ -1708,7 +1790,7 @@ class DecryptionDialog(Tkinter.Frame): def get_outpath(self): outpath = tkFileDialog.asksaveasfilename( parent=None, title='Select unencrypted PDF file to produce', - defaultextension='.epub', filetypes=[('PDF files', '.pdf'), + defaultextension='.pdf', filetypes=[('PDF files', '.pdf'), ('All files', '.*')]) if outpath: outpath = os.path.normpath(outpath)