diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt index cc2f00a..5b2da8d 100644 --- a/Topaz_Tools/lib/changes.txt +++ b/Topaz_Tools/lib/changes.txt @@ -1,3 +1,8 @@ +Changes in version 1.6 + - support for books whose paragraphs have no styles + - support to run cmbtc_dump on Linux and Mac OSX provided you know your PID of your ipod or standalone Kindle + (contributed by DiapDealer) + Changes in version 1.5 - completely reworked generation of styles to use actual page heights and widths - added new script getpagedim.py to support the above diff --git a/Topaz_Tools/lib/cmbtc_dump_mac_linux.py b/Topaz_Tools/lib/cmbtc_dump_mac_linux.py new file mode 100644 index 0000000..8a03a3a --- /dev/null +++ b/Topaz_Tools/lib/cmbtc_dump_mac_linux.py @@ -0,0 +1,504 @@ +#! /usr/bin/python + +from __future__ import with_statement + +import csv +import sys +import os +import getopt +import zlib +from struct import pack +from struct import unpack + +MAX_PATH = 255 + +# Put the first 8 characters of your Kindle PID here +# or supply it with the -p option in the command line +#################################################### +kindlePID = "12345678" +#################################################### + +global bookFile +global bookPayloadOffset +global bookHeaderRecords +global bookMetadata +global bookKey +global command + +# +# Exceptions for all the problems that might happen during the script +# + +class CMBDTCError(Exception): + pass + +class CMBDTCFatal(Exception): + pass + + +# +# Open the book file at path +# + +def openBook(path): + try: + return open(path,'rb') + except: + raise CMBDTCFatal("Could not open book file: " + path) + +# +# Get a 7 bit encoded number from the book file +# + +def bookReadEncodedNumber(): + flag = False + data = ord(bookFile.read(1)) + + if data == 0xFF: + flag = True + data = ord(bookFile.read(1)) + + if data >= 0x80: + datax = (data & 0x7F) + while data >= 0x80 : + data = ord(bookFile.read(1)) + datax = (datax <<7) + (data & 0x7F) + data = datax + + if flag: + data = -data + return data + +# +# Encode a number in 7 bit format +# + +def encodeNumber(number): + result = "" + negative = False + flag = 0 + print("Using encodeNumber routine") + + if number < 0 : + number = -number + 1 + negative = True + + while True: + byte = number & 0x7F + number = number >> 7 + byte += flag + result += chr(byte) + flag = 0x80 + if number == 0 : break + + if negative: + result += chr(0xFF) + + return result[::-1] + +# +# Get a length prefixed string from the file +# + +def bookReadString(): + stringLength = bookReadEncodedNumber() + return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0] + +# +# Returns a length prefixed string +# + +def lengthPrefixString(data): + return encodeNumber(len(data))+data + + +# +# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...] +# + +def bookReadHeaderRecordData(): + nbValues = bookReadEncodedNumber() + values = [] + for i in range (0,nbValues): + values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()]) + return values + +# +# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...] +# + +def parseTopazHeaderRecord(): + if ord(bookFile.read(1)) != 0x63: + raise CMBDTCFatal("Parse Error : Invalid Header") + + tag = bookReadString() + record = bookReadHeaderRecordData() + return [tag,record] + +# +# Parse the header of a Topaz file, get all the header records and the offset for the payload +# + +def parseTopazHeader(): + global bookHeaderRecords + global bookPayloadOffset + magic = unpack("4s",bookFile.read(4))[0] + + if magic != 'TPZ0': + raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file") + + nbRecords = bookReadEncodedNumber() + bookHeaderRecords = {} + + for i in range (0,nbRecords): + result = parseTopazHeaderRecord() + print result[0], result[1] + bookHeaderRecords[result[0]] = result[1] + + if ord(bookFile.read(1)) != 0x64 : + raise CMBDTCFatal("Parse Error : Invalid Header") + + bookPayloadOffset = bookFile.tell() + +# +# Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed +# Correction, the record is correctly decompressed too +# + +def getBookPayloadRecord(name, index): + encrypted = False + compressed = False + + try: + recordOffset = bookHeaderRecords[name][index][0] + except: + raise CMBDTCFatal("Parse Error : Invalid Record, record not found") + + bookFile.seek(bookPayloadOffset + recordOffset) + + tag = bookReadString() + if tag != name : + raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match") + + recordIndex = bookReadEncodedNumber() + + if recordIndex < 0 : + encrypted = True + recordIndex = -recordIndex -1 + + if recordIndex != index : + raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match") + + if (bookHeaderRecords[name][index][2] > 0): + compressed = True + record = bookFile.read(bookHeaderRecords[name][index][2]) + else: + record = bookFile.read(bookHeaderRecords[name][index][1]) + + if encrypted: + ctx = topazCryptoInit(bookKey) + record = topazCryptoDecrypt(record,ctx) + + if compressed: + record = zlib.decompress(record) + + return record + +# +# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename" +# + +def extractBookPayloadRecord(name, index, filename): + compressed = False + + try: + compressed = bookHeaderRecords[name][index][2] != 0 + record = getBookPayloadRecord(name,index) + except: + print("Could not find record") + + # if compressed: + # try: + # record = zlib.decompress(record) + # except: + # raise CMBDTCFatal("Could not decompress record") + + if filename != "": + try: + file = open(filename,"wb") + file.write(record) + file.close() + except: + raise CMBDTCFatal("Could not write to destination file") + else: + print(record) + +# +# return next record [key,value] from the book metadata from the current book position +# + +def readMetadataRecord(): + return [bookReadString(),bookReadString()] + +# +# Parse the metadata record from the book payload and return a list of [key,values] +# + +def parseMetadata(): + global bookHeaderRecords + global bookPayloadAddress + global bookMetadata + bookMetadata = {} + bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0]) + tag = bookReadString() + if tag != "metadata" : + raise CMBDTCFatal("Parse Error : Record Names Don't Match") + + flags = ord(bookFile.read(1)) + nbRecords = ord(bookFile.read(1)) + + for i in range (0,nbRecords) : + record =readMetadataRecord() + bookMetadata[record[0]] = record[1] + +# +# Context initialisation for the Topaz Crypto +# + +def topazCryptoInit(key): + ctx1 = 0x0CAFFE19E + + for keyChar in key: + keyByte = ord(keyChar) + ctx2 = ctx1 + ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF ) + return [ctx1,ctx2] + +# +# decrypt data with the context prepared by topazCryptoInit() +# + +def topazCryptoDecrypt(data, ctx): + ctx1 = ctx[0] + ctx2 = ctx[1] + + plainText = "" + + for dataChar in data: + dataByte = ord(dataChar) + m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF + ctx2 = ctx1 + ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF) + plainText += chr(m) + + return plainText + +# +# Decrypt a payload record with the PID +# + +def decryptRecord(data,PID): + ctx = topazCryptoInit(PID) + return topazCryptoDecrypt(data, ctx) + +# +# Try to decrypt a dkey record (contains the book PID) +# + +def decryptDkeyRecord(data,PID): + record = decryptRecord(data,PID) + fields = unpack("3sB8sB8s3s",record) + + if fields[0] != "PID" or fields[5] != "pid" : + raise CMBDTCError("Didn't find PID magic numbers in record") + elif fields[1] != 8 or fields[3] != 8 : + raise CMBDTCError("Record didn't contain correct length fields") + elif fields[2] != PID : + raise CMBDTCError("Record didn't contain PID") + + return fields[4] + +# +# Decrypt all the book's dkey records (contain the book PID) +# + +def decryptDkeyRecords(data,PID): + nbKeyRecords = ord(data[0]) + records = [] + data = data[1:] + for i in range (0,nbKeyRecords): + length = ord(data[0]) + try: + key = decryptDkeyRecord(data[1:length+1],PID) + records.append(key) + except CMBDTCError: + pass + data = data[1+length:] + + return records + +# +# Create decrypted book payload +# + +def createDecryptedPayload(payload): + for headerRecord in bookHeaderRecords: + name = headerRecord + if name != "dkey" : + ext = '.dat' + if name == 'img' : ext = '.jpg' + for index in range (0,len(bookHeaderRecords[name])) : + fnum = "%04d" % index + fname = name + fnum + ext + destdir = payload + if name == 'img': + destdir = os.path.join(payload,'img') + if name == 'page': + destdir = os.path.join(payload,'page') + if name == 'glyphs': + destdir = os.path.join(payload,'glyphs') + outputFile = os.path.join(destdir,fname) + file(outputFile, 'wb').write(getBookPayloadRecord(name, index)) + + +# Create decrypted book +# + +def createDecryptedBook(outdir): + if not os.path.exists(outdir): + os.makedirs(outdir) + + destdir = os.path.join(outdir,'img') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'page') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'glyphs') + if not os.path.exists(destdir): + os.makedirs(destdir) + + createDecryptedPayload(outdir) + + +# +# Set the command to execute by the programm according to cmdLine parameters +# + +def setCommand(name) : + global command + if command != "" : + raise CMBDTCFatal("Invalid command line parameters") + else : + command = name + +# +# Program usage +# + +def usage(): + print("\nUsage:") + print("\ncmbtc_dump_linux.py [options] bookFileName\n") + print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)") + print("-d Dumps the unencrypted book as files to outdir") + print("-o Output directory to save book files to") + print("-v Verbose (can be used several times)") + + +# +# Main +# + +def main(argv=sys.argv): + global bookMetadata + global bookKey + global bookFile + global command + + progname = os.path.basename(argv[0]) + + verbose = 0 + recordName = "" + recordIndex = 0 + outdir = "" + PIDs = [] + command = "" + + # Preloads your Kindle pid from the top of the program. + PIDs.append(kindlePID) + + try: + opts, args = getopt.getopt(sys.argv[1:], "vo:p:d") + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o == "-v": + verbose+=1 + if o =="-o": + if a == None : + raise CMBDTCFatal("Invalid parameter for -o") + outdir = a + if o =="-p": + PIDs.append(a) + if o =="-d": + setCommand("doit") + + if command == "" : + raise CMBDTCFatal("No action supplied on command line") + + # + # Open book and parse metadata + # + + if len(args) == 1: + + bookFile = openBook(args[0]) + parseTopazHeader() + parseMetadata() + + # + # Decrypt book key + # + + dkey = getBookPayloadRecord('dkey', 0) + + bookKeys = [] + for PID in PIDs : + bookKeys+=decryptDkeyRecords(dkey,PID) + + if len(bookKeys) == 0 : + if verbose > 0 : + print ("Book key could not be found. Maybe this book is not registered with this device.") + else : + bookKey = bookKeys[0] + if verbose > 0: + print("Book key: " + bookKey.encode('hex')) + + + + if command == "printRecord" : + extractBookPayloadRecord(recordName,int(recordIndex),outputFile) + if outputFile != "" and verbose>0 : + print("Wrote record to file: "+outputFile) + elif command == "doit" : + if outdir != "" : + createDecryptedBook(outdir) + if verbose >0 : + print ("Decrypted book saved. Don't pirate!") + elif verbose > 0: + print("Output directory name was not supplied.") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index f2dd244..9e3512e 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -13,7 +13,8 @@ from struct import unpack class DocParser(object): def __init__(self, flatxml, classlst, fileid): self.id = os.path.basename(fileid).replace('.dat','') - self.flatdoc = flatxml.split('\n') + self.docList = flatxml.split('\n') + self.docSize = len(self.docList) self.classList = {} tmpList = classlst.split('\n') for pclass in tmpList: @@ -29,12 +30,10 @@ class DocParser(object): self.paracont_stemid = [] self.parastems_stemid = [] - # find tag if within pos to end inclusive + # return tag at line pos in document def lineinDoc(self, pos) : - docList = self.flatdoc - cnt = len(docList) - if (pos >= 0) and (pos < cnt) : - item = docList[pos] + if (pos >= 0) and (pos < self.docSize) : + item = self.docList[pos] if item.find('=') >= 0: (name, argres) = item.split('=',1) else : @@ -43,20 +42,18 @@ class DocParser(object): return name, argres - # find tag if within pos to end inclusive + # find tag in doc if within pos to end inclusive def findinDoc(self, tagpath, pos, end) : result = None - docList = self.flatdoc - cnt = len(docList) if end == -1 : - end = cnt + end = self.docSize else: - end = min(cnt,end) + end = min(self.docSize, end) foundat = -1 for j in xrange(pos, end): - item = docList[j] + item = self.docList[j] if item.find('=') >= 0: - (name, argres) = item.split('=') + (name, argres) = item.split('=',1) else : name = item argres = '' @@ -85,7 +82,7 @@ class DocParser(object): result = [] - # normal paragraph + # paragraph (pos, pclass) = self.findinDoc('paragraph.class',start,end) # class names are an issue given topaz may start them with numerals (not allowed), @@ -94,19 +91,20 @@ class DocParser(object): # from a base class (but then not actually provide all of these _reclustereed # classes in the stylesheet! - # so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass + # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # that exists in the stylesheet first, and then adding this specific class # after - classres = '' - pclass = pclass.lower() - pclass = 'cl-' + pclass - p = pclass.find('_') - if p > 0 : - baseclass = pclass[0:p] - if baseclass in self.classList: - classres += baseclass + ' ' - classres += pclass - pclass = classres + if pclass != None : + classres = '' + pclass = pclass.lower() + pclass = 'cl-' + pclass + p = pclass.find('_') + if p > 0 : + baseclass = pclass[0:p] + if baseclass in self.classList: + classres += baseclass + ' ' + classres += pclass + pclass = classres # build up a description of the paragraph in result and return it # first check for the basic - all words paragraph @@ -128,9 +126,7 @@ class DocParser(object): # if end is -1 then we must search to end of document if end == -1 : - docList = self.flatdoc - cnt = len(docList) - end = cnt + end = self.docSize while (line < end) : @@ -171,20 +167,20 @@ class DocParser(object): return pclass, result - def buildParagraph(self, cname, pdesc, type, regtype) : + def buildParagraph(self, pclass, pdesc, type, regtype) : parares = '' sep ='' - br_lb = False - if (regtype == 'fixed') or (regtype == 'chapterheading'): - br_lb = True + classres = '' + if pclass : + classres = ' class="' + pclass + '"' - handle_links = False - if len(self.link_id) > 0: - handle_links = True + br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') + handle_links = len(self.link_id) > 0 + if (type == 'full') or (type == 'begin') : - parares += '

' + parares += '' if (type == 'end'): parares += ' ' @@ -218,10 +214,7 @@ class DocParser(object): if word == '_link_' : word = '' if word == '_lb_': - if (num-1) in self.dehyphen_rootid : - word = '' - sep = '' - elif handle_links : + if ((num-1) in self.dehyphen_rootid ) or handle_links: word = '' sep = '' elif br_lb : @@ -261,43 +254,51 @@ class DocParser(object): htmlpage = '' - # first collect information from the xml doc that describes this page + # get the ocr text (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) if argres : self.ocrtext = argres.split('|') + # get information to dehyphenate the text (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1) if argres: argList = argres.split('|') self.dehyphen_rootid = [ int(strval) for strval in argList] + # determine if first paragraph is continued from previous page (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) - if self.parastems_stemid == None : self.parastems_stemid = [] - + first_para_continued = (self.parastems_stemid != None) + + # determine if last paragraph is continued onto the next page (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) - if self.paracont_stemid == None : self.paracont_stemid = [] - + last_para_continued = (self.paracont_stemid != None) + # collect link ids (pos, argres) = self.findinDoc('info.word.link_id',0,-1) if argres: argList = argres.split('|') self.link_id = [ int(strval) for strval in argList] + # collect link destination page numbers (pos, argres) = self.findinDoc('info.links.page',0,-1) if argres : argList = argres.split('|') self.link_page = [ int(strval) for strval in argList] + # collect link titles (pos, argres) = self.findinDoc('info.links.title',0,-1) if argres : self.link_title = argres.split('|') else: self.link_title.append('') + + # get page type (pos, pagetype) = self.findinDoc('page.type',0,-1) # generate a list of each region starting point # each region has one paragraph,, or one image, or one chapterheading + regionList= self.posinDoc('region') regcnt = len(regionList) regionList.append(-1) @@ -308,47 +309,48 @@ class DocParser(object): # process each region tag and convert what you can to html for j in xrange(regcnt): + start = regionList[j] end = regionList[j+1] (pos, regtype) = self.findinDoc('region.type',start,end) + # set anchor for link target on this page + if not anchorSet and not first_para_continued: + htmlpage += '

 
\n' + anchorSet = True + if regtype == 'graphic' : - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '
' % int(simgsrc) + elif regtype == 'chapterheading' : (pclass, pdesc) = self.getParaDescription(start,end) if not breakSet: htmlpage += '
 
\n' breakSet = True - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True tag = 'h1' - if pclass[3:7] == 'ch1-' : tag = 'h1' - if pclass[3:7] == 'ch2-' : tag = 'h2' - if pclass[3:7] == 'ch3-' : tag = 'h3' - htmlpage += '<' + tag + ' class="' + pclass + '">' + if pclass and (len(pclass) >= 7): + if pclass[3:7] == 'ch1-' : tag = 'h1' + if pclass[3:7] == 'ch2-' : tag = 'h2' + if pclass[3:7] == 'ch3-' : tag = 'h3' + htmlpage += '<' + tag + ' class="' + pclass + '">' + else: + htmlpage += '<' + tag + '>' htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += '' + elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') : ptype = 'full' # check to see if this is a continution from the previous page - if (len(self.parastems_stemid) > 0): + if first_para_continued : ptype = 'end' - self.parastems_stemid=[] - else: - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True + first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end) - if ptype == 'full' : + if pclass and (len(pclass) >= 6) and (ptype == 'full'): tag = 'p' if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h2-' : tag = 'h5' @@ -359,28 +361,22 @@ class DocParser(object): else : htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + elif (regtype == 'tocentry') : ptype = 'full' - # check to see if this is a continution from the previous page - if (len(self.parastems_stemid) > 0) and (j == 0): - # process the first paragraph as a continuation from the last page + if first_para_continued : ptype = 'end' - self.parastems_stemid = [] - else: - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True + first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '
' % int(simgsrc) + else : print 'Warning: Unknown region type', regtype (pos, temp) = self.findinDoc('paragraph',start,end) @@ -389,15 +385,11 @@ class DocParser(object): regtype = 'fixed' ptype = 'full' # check to see if this is a continution from the previous page - if (len(self.parastems_stemid) > 0): + if first_para_continued : ptype = 'end' - self.parastems_stemid=[] - else: - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True + first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end) - if ptype == 'full' : + if pclass and (ptype == 'full') and (len(pclass) >= 6): tag = 'p' if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h2-' : tag = 'h5' @@ -408,24 +400,20 @@ class DocParser(object): else : htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) else : - print 'Treating this like a "image" region' - if not anchorSet: - htmlpage += '
 
\n' - anchorSet = True + print 'Treating this like a "graphic" region' (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '
' % int(simgsrc) - if len(self.paracont_stemid) > 0 : + + if last_para_continued : if htmlpage[-4:] == '

': - htmlpage = htmlpage[0:-4] + htmlpage = htmlpage[0:-4] + last_para_continued = False return htmlpage - return self.convert2HTML() - - def convert2HTML(flatxml, classlst, fileid): diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt index c9fcb61..210deb3 100644 --- a/Topaz_Tools/lib/readme.txt +++ b/Topaz_Tools/lib/readme.txt @@ -3,7 +3,7 @@ Contributors: clarknova - for all of the svg and glyph generation and many other bug fixes and improvements skindle - for figuing out the general case for the mode loops some updates - for conversion to xml, basic html - DiapDealer - for extensive testing and feedback + DiapDealer - for extensive testing and feedback, and standalone linux/macosx version of cmbtc_dump stewball - for extensive testing and feedback and others for posting, feedback and testing @@ -29,6 +29,17 @@ genxml.py - main program to convert everything to xml genhtml.py - main program to generate "book.html" gensvg.py - (author: clarknova) main program to create an svg grpahic of each page + +In addition there is now a new file: + +cmbtc_dump_mac_linux.py + +If you know the pid of your ipod and/or your standalone Kindle and your book +was meant for that device, you can use this program to dump the proper sections +on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed). +Thank DiapDealer for creating it! + + Please note, gensvg.py, genhtml.py, and genxml.py import and use decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py so please keep all of these python scripts together in the same place.