#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
#
# Converts a Topaz page*.dat / glyphs*.dat / other0.dat file, with the help
# of the dict0000.dat string table, into an xml (or flattened xml) description.
#
# The code below runs unchanged on Python 2.6+ and Python 3.

from __future__ import with_statement
from __future__ import print_function

import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack


# Get a 7 bit encoded number from string. The most
# significant byte comes first and has the high bit (8th) set
def readEncodedNumber(file):
    """Read one variable-length number from the binary stream *file*.

    A leading 0xFF byte marks the value as negative.  Every byte with the
    high bit set contributes its low 7 bits and announces that another byte
    follows.  Returns the decoded int, or None when the stream ends before a
    complete number has been read.
    """
    flag = False
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)

    if data == 0xFF:
        # negative marker: the magnitude follows
        flag = True
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)

    if data >= 0x80:
        datax = data & 0x7F
        while data >= 0x80:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            datax = (datax << 7) + (data & 0x7F)
        data = datax

    if flag:
        data = -data
    return data


# returns a binary string that encodes a number into 7 bits
# most significant byte first which has the high bit set
def encodeNumber(number):
    """Return *number* as a 7-bit encoded byte string (str on Python 2).

    NOTE(review): a negative number is stored as 0xFF followed by the
    encoding of (-number + 1), while readEncodedNumber() only negates the
    magnitude, so negatives do not round-trip.  This is kept as found
    because the on-disk convention cannot be confirmed from this file.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    # bytes are produced least significant group first and reversed at the end
    encoded = bytearray()
    flag = 0
    while True:
        encoded.append((number & 0x7F) + flag)
        number = number >> 7
        flag = 0x80
        if number == 0:
            break

    if negative:
        encoded.append(0xFF)

    encoded.reverse()
    return bytes(encoded)


# create / read a length prefixed string from the file
def lengthPrefixString(data):
    """Return the byte string *data* prefixed with its encoded length."""
    return encodeNumber(len(data)) + data


def readString(file):
    """Read a length-prefixed byte string from *file*.

    Returns an empty byte string when the length is missing or fewer bytes
    than announced are available.
    """
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return b""
    sv = file.read(stringLength)
    if len(sv) != stringLength:
        return b""
    return unpack(str(stringLength) + "s", sv)[0]


# convert a binary string generated by encodeNumber (7 bit encoded number)
# to the value you would find inside the page*.dat files to be processed
def convert(i):
    """Return the hex text (two lowercase digits per byte) of encodeNumber(i)."""
    return ''.join('%02x' % c for c in bytearray(encodeNumber(i)))


# the complete string table used to store all book text content
# as well as the xml tokens and values that make sense out of it
class Dictionary(object):
    """String table loaded from a dict*.dat file.

    The file holds an encoded entry count followed by that many
    length-prefixed strings.  Entries are escaped once at load time.
    """

    def __init__(self, dictFile):
        self.filename = dictFile
        self.size = 0
        self.stable = []
        self.pos = 0
        self.fo = open(dictFile, 'rb')
        try:
            count = readEncodedNumber(self.fo)
            # an empty file yields None; treat it as an empty table
            self.size = count if count is not None else 0
            for i in range(self.size):
                raw = readString(self.fo)
                if not isinstance(raw, str):
                    # Python 3 only: turn bytes into text so entries can be
                    # used as tag names and joined with other text.
                    # assumes the table is UTF-8 - TODO confirm
                    raw = raw.decode('utf-8', 'replace')
                self.stable.append(self.escapestr(raw))
        finally:
            # the whole table is in memory now; do not leak the handle
            self.fo.close()

    def escapestr(self, str):
        """Escape characters that would corrupt the generated xml.

        '&' is replaced first so the entities added afterwards are not
        escaped a second time.  '=' is escaped as well because the flattened
        output uses it to separate a tag path from its value.
        """
        str = str.replace('&', '&amp;')
        str = str.replace('<', '&lt;')
        str = str.replace('>', '&gt;')
        str = str.replace('=', '&#61;')
        return str

    def lookup(self, val):
        """Return entry *val*; print an error and exit(-1) if out of range."""
        if (val >= 0) and (val < self.size):
            self.pos = val
            return self.stable[self.pos]
        else:
            print("Error - %d outside of string table limits" % val)
            sys.exit(-1)

    def getSize(self):
        return self.size

    def getPos(self):
        return self.pos

    def dumpDict(self):
        """Print every entry as: index, encoded index in hex, string."""
        for i in range(self.size):
            print("%d %s %s" % (i, convert(i), self.stable[i]))
        return


# parses the xml snippets that are represented by each page*.dat file.
# also parses the other0.dat file - the main stylesheet
# and information used to inject the xml snippets into page*.dat files
class PageParser(object):
    """Decoder for one *.dat file.

    A parsed tag is a 4 item list: [full tag path, list of subtags,
    argument type, list of arguments].  An unknown token is represented by
    an empty list.
    """

    def __init__(self, filename, dict, debug, flat_xml):
        self.fo = open(filename, 'rb')
        self.id = os.path.basename(filename).replace('.dat', '')
        self.dict = dict
        self.debug = debug
        self.flat_xml = flat_xml
        self.tagpath = []
        self.doc = []
        self.snippetList = []

    # hash table used to enable the decoding process
    # This has all been developed by trial and error so it may still have omissions or
    # contain errors
    # Format:
    # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
    token_tags = {
        'book'          : (1, 'snippets', 1, 0),
        'version'       : (1, 'snippets', 1, 0),
        'stylesheet'    : (1, 'snippets', 1, 0),
        'links'         : (0, 'number', 0, 1),
        'pages'         : (0, 'number', 0, 1),
        'page'          : (1, 'snippets', 1, 0),
        'group'         : (1, 'snippets', 1, 0),
        'region'        : (1, 'snippets', 1, 0),
        'reflow'        : (1, 'number', 1, 0),
        'img'           : (1, 'snippets', 1, 0),
        'paragraph'     : (1, 'snippets', 1, 0),
        'extratokens'   : (1, 'snippets', 1, 0),
        'style'         : (1, 'snippets', 1, 0),
        'rule'          : (1, 'snippets', 1, 0),
        '_span'         : (1, 'snippets', 1, 0),
        'word_semantic' : (1, 'snippets', 1, 1),
        'value'         : (1, 'scalar_text', 0, 0),
        'h'             : (1, 'scalar_number', 0, 0),
        'w'             : (1, 'scalar_number', 0, 0),
        'firstWord'     : (1, 'scalar_number', 0, 0),
        'lastWord'      : (1, 'scalar_number', 0, 0),
        'x'             : (1, 'number', 0, 0),
        'y'             : (1, 'number', 0, 0),
        'links.page'    : (1, 'number', 0, 0),
        'link_id'       : (1, 'number', 0, 0),
        'glyph'         : (0, 'number', 1, 1),
        'glyph.h'       : (1, 'number', 0, 0),
        'glyph.w'       : (1, 'number', 0, 0),
        'sh'            : (1, 'number', 0, 0),
        'word'          : (0, 'number', 1, 1),
        'src'           : (1, 'scalar_number', 0, 0),
        'rel'           : (1, 'number', 0, 0),
        'row'           : (1, 'number', 0, 0),
        'startID'       : (1, 'number', 0, 1),
        'startID.page'  : (1, 'number', 0, 0),
        'glyphID'       : (1, 'number', 0, 0),
        'rootID'        : (1, 'number', 0, 0),
        'stemID'        : (1, 'number', 0, 0),
        'margin-top'    : (1, 'number', 0, 0),
        'stemPage'      : (1, 'number', 0, 0),
        'dehyphen'      : (1, 'number', 1, 1),
        'paraCont'      : (1, 'number', 1, 1),
        'paraStems'     : (1, 'number', 1, 1),
        'wordStems'     : (1, 'number', 1, 1),
        'original'      : (0, 'number', 0, 1),
        'use'           : (1, 'number', 0, 0),
        'vtx'           : (1, 'number', 0, 1),
        'len'           : (1, 'number', 0, 1),
        'dpi'           : (1, 'number', 0, 0),
        'n'             : (1, 'number', 0, 0),
        'id'            : (1, 'number', 0, 0),
        'ref'           : (1, 'number', 0, 0),
        'pnum'          : (1, 'number', 0, 0),
        'pid'           : (1, 'text', 0, 0),
        'info'          : (0, 'number', 1, 0),
        'bl'            : (1, 'raw', 0, 0),
        'firstGlyph'    : (1, 'raw', 0, 0),
        'lastGlyph'     : (1, 'raw', 0, 0),
        'ocrText'       : (1, 'text', 0, 0),
        'title'         : (1, 'text', 0, 0),
        'href'          : (1, 'text', 0, 0),
        '_parent_type'  : (1, 'text', 0, 0),
        'attr'          : (1, 'scalar_text', 0, 0),
        'justify'       : (1, 'scalar_text', 0, 0),
        'align'         : (1, 'scalar_text', 0, 0),
        'layout'        : (1, 'scalar_text', 0, 0),
        'pageid'        : (1, 'scalar_text', 0, 0),
        'pagelabel'     : (1, 'scalar_text', 0, 0),
        'type'          : (1, 'text', 0, 0),
        'class'         : (1, 'scalar_text', 0, 0),
        'container'     : (1, 'scalar_text', 0, 0),
        '_after_class'  : (1, 'scalar_text', 0, 0),
        '_tag'          : (1, 'scalar_text', 0, 0),
        'pos'           : (1, 'scalar_text', 0, 0),
        'page_num'      : (1, 'scalar_text', 0, 0),
        'page_type'     : (1, 'scalar_text', 0, 0),
        'findlists'     : (1, 'scalar_text', 0, 0),
        'FlowEdit_1_id'            : (1, 'scalar_text', 0, 0),
        'FlowEdit_1_version'       : (1, 'scalar_text', 0, 0),
        'Schema_id'                : (1, 'scalar_text', 0, 0),
        'Schema_version'           : (1, 'scalar_text', 0, 0),
        'Topaz_version'            : (1, 'scalar_text', 0, 0),
        'WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0),
        'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
        'ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0),
        'ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0),
        'chapterheaders'           : (1, 'scalar_text', 0, 0),
        'creation_date'            : (1, 'scalar_text', 0, 0),
        'header_footer'            : (1, 'scalar_text', 0, 0),
        'init_from_ocr'            : (1, 'scalar_text', 0, 0),
        'letter_insertion'         : (1, 'scalar_text', 0, 0),
        'xmlinj_convert'           : (1, 'scalar_text', 0, 0),
        'xmlinj_reflow'            : (1, 'scalar_text', 0, 0),
        'xmlinj_transform'         : (1, 'scalar_text', 0, 0),
    }

    # full tag path record keeping routines
    def tag_push(self, token):
        self.tagpath.append(token)

    def tag_pop(self):
        if len(self.tagpath) > 0:
            self.tagpath.pop()

    def tagpath_len(self):
        return len(self.tagpath)

    def get_tagpath(self, i):
        """Return the dotted tag path from depth *i* to the current tag.

        An *i* at or beyond the current depth gives an empty string.
        """
        return '.'.join(self.tagpath[i:])

    # list of absolute command byte values values that indicate
    # various types of loop meachanisms typically used to generate vectors
    cmd_list = (0x76,)

    # peek at and return 1 byte that is ahead by i bytes
    def peek(self, aheadi):
        """Return the byte *aheadi* positions ahead without consuming input.

        Returns None when fewer than *aheadi* bytes remain.
        """
        c = self.fo.read(aheadi)
        if len(c) > 0:
            # rewind by what was actually read, which may be less than asked
            self.fo.seek(-len(c), 1)
        if (len(c) == 0) or (len(c) < aheadi):
            return None
        return ord(c[-1:])

    # get the next value from the file being processed
    def getNext(self):
        """Return the next encoded number, or None at end of file."""
        if self.peek(1) is None:
            return None
        return readEncodedNumber(self.fo)

    # format an arg by argtype
    def formatArg(self, arg, argtype):
        """Translate a raw argument: text types become dictionary strings,
        every other known type is passed through unchanged.  An unknown
        *argtype* prints an error and exits with status -2.
        """
        if argtype in ('text', 'scalar_text'):
            result = self.dict.lookup(arg)
        elif argtype in ('raw', 'number', 'scalar_number'):
            result = arg
        elif argtype == 'snippets':
            result = arg
        else:
            print("Error Unknown argtype %s" % argtype)
            sys.exit(-2)
        return result

    # process the next tag token, recursively handling subtags,
    # arguments, and commands
    def procToken(self, token):
        """Parse the tag named *token* from the stream.

        Returns [tag path, subtags, argtype, args] for a known token and an
        empty list for an unknown one.
        """
        known_token = False
        self.tag_push(token)
        if self.debug:
            print('Processing: ', self.get_tagpath(0))

        # look for the longest tag path suffix that has a table entry
        cnt = self.tagpath_len()
        for j in range(cnt):
            tkn = self.get_tagpath(j)
            if tkn in self.token_tags:
                num_args, argtype, subtags, splcase = self.token_tags[tkn]
                ntags = -1
                known_token = True
                break

        if not known_token:
            # all tokens that need to be processed should be in the hash
            # table if it may indicate a problem, either new token
            # or an out of sync condition
            if self.debug:
                print('Unknown Token:', token)
            self.tag_pop()
            return []

        # handle subtags if present
        subtagres = []
        if splcase == 1:
            # this type of tag uses of escape marker 0x74 indicate subtag count
            if self.peek(1) == 0x74:
                skip = readEncodedNumber(self.fo)
                subtags = 1
                num_args = 0

        if subtags == 1:
            ntags = readEncodedNumber(self.fo)
            if self.debug:
                print('subtags: ' + token + ' has ' + str(ntags))
            for j in range(ntags):
                val = readEncodedNumber(self.fo)
                subtagres.append(self.procToken(self.dict.lookup(val)))

        # arguments can be scalars or vectors of text or numbers
        argres = []
        if num_args > 0:
            firstarg = self.peek(1)
            if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
                # single argument is a variable length vector of data
                arg = readEncodedNumber(self.fo)
                argres = self.decodeCMD(arg, argtype)
            else:
                # num_arg scalar arguments
                for i in range(num_args):
                    argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))

        # build the return tag
        result = [self.get_tagpath(0), subtagres, argtype, argres]
        self.tag_pop()
        return result

    # special loop used to process code snippets
    # it is NEVER used to format arguments.
    # builds the snippetList
    def doLoop72(self, argtype):
        cnt = readEncodedNumber(self.fo)
        if self.debug:
            result = 'Set of ' + str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print(result)
        for i in range(cnt):
            if self.debug:
                print('Snippet:', str(i))
            val = readEncodedNumber(self.fo)
            snippet = [i, self.procToken(self.dict.lookup(val))]
            self.snippetList.append(snippet)
        return

    # loop: pass though values unchanged
    # DO NOT CHANGE - this has proven to be correct
    def doLoop76Mode0(self, argtype, cnt):
        result = []
        for i in range(cnt):
            result.append(self.formatArg(readEncodedNumber(self.fo), argtype))
        return result

    # loop generating values relative to the *negative*
    # of the offset - don't ask why - it just is
    # DO NOT CHANGE - this has proven to be correct
    def doLoop76Mode1(self, argtype, cnt):
        result = []
        offset = -readEncodedNumber(self.fo)
        for i in range(cnt):
            val = readEncodedNumber(self.fo) + offset
            result.append(self.formatArg(val, argtype))
        return result

    # loop generating values with starting value and accumulation
    # DO NOT CHANGE - this has proven to be the correct
    def doLoop76Mode2(self, argtype, cnt):
        result = []
        ptr = readEncodedNumber(self.fo)
        result.append(self.formatArg(ptr, argtype))
        for i in range(cnt - 1):
            ptr = ptr + readEncodedNumber(self.fo)
            result.append(self.formatArg(ptr, argtype))
        return result

    # loop generating values with starting value and accumulation
    # **after** subtracting adjustment value from each
    # DO NOT CHANGE - this has been proven to be correct
    def doLoop76Mode3(self, argtype, cnt):
        result = []
        adj = readEncodedNumber(self.fo)
        ptr = readEncodedNumber(self.fo)
        ptr = ptr - adj
        result.append(self.formatArg(ptr, argtype))
        for i in range(cnt - 1):
            ptr = ptr + readEncodedNumber(self.fo) - adj
            result.append(self.formatArg(ptr, argtype))
        return result

    # loop using runing sum of data values and starting value
    # with accumulation to get new value
    # Again, don't ask it took me forever to figure this out
    # DO NOT CHANGE - this has been proven to be correct
    def doLoop76Mode4(self, argtype, cnt):
        result = []
        val = readEncodedNumber(self.fo)
        runsum = val
        ptr = val
        result.append(self.formatArg(ptr, argtype))
        for i in range(cnt - 1):
            runsum += readEncodedNumber(self.fo)
            ptr = ptr + runsum
            result.append(self.formatArg(ptr, argtype))
        return result

    # loop using and extra value as an adjustment
    # and a running sum of the values after subtracting
    # the adjustment, added to a ptr to get a new pointer
    def doLoop76Mode5(self, argtype, cnt):
        result = []
        adj = readEncodedNumber(self.fo)
        ptr = 0
        runsum = 0
        for i in range(cnt):
            val = readEncodedNumber(self.fo)
            runsum += (val - adj)
            ptr = ptr + runsum
            result.append(self.formatArg(ptr, argtype))
        return result

    # FIXME: I have only 4 points to work this out with inside my book
    # So may be wrong but it is correct for my 4 points
    def doLoop76Mode6(self, argtype, cnt):
        result = []
        oldval = 0
        for i in range(cnt):
            val = readEncodedNumber(self.fo)
            ptr = (3 * oldval) + val + 1
            result.append(self.formatArg(ptr, argtype))
            oldval = val
        return result

    # dispatches loop commands bytes with various modes
    # The 0x76 style loops are used to build vectors
    # This was all derived by trial and error and
    # new loop types may exist that are not handled here
    # since they did not appear in the test cases
    def decodeCMD(self, cmd, argtype):
        """Decode the vector that follows command byte *cmd*.

        Returns the list of formatted values, or an empty list for an
        unknown command or loop mode.
        """
        # if (cmd == 0x72):
        #     self.doLoop72(argtype)
        #     result =[]
        #     return result

        if cmd == 0x76:
            # loop with cnt, and mode to control loop styles
            cnt = readEncodedNumber(self.fo)
            mode = readEncodedNumber(self.fo)

            if self.debug:
                print('Loop for', cnt, 'with mode', mode, ': ')

            if mode == 0x00:
                return self.doLoop76Mode0(argtype, cnt)
            elif mode == 0x01:
                return self.doLoop76Mode1(argtype, cnt)
            elif mode == 0x02:
                return self.doLoop76Mode2(argtype, cnt)
            elif mode == 0x03:
                return self.doLoop76Mode3(argtype, cnt)
            elif mode == 0x04:
                return self.doLoop76Mode4(argtype, cnt)
            elif mode == 0x05:
                return self.doLoop76Mode5(argtype, cnt)
            elif mode == 0x06:
                return self.doLoop76Mode6(argtype, cnt)
            else:
                if self.debug:
                    # try to mark any unknown loop comands
                    # if they exist, unless they are used to process
                    # text or some other known list, we won't be able to prove them correct
                    # NOTE(review): the cnt values are consumed only in debug
                    # mode, so the stream position after an unknown mode
                    # depends on the debug flag - confirm this is intended
                    print('*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode))
                    for i in range(cnt):
                        val = readEncodedNumber(self.fo)
                        print(' 0x%x' % val, end=' ')
                    print(' ')
                return []

        if self.debug:
            print("Unknown command", cmd)
        return []

    # add full tag path to injected snippets
    def updateName(self, tag, prefix):
        """Return a copy of *tag* whose name, and the names of all of its
        subtags, are prefixed with *prefix* + '.'.  The argument list is
        shared with the original tag.  An empty (unknown) tag is returned
        unchanged.
        """
        if len(tag) == 0:
            return tag
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        nsubtaglist = [self.updateName(j, prefix) for j in subtagList]
        return [prefix + '.' + name, nsubtaglist, argtype, argList]

    # perform depth first injection of specified snippets into this one
    def injectSnippets(self, snippet):
        """Replace the snippet numbers held as arguments of *snippet*'s tag
        by the referenced snippets themselves, recursively.

        Returns a new [snippet number, tag] pair.  The subtag list of the
        original tag is extended in place.
        """
        snipno, tag = snippet
        if len(tag) == 0:
            # the snippet's token was unknown, there is nothing to inject into
            return [snipno, tag]
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        nsubtagList = []
        if len(argList) > 0:
            for j in argList:
                asnip = self.snippetList[j]
                aso, atag = self.injectSnippets(asnip)
                nsubtagList.append(self.updateName(atag, name))
            # the snippet numbers have been consumed
            argtype = 'number'
            argList = []
        if len(nsubtagList) > 0:
            subtagList.extend(nsubtagList)
        return [snipno, [name, subtagList, argtype, argList]]

    # format the tag for output
    def formatTag(self, node):
        """Return *node* as indented xml text, one element per line.

        The indent is 3 spaces per level of the dotted tag path.  Text
        arguments are separated by '|', all others by ','.
        """
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        fullpathname = name.split('.')
        nodename = fullpathname.pop()
        indent = ' ' * (3 * len(fullpathname))
        closing = '</' + nodename + '>\n'

        parts = [indent, '<', nodename, '>']
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = ','.join(str(j) for j in argList)
            if argtype == 'snippets':
                parts.append('snippets:')
            parts.append(argres)

        if len(subtagList) > 0:
            parts.append('\n')
            for j in subtagList:
                if len(j) > 0:
                    parts.append(self.formatTag(j))
            # children sit on their own lines, so the end tag is indented
            parts.append(indent + closing)
        else:
            parts.append(closing)
        return ''.join(parts)

    # flatten tag
    def flattenTag(self, node):
        """Return *node* and its subtags as 'full.tag.path=arg|arg' lines."""
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        parts = [name]
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = '|'.join(str(j) for j in argList)
            if argtype == 'snippets':
                parts.append('.snippets=' + argres)
            else:
                parts.append('=' + argres)
        parts.append('\n')
        for j in subtagList:
            if len(j) > 0:
                parts.append(self.flattenTag(j))
        return ''.join(parts)

    # reduce create xml output
    def formatDoc(self, flat_xml):
        """Return the whole parsed document as flattened or indented text."""
        render = self.flattenTag if flat_xml else self.formatTag
        result = ''.join(render(j) for j in self.doc if len(j) > 0)
        if self.debug:
            print(result)
        return result

    # main loop - parse the page.dat files
    # to create structured document and snippets

    # FIXME: value at end of magic appears to be a subtags count
    # but for what? For now, inject an 'info" tag as it is in
    # every dictionary and seems close to what is meant
    # The alternative is to special case the last _ "0x5f" to mean something
    def process(self):
        """Parse the file given at construction and return its xml text."""
        # peek at the first bytes to see what type of file it is
        magic = self.fo.read(11)
        if (magic[0:1] == b'p') and (magic[2:10] == b'__PAGE__'):
            first_token = 'info'
        elif (magic[0:1] == b'g') and (magic[2:11] == b'__GLYPH__'):
            skip = self.fo.read(1)
            first_token = 'info'
        else:
            # other0.dat file
            first_token = None
            # rewind what was read; the file may be shorter than 11 bytes
            self.fo.seek(-len(magic), 1)

        # main loop to read and build the document tree
        while True:
            if first_token is not None:
                # use "inserted" first token 'info' for page and glyph files
                tag = self.procToken(first_token)
                if len(tag) > 0:
                    self.doc.append(tag)
                first_token = None

            v = self.getNext()
            if v is None:
                break

            if v == 0x72:
                self.doLoop72('number')
            elif (v > 0) and (v < self.dict.getSize()):
                tag = self.procToken(self.dict.lookup(v))
                if len(tag) > 0:
                    self.doc.append(tag)
            else:
                if self.debug:
                    print("Mina Loop: Unknown value: %x" % v)

        # now do snippet injection
        if len(self.snippetList) > 0:
            if self.debug:
                print('Injecting Snippets:')
            snippet = self.injectSnippets(self.snippetList[0])
            snipno = snippet[0]
            tag_add = snippet[1]
            if len(tag_add) > 0:
                if self.debug:
                    print(self.formatTag(tag_add))
                self.doc.append(tag_add)

        # handle generation of xml output
        xmlpage = self.formatDoc(self.flat_xml)
        return xmlpage


def usage():
    print('Usage: ')
    print(' convert2xml.py dict0000.dat infile.dat ')
    print(' ')
    print(' Options:')
    print(' -h print this usage help message ')
    print(' -d turn on debug output to check for potential errors ')
    print(' --flat-xml output the flattened xml page description only ')
    print(' ')
    print(' This program will attempt to convert a page*.dat file or ')
    print(' glyphs*.dat file, using the dict0000.dat file, to its xml description. ')
    print(' ')
    print(' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ')
    print(' the *.dat files from a Topaz format e-book.')


#
# Main
#
def main(argv):
    """Convert one *.dat file to its xml description.

    *argv* is either an empty string, in which case sys.argv is used and the
    result is printed (returns 0), or a whitespace separated command line
    whose first word is the program name, in which case the xml text is
    returned.  Exits with status 2 on a usage error.
    """
    dictFile = ""
    pageFile = ""
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv
    else:
        argv = argv.split()

    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    if len(opts) == 0 and len(args) == 0:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o == "-d":
            debug = True
        if o == "-h":
            usage()
            sys.exit(0)
        if o == "--flat-xml":
            flat_xml = True

    # both the dictionary and the page file are required
    if len(args) < 2:
        usage()
        sys.exit(2)
    dictFile, pageFile = args[0], args[1]

    # read in the string table dictionary
    string_table = Dictionary(dictFile)

    # create a page parser
    pp = PageParser(pageFile, string_table, debug, flat_xml)
    try:
        xmlpage = pp.process()
    finally:
        pp.fo.close()

    if printOutput:
        print(xmlpage)
        return 0

    return xmlpage


if __name__ == '__main__':
    sys.exit(main(''))