topaz tools 1.0 (I think)

2025-01-11 19:04:43 +06:00 · 2010-01-17 12:10:35 +00:00 · 2010-01-17 12:10:35 +00:00 · 0a437510f6
commit 0a437510f6
parent 1fc40376cf
9 changed files with 2931 additions and 0 deletions
--- a/Topaz_Tools/lib/cmbtc_dump.py
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@ -0,0 +1,865 @@
 #! /usr/bin/python
 """
 Comprehensive Mazama Book DRM with Topaz Cryptography V2.0
 -----BEGIN PUBLIC KEY-----
 MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6
 M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC
 B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx
 y2/pHuYme7U1TsgSjwIDAQAB
 -----END PUBLIC KEY-----
 """
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 import zlib
 from struct import pack
 from struct import unpack
 from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
    create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
    string_at, Structure, c_void_p, cast
 import _winreg as winreg
 import Tkinter
 import Tkconstants
 import tkMessageBox
 import traceback
 import hashlib
 # Buffer length in characters (one extra slot is added by the caller) used
 # for the GetSystemDirectoryW call below.
 MAX_PATH = 255
 # Windows system DLLs reached through ctypes.
 kernel32 = windll.kernel32
 advapi32 = windll.advapi32
 crypt32 = windll.crypt32
 # Names of the module-level variables shared by the functions below.
 # NOTE: a "global" statement at module scope does not create these names;
 # each one only exists once some function has assigned it.
 global kindleDatabase
 global bookFile
 global bookPayloadOffset
 global bookHeaderRecords
 global bookMetadata
 global bookKey
 global command
 #
 # Various character maps used to decrypt books. Probably supposed to act as obfuscation
 #
 # charMap1: alphabet passed to encode()/encodeHash() when main() builds the DSN.
 charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
 # charMap2: alphabet used to hash kindle.info record names and to decode their values.
 charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
 # charMap3: alphabet indexed by encodePID() to spell the book PID.
 charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
 # charMap4: alphabet indexed by generateDevicePID() to spell the device PID.
 charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
 #
 # Exceptions for all the problems that might happen during the script
 #
 class CMBDTCError(Exception):
    """Recoverable problem, e.g. a dkey record that does not match a PID."""
 class CMBDTCFatal(Exception):
    """Unrecoverable problem (bad file, bad command line, failed decryption)."""
 #
 # Stolen stuff
 #
 class DataBlob(Structure):
    """ctypes structure holding a byte count (cbData) and a data pointer (pbData)."""
    _fields_ = [
        ('cbData', c_uint),
        ('pbData', c_void_p),
    ]
 # Pointer type used in the CryptUnprotectData prototype.
 DataBlob_p = POINTER(DataBlob)
 def GetSystemDirectory():
    """Build and return the wrapper around kernel32.GetSystemDirectoryW.

    The prototype is configured once here; the name is rebound to the
    returned closure right after this definition.
    """
    nativeCall = kernel32.GetSystemDirectoryW
    nativeCall.argtypes = [c_wchar_p, c_uint]
    nativeCall.restype = c_uint
    def GetSystemDirectory():
        # Return the system directory as a unicode string.
        pathBuffer = create_unicode_buffer(MAX_PATH + 1)
        nativeCall(pathBuffer, len(pathBuffer))
        return pathBuffer.value
    return GetSystemDirectory
 GetSystemDirectory = GetSystemDirectory()
 def GetVolumeSerialNumber():
    """Build and return the wrapper around kernel32.GetVolumeInformationW.

    The returned closure takes a root path and returns the serial number
    written by the API (0 if the call leaves the value untouched).
    """
    volumeInfo = kernel32.GetVolumeInformationW
    volumeInfo.argtypes = [c_wchar_p, c_wchar_p, c_uint,
                           POINTER(c_uint), POINTER(c_uint),
                           POINTER(c_uint), c_wchar_p, c_uint]
    volumeInfo.restype = c_uint
    def GetVolumeSerialNumber(path):
        # Only the serial number output is requested; the rest is passed as NULL/0.
        serial = c_uint(0)
        volumeInfo(path, None, 0, byref(serial), None, None, None, 0)
        return serial.value
    return GetVolumeSerialNumber
 GetVolumeSerialNumber = GetVolumeSerialNumber()
 def GetUserName():
    """Build and return the wrapper around advapi32.GetUserNameW.

    The returned closure keeps doubling its buffer until the API call
    succeeds, then returns every second byte of the UTF-16-LE encoded name.
    """
    nativeCall = advapi32.GetUserNameW
    nativeCall.argtypes = [c_wchar_p, POINTER(c_uint)]
    nativeCall.restype = c_uint
    def GetUserName():
        nameBuffer = create_unicode_buffer(32)
        capacity = c_uint(len(nameBuffer))
        while not nativeCall(nameBuffer, byref(capacity)):
            # Call failed: retry with a buffer twice as large.
            nameBuffer = create_unicode_buffer(len(nameBuffer) * 2)
            capacity.value = len(nameBuffer)
        encodedName = nameBuffer.value.encode('utf-16-le')
        return encodedName[::2]
    return GetUserName
 GetUserName = GetUserName()
 def CryptUnprotectData():
    """Build and return the wrapper around crypt32.CryptUnprotectData.

    The returned closure takes the protected bytes and the entropy bytes,
    and returns the unprotected bytes. It raises CMBDTCFatal when the API
    call reports failure.
    """
    _CryptUnprotectData = crypt32.CryptUnprotectData
    _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
                                   c_void_p, c_void_p, c_uint, DataBlob_p]
    _CryptUnprotectData.restype = c_uint
    # The output blob is allocated by the API and must be released by the
    # caller with LocalFree; declare the pointer types so the address is not
    # truncated to a C int.
    _LocalFree = kernel32.LocalFree
    _LocalFree.argtypes = [c_void_p]
    _LocalFree.restype = c_void_p
    def CryptUnprotectData(indata, entropy):
        indatab = create_string_buffer(indata)
        indata = DataBlob(len(indata), cast(indatab, c_void_p))
        entropyb = create_string_buffer(entropy)
        entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
        outdata = DataBlob()
        if not _CryptUnprotectData(byref(indata), None, byref(entropy),
                                   None, None, 0, byref(outdata)):
            raise CMBDTCFatal("Failed to Unprotect Data")
        try:
            # string_at copies the bytes, so the buffer can be freed afterwards.
            return string_at(outdata.pbData, outdata.cbData)
        finally:
            _LocalFree(outdata.pbData)
    return CryptUnprotectData
 CryptUnprotectData = CryptUnprotectData()
 #
 # Returns the MD5 digest of "message"
 #
 def MD5(message):
    """Return the raw (binary) MD5 digest of message."""
    return hashlib.md5(message).digest()
 #
 # Returns the SHA1 digest of "message"
 #
 def SHA1(message):
    """Return the raw (binary) SHA-1 digest of message."""
    return hashlib.sha1(message).digest()
 #
 # Open the book file at path
 #
 def openBook(path):
    """Open the book file at path for binary reading.

    Returns the open file object. Raises CMBDTCFatal when the file cannot
    be opened.
    """
    try:
        return open(path,'rb')
    except (IOError, OSError):
        # Only I/O failures are translated; KeyboardInterrupt and friends
        # are no longer swallowed.
        raise CMBDTCFatal("Could not open book file: " + path)
 #
 # Encode the bytes in data with the characters in map
 #
 def encode(data, map):
    """Encode every byte of data as two characters taken from map.

    The first character is indexed by (byte XOR 0x80) // len(map), the
    second by byte % len(map).
    """
    pieces = []
    for ch in data:
        code = ord(ch)
        pieces.append(map[(code ^ 0x80) // len(map)])
        pieces.append(map[code % len(map)])
    return "".join(pieces)
 #
 # Hash the bytes in data and then encode the digest with the characters in map
 #
 def encodeHash(data,map):
    """Return the MD5 digest of data, encoded with the characters in map."""
    digest = MD5(data)
    return encode(digest, map)
 #
 # Decode the string in data with the characters in map. Returns the decoded bytes
 #
 def decode(data,map):
    """Decode the string in data (two characters per byte) using map.

    Each pair is looked up in map; the first index selects the top bits
    (times 0x40, XOR 0x80) and the second is added to it.
    """
    decoded = []
    for pos in range(0, len(data), 2):
        hi = map.find(data[pos])
        lo = map.find(data[pos + 1])
        decoded.append(pack("B", (((hi * 0x40) ^ 0x80) & 0xFF) + lo))
    return "".join(decoded)
 #
 # Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
 #
 def openKindleInfo():
    """Locate kindle.info below the user's 'Local AppData' folder and open it.

    The folder is read from the Explorer 'Shell Folders' registry key of
    the current user. Returns the file object opened in text read mode.
    """
    regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
    try:
        path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
    finally:
        # Release the registry handle even when the value is missing.
        winreg.CloseKey(regkey)
    return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
 #
 # Parse the Kindle.info file and return the records as a list of key-values
 #
 def parseKindleInfo():
    """Parse kindle.info and return its records as a {hashedKey: value} dict.

    The first character of the file is skipped; the remainder is split on
    '{' into items and each item on ':' into key and value. An item without
    ':' raises IndexError, as before.
    """
    DB = {}
    infoReader = openKindleInfo()
    try:
        infoReader.read(1)
        data = infoReader.read()
    finally:
        # The file is no longer left open after parsing.
        infoReader.close()
    for item in data.split('{'):
        splito = item.split(':')
        DB[splito[0]] = splito[1]
    return DB
 #
 # Find if the original string for a hashed/encoded string is known. If so, return the original string; otherwise return an empty string. (Totally not optimal)
 #
 def findNameForHash(hash):
    """Return the known record name whose encoded hash equals hash.

    Returns an empty string when none of the known names matches. (The
    previous version returned the last known name in that case.)
    """
    names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
    for name in names:
        if hash == encodeHash(name, charMap2):
            return name
    return ""
 #
 # Print all the records from the kindle.info file (option -i)
 #
 def printKindleInfo():
    """Print every record of the global kindleDatabase (option -i).

    Each record is preceded by its known name, or by "Unknown Record",
    and followed by its decoded and decrypted value.
    """
    for hashedKey in kindleDatabase:
        label = findNameForHash(hashedKey)
        if label == "":
            print("Unknown Record")
        else:
            print(label)
            print("--------------------------\n")
        print(getKindleInfoValueForHash(hashedKey))
        print("\n")
 #
 # Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
 #
 def getKindleInfoValueForHash(hashedKey):
    """Return the decoded and decrypted kindle.info record stored under hashedKey.

    hashedKey must already be hashed and encoded; the lookup is done in the
    global kindleDatabase.
    """
    encodedValue = kindleDatabase[hashedKey]
    protectedValue = decode(encodedValue, charMap2)
    return CryptUnprotectData(protectedValue, "")
 #
 #  Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
 #
 def getKindleInfoValueForKey(key):
    """Return the decoded and decrypted kindle.info record for the plaintext key."""
    hashedKey = encodeHash(key, charMap2)
    return getKindleInfoValueForHash(hashedKey)
 #
 # Get a 7 bit encoded number from the book file
 #
 def bookReadEncodedNumber():
    """Read one 7-bit encoded number from the global bookFile.

    A leading 0xFF byte marks a negative number. Bytes with the high bit
    set are followed by further bytes; each contributes its low 7 bits,
    most significant group first.
    """
    byte = ord(bookFile.read(1))
    negative = byte == 0xFF
    if negative:
        byte = ord(bookFile.read(1))
    total = byte & 0x7F
    while byte >= 0x80:
        byte = ord(bookFile.read(1))
        total = (total << 7) + (byte & 0x7F)
    if negative:
        return -total
    return total
 #
 # Encode a number in 7 bit format
 #
 def encodeNumber(number):
    """Return number in the 7-bit encoded format as a string.

    The most significant group comes first; every byte except the last has
    its high bit set. A negative number is stored as 0xFF followed by the
    encoding of (-number + 1).
    """
    negative = number < 0
    if negative:
        number = -number + 1
    # Groups are collected least significant first and reversed at the end.
    groups = [chr(number & 0x7F)]
    number = number >> 7
    while number != 0:
        groups.append(chr((number & 0x7F) + 0x80))
        number = number >> 7
    if negative:
        groups.append(chr(0xFF))
    groups.reverse()
    return "".join(groups)
 #
 # Get a length prefixed string from the file 
 #
 def bookReadString():
    """Read a length prefixed string from the global bookFile and return it."""
    length = bookReadEncodedNumber()
    raw = bookFile.read(length)
    return unpack("%ds" % length, raw)[0]
 #
 # Returns a length prefixed string
 #
 def lengthPrefixString(data):
    """Return data preceded by its 7-bit encoded length."""
    prefix = encodeNumber(len(data))
    return prefix + data
 #
 # Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
 #
 def bookReadHeaderRecordData():
    """Read the data of one header record at the current book file position.

    Returns a list of three-number lists
    [[offset,decompressedLength,compressedLength],...], one per entry.
    """
    count = bookReadEncodedNumber()
    entries = []
    for _entry in range(count):
        entries.append([bookReadEncodedNumber() for _field in range(3)])
    return entries
 #
 # Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
 #
 def parseTopazHeaderRecord():
    """Read one header record at the current book file position.

    The record must start with the byte 0x63. Returns [tag, entries] where
    entries is the list produced by bookReadHeaderRecordData(). Raises
    CMBDTCFatal when the marker byte is missing.
    """
    marker = ord(bookFile.read(1))
    if marker != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    tag = bookReadString()
    entries = bookReadHeaderRecordData()
    return [tag, entries]
 #
 # Parse the header of a Topaz file, get all the header records and the offset for the payload
 #
 def parseTopazHeader():
    """Parse the header of the Topaz file held in the global bookFile.

    Fills the global bookHeaderRecords ({tag: entries}), printing every
    record as it is read, and stores the file position following the 0x64
    end marker in the global bookPayloadOffset. Raises CMBDTCFatal when the
    'TPZ0' magic or the end marker is missing.
    """
    global bookHeaderRecords
    global bookPayloadOffset
    magic = unpack("4s", bookFile.read(4))[0]
    if magic != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
    recordCount = bookReadEncodedNumber()
    bookHeaderRecords = {}
    for _record in range(recordCount):
        tag, entries = parseTopazHeaderRecord()
        print("%s %s" % (tag, entries))
        bookHeaderRecords[tag] = entries
    endMarker = ord(bookFile.read(1))
    if endMarker != 0x64:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    bookPayloadOffset = bookFile.tell()
 #
 # Get a record in the book payload, given its name and index. If necessary the record is decrypted
 # and, when the header lists a compressed length for it, decompressed as well
 #
 def getBookPayloadRecord(name, index):
    """Return the payload record identified by name and index.

    The record is located through the global bookHeaderRecords, checked
    against the tag and index stored in the payload, decrypted with the
    global bookKey when its stored index is negative, and decompressed
    with zlib when the header lists a compressed length greater than 0.
    Raises CMBDTCFatal when the record is missing or does not match.
    """
    try:
        recordOffset = bookHeaderRecords[name][index][0]
    except:
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
    bookFile.seek(bookPayloadOffset + recordOffset)
    if bookReadString() != name:
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
    storedIndex = bookReadEncodedNumber()
    # A negative stored index marks an encrypted record.
    encrypted = storedIndex < 0
    if encrypted:
        storedIndex = -storedIndex - 1
    if storedIndex != index:
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
    compressed = bookHeaderRecords[name][index][2] > 0
    if compressed:
        payload = bookFile.read(bookHeaderRecords[name][index][2])
    else:
        payload = bookFile.read(bookHeaderRecords[name][index][1])
    if encrypted:
        payload = topazCryptoDecrypt(payload, topazCryptoInit(bookKey))
    if compressed:
        payload = zlib.decompress(payload)
    return payload
 #
 # Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
 #
 def extractBookPayloadRecord(name, index, filename):
    """Extract one book record and print it or save it in filename.

    The record is fetched with getBookPayloadRecord(). When that fails,
    "Could not find record" is printed and nothing else happens (the
    previous version went on and failed on an unbound variable). When
    filename is not empty the record is written there, otherwise it is
    printed. Raises CMBDTCFatal when the destination cannot be written.
    """
    try:
        record = getBookPayloadRecord(name,index)
    except Exception:
        print("Could not find record")
        return
    if filename != "":
        try:
            # "with" closes the file even when the write fails.
            with open(filename,"wb") as out:
                out.write(record)
        except (IOError, OSError):
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
 #
 # return next record [key,value] from the book metadata from the current book position
 #  
 def readMetadataRecord():
    """Read the next [key,value] pair of strings from the current book position."""
    key = bookReadString()
    value = bookReadString()
    return [key, value]
 #
 # Parse the metadata record from the book payload and return a list of [key,values]
 #
 def parseMetadata():
    """Parse the "metadata" payload record into the global bookMetadata dict.

    Seeks to the first "metadata" entry, checks its tag, skips one flags
    byte, reads a one-byte record count and then that many key/value
    pairs. Raises CMBDTCFatal when the tag does not match.
    """
    global bookMetadata
    bookMetadata = {}
    bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
    if bookReadString() != "metadata":
        raise CMBDTCFatal("Parse Error : Record Names Don't Match")
    # The flags byte is read to advance the file position; its value is unused.
    ord(bookFile.read(1))
    pairCount = ord(bookFile.read(1))
    for _pair in range(pairCount):
        key, value = readMetadataRecord()
        bookMetadata[key] = value
 #
 # Returns two bit at offset from a bit field
 #
 def getTwoBitsFromBitField(bitField,offset):
    """Return the two bits at position offset of bitField.

    Positions count two-bit groups, four per character, starting at the
    most significant bits of the first character.
    """
    byteNumber, slot = divmod(offset, 4)
    shift = 6 - 2 * slot
    return (ord(bitField[byteNumber]) >> shift) & 3
 #
 # Returns the six bits at offset from a bit field
 #    
 def getSixBitsFromBitField(bitField,offset):
    """Return the six bits at position offset of bitField.

    Positions count six-bit groups; each group is assembled from three
    consecutive two-bit groups, most significant first.
    """
    first = offset * 3
    value = 0
    for step in range(3):
        value = (value << 2) + getTwoBitsFromBitField(bitField, first + step)
    return value
 #
 # 8 bits to six bits encoding from hash to generate PID string
 #
 def encodePID(hash):
    """Build the 8 character PID string from the first 48 bits of hash.

    Each of the first eight six-bit groups of hash selects one character
    of charMap3.
    """
    letters = [charMap3[getSixBitsFromBitField(hash, position)] for position in range(8)]
    return "".join(letters)
 #
 # Context initialisation for the Topaz Crypto
 #
 def topazCryptoInit(key):
    """Initialise the Topaz crypto context from key.

    Returns [ctx1, ctx2] where ctx2 is the value ctx1 had before the last
    key character was mixed in. An empty key leaves ctx2 unassigned and
    therefore raises UnboundLocalError.
    """
    state = 0x0CAFFE19E
    for keyChar in key:
        keyByte = ord(keyChar)
        previous = state
        scrambled = ((state >> 2) * (state >> 7)) & 0xFFFFFFFF
        keyed = (keyByte * keyByte * 0x0F902007) & 0xFFFFFFFF
        state = scrambled ^ keyed
    return [state, previous]
 #
 # decrypt data with the context prepared by topazCryptoInit()
 #
 def topazCryptoDecrypt(data, ctx):
    """Decrypt data with the context prepared by topazCryptoInit().

    ctx is read as [ctx1, ctx2] and is not modified. Returns the plain
    text as a string of the same length as data.
    """
    state, previous = ctx[0], ctx[1]
    plainChars = []
    for dataChar in data:
        plainByte = (ord(dataChar) ^ ((state >> 3) & 0xFF) ^ ((previous << 3) & 0xFF)) & 0xFF
        previous = state
        # The running state is advanced with the decrypted byte.
        state = (((state >> 2) * (state >> 7)) & 0xFFFFFFFF) ^ ((plainByte * plainByte * 0x0F902007) & 0xFFFFFFFF)
        plainChars.append(chr(plainByte))
    return "".join(plainChars)
 #
 # Decrypt a payload record with the PID
 #
 def decryptRecord(data,PID):
    """Decrypt data with a crypto context initialised from PID."""
    return topazCryptoDecrypt(data, topazCryptoInit(PID))
 #
 # Try to decrypt a dkey record (contains the book PID)
 #
 def decryptDkeyRecord(data,PID):
    """Try to decrypt one dkey record with PID and return the 8 bytes it carries.

    The decrypted record must unpack as "PID", 8, <PID>, 8, <value>, "pid".
    Raises CMBDTCError when a magic string, a length field or the embedded
    PID does not match.
    """
    plain = decryptRecord(data, PID)
    head, pidLength, embeddedPID, keyLength, key, tail = unpack("3sB8sB8s3s", plain)
    if head != "PID" or tail != "pid":
        raise CMBDTCError("Didn't find PID magic numbers in record")
    if pidLength != 8 or keyLength != 8:
        raise CMBDTCError("Record didn't contain correct length fields")
    if embeddedPID != PID:
        raise CMBDTCError("Record didn't contain PID")
    return key
 #
 # Decrypt all the book's dkey records (contain the book PID)
 #
 def decryptDkeyRecords(data,PID):
    """Try PID on every dkey record in data and return the values recovered.

    data starts with a one-byte record count; each record is preceded by a
    one-byte length. Records that raise CMBDTCError are skipped.
    """
    recordCount = ord(data[0])
    keys = []
    pos = 1
    for _record in range(recordCount):
        length = ord(data[pos])
        try:
            keys.append(decryptDkeyRecord(data[pos + 1:pos + 1 + length], PID))
        except CMBDTCError:
            pass
        pos += 1 + length
    return keys
 #
 # Encryption table used to generate the device PID
 #
 def generatePidEncryptionTable() :
    """Return the 256 entry table used to generate the device PID.

    Entry i is obtained from i by eight rounds of "shift right by one and
    XOR with 0xEDB88320 when the bit shifted out was set".
    """
    table = []
    for start in range(0x100):
        value = start
        for _round in range(8):
            lowBit = value & 1
            value = value >> 1
            if lowBit != 0:
                value = value ^ 0xEDB88320
        table.append(value)
    return table
 #
 # Seed value used to generate the device PID
 #
 def generatePidSeed(table,dsn) :
    """Return the seed computed from the first four characters of dsn using table."""
    seed = 0
    for dsnChar in dsn[0:1] + dsn[1:2] + dsn[2:3] + dsn[3:4] if len(dsn) >= 4 else [dsn[k] for k in range(4)]:
        slot = (ord(dsnChar) ^ seed) & 0xFF
        seed = (seed >> 8) ^ table[slot]
    return seed
 #
 # Generate the device PID
 #
 def generateDevicePID(table,dsn,nbRoll):
    """Generate the 8 character device PID from dsn.

    The four bytes of the seed (most significant first) are repeated twice,
    the first nbRoll characters of dsn are XORed in cyclically, and each
    resulting byte selects one character of charMap4.
    """
    seed = generatePidSeed(table,dsn)
    seedBytes = [(seed >> shift) & 0xFF for shift in (24, 16, 8, 0)]
    pid = seedBytes + seedBytes
    for counter in range(nbRoll):
        pid[counter % 8] = pid[counter % 8] ^ ord(dsn[counter])
    letters = []
    for pidByte in pid:
        letters.append(charMap4[((((pidByte >> 5) & 3) ^ pidByte) & 0x1f) + (pidByte >> 7)])
    return "".join(letters)
 #
 # Create decrypted book payload
 #
 def createDecryptedPayload(payload):
    """Write every payload record except "dkey" as a file below payload.

    Records named 'img', 'page' and 'glyphs' go into the subdirectory of
    the same name, all others directly into payload. Files are called
    <name><4 digit index><ext>, with ext '.jpg' for 'img' and '.dat'
    otherwise. The directories must already exist.
    """
    for name in bookHeaderRecords:
        if name == "dkey":
            continue
        if name == 'img':
            ext = '.jpg'
        else:
            ext = '.dat'
        if name in ('img', 'page', 'glyphs'):
            destdir = os.path.join(payload, name)
        else:
            destdir = payload
        for index in range(len(bookHeaderRecords[name])):
            fname = "%s%04d%s" % (name, index, ext)
            # Fetch first so a failed read does not leave an empty file,
            # and close the output explicitly instead of relying on
            # garbage collection.
            record = getBookPayloadRecord(name, index)
            with open(os.path.join(destdir, fname), 'wb') as out:
                out.write(record)
 # Create decrypted book
 #
 def createDecryptedBook(outdir):
    """Create outdir with its img, page and glyphs subdirectories and fill it.

    Missing directories are created; the records are then written by
    createDecryptedPayload().
    """
    for sub in (None, 'img', 'page', 'glyphs'):
        if sub is None:
            folder = outdir
        else:
            folder = os.path.join(outdir, sub)
        if not os.path.exists(folder):
            os.makedirs(folder)
    createDecryptedPayload(outdir)
 #
 # Set the command to be executed by the program according to the command-line parameters
 #
 def setCommand(name) :
    """Store name in the global command.

    Raises CMBDTCFatal when a command has already been set.
    """
    global command
    if command == "":
        command = name
        return
    raise CMBDTCFatal("Invalid command line parameters")
 # 
 # Program usage
 #
 def usage():
    """Print the command line help."""
    helpLines = (
        "\nUsage:",
        "\ncmbtc_dump.py [options] bookFileName\n",
        "-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)",
        "-d Dumps the unencrypted book as files to outdir",
        "-o Output directory to save book files to",
        "-v Verbose (can be used several times)",
        "-i Prints kindle.info database",
    )
    for helpLine in helpLines:
        print(helpLine)
 #
 # Main
 #   
 def main(argv=sys.argv):
    """Command line entry point.

    Parses the options in argv (argv[0] is the program name), reads the
    kindle.info database when possible, derives the device PID and, when a
    book file is given, the book PID, then tries every PID on the book's
    dkey record and runs the selected command. Returns 0.

    Changes from the previous version: the argv parameter is honoured
    (sys.argv was always parsed), "-i" is a plain flag as documented by
    usage() (it used to demand an argument), and the name read by the
    "printRecord" branch is defined.

    Raises CMBDTCFatal when no action is supplied or two actions are given.
    """
    global kindleDatabase
    global bookMetadata
    global bookKey
    global bookFile
    global command
    verbose = 0
    # No option currently selects "printRecord"; these are its defaults.
    recordName = ""
    recordIndex = 0
    outputFile = ""
    outdir = ""
    PIDs = []
    kindleDatabase = None
    command = ""
    try:
        opts, args = getopt.getopt(argv[1:], "vio:p:d")
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err)) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)
    for o, a in opts:
        if o == "-v":
            verbose+=1
        if o == "-i":
            setCommand("printInfo")
        if o =="-o":
            if a is None :
                raise CMBDTCFatal("Invalid parameter for -o")
            outdir = a
        if o =="-p":
            PIDs.append(a)
        if o =="-d":
            setCommand("doit")
    if command == "" :
        raise CMBDTCFatal("No action supplied on command line")
    #
    # Read the encrypted database
    #
    try:
        kindleDatabase = parseKindleInfo()
    except Exception as message:
        # Best effort: without the database only PIDs given with -p are tried.
        if verbose>0:
            print(message)
    if kindleDatabase is not None :
        if command == "printInfo" :
            printKindleInfo()
        #
        # Compute the DSN
        #
        # Get the Mazama Random number
        MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
        # Get the HDD serial
        encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1)
        # Get the current user name
        encodedUsername = encodeHash(GetUserName(),charMap1)
        # concat, hash and encode
        DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
        if verbose >1:
            print("DSN: " + DSN)
        #
        # Compute the device PID
        #
        table =  generatePidEncryptionTable()
        devicePID = generateDevicePID(table,DSN,4)
        PIDs.append(devicePID)
        if verbose > 0:
            print("Device PID: " + devicePID)
    #
    # Open book and parse metadata
    #
    if len(args) == 1:
        bookFile = openBook(args[0])
        parseTopazHeader()
        parseMetadata()
        #
        # Compute book PID
        #
        # Get the account token (DSN only exists when the database was read)
        if kindleDatabase is not None:
            kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
            if verbose >1:
                print("Account Token: " + kindleAccountToken)
            keysRecord = bookMetadata["keys"]
            keysRecordRecord = bookMetadata[keysRecord]
            pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord)
            bookPID = encodePID(pidHash)
            PIDs.append(bookPID)
            if verbose > 0:
                print ("Book PID: " + bookPID )
        #
        #  Decrypt book key
        #
        dkey = getBookPayloadRecord('dkey', 0)
        bookKeys = []
        for PID in PIDs :
            bookKeys+=decryptDkeyRecords(dkey,PID)
        if len(bookKeys) == 0 :
            if verbose > 0 :
                print ("Book key could not be found. Maybe this book is not registered with this device.")
        else :
            bookKey = bookKeys[0]
            if verbose > 0:
                print("Book key: " + bookKey.encode('hex'))
            if command == "printRecord" :
                extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
                if outputFile != "" and verbose>0 :
                    print("Wrote record to file: "+outputFile)
            elif command == "doit" :
                if outdir != "" :
                    createDecryptedBook(outdir)
                    if verbose >0 :
                        print ("Decrypted book saved. Don't pirate!")
                elif verbose > 0:
                    print("Output directory name was not supplied.")
    return 0
 # When run as a script, call main() and use its return value as the exit status.
 if __name__ == '__main__':
    sys.exit(main())
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@ -0,0 +1,821 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 from struct import pack
 from struct import unpack
 # Get a 7 bit encoded number from string. The most 
 # significant byte comes first and has the high bit (8th) set
 def readEncodedNumber(file):
    """Read one 7-bit encoded number from file.

    A leading 0xFF byte marks a negative number. Bytes with the high bit
    set are followed by further bytes; each contributes its low 7 bits,
    most significant group first. Returns None when the file ends before
    the number is complete.
    """
    raw = file.read(1)
    if len(raw) == 0:
        return None
    byte = ord(raw)
    negative = byte == 0xFF
    if negative:
        raw = file.read(1)
        if len(raw) == 0:
            return None
        byte = ord(raw)
    total = byte & 0x7F
    while byte >= 0x80:
        raw = file.read(1)
        if len(raw) == 0:
            return None
        byte = ord(raw)
        total = (total << 7) + (byte & 0x7F)
    if negative:
        return -total
    return total
 # returns a binary string that encodes a number into 7 bits
 # most significant byte first which has the high bit set
 def encodeNumber(number):
    """Return the binary string that encodes number in 7-bit groups.

    The most significant group comes first; every byte except the last has
    its high bit set. A negative number is stored as 0xFF followed by the
    encoding of (-number + 1).
    """
    negative = number < 0
    if negative:
        number = -number + 1
    # Groups are collected least significant first and reversed at the end.
    groups = [chr(number & 0x7F)]
    number = number >> 7
    while number != 0:
        groups.append(chr((number & 0x7F) + 0x80))
        number = number >> 7
    if negative:
        groups.append(chr(0xFF))
    groups.reverse()
    return "".join(groups)
 # create / read  a length prefixed string from the file
 def lengthPrefixString(data):
    """Return data preceded by its 7-bit encoded length."""
    prefix = encodeNumber(len(data))
    return prefix + data
 def readString(file):
    """Read a length prefixed string from file.

    Returns an empty string when the length cannot be read or when fewer
    bytes than announced are available.
    """
    length = readEncodedNumber(file)
    if length is None:
        return ""
    payload = file.read(length)
    if len(payload) == length:
        return unpack("%ds" % length, payload)[0]
    return ""
 # convert a binary string generated by encodeNumber (7 bit encoded number)
 # to the value you would find inside the page*.dat files to be processed
 def convert(i):
    """Return the 7-bit encoding of i as a lowercase hex string.

    Every byte produced by encodeNumber(i) is rendered as two hex digits.
    (The previous version returned from inside the loop and therefore
    rendered only the first byte.)
    """
    val = encodeNumber(i)
    return ''.join(['%02x' % ord(ch) for ch in val])
 # the complete string table used to store all book text content
 # as well as the xml tokens and values that make sense out of it
 class Dictionary(object):
    """String table loaded from a dictionary file.

    The file starts with an encoded entry count followed by that many
    length prefixed strings. Every string is XML-escaped when loaded.
    """
    def __init__(self, dictFile):
        """Load the whole string table from dictFile.

        The file is closed once the table has been read; self.fo keeps
        referring to the (closed) file object. An empty or truncated file,
        for which no entry count can be read, gives an empty table.
        """
        self.filename = dictFile
        self.size = 0
        self.stable = []
        self.pos = 0
        self.fo = open(dictFile,'rb')
        try:
            size = readEncodedNumber(self.fo)
            if size is not None:
                self.size = size
                for i in xrange(size):
                    self.stable.append(self.escapestr(readString(self.fo)))
        finally:
            self.fo.close()
    def escapestr(self, str):
        """Return str with &, <, > and = replaced by XML entities."""
        # '&' must be replaced first so the entities added later stay intact.
        str = str.replace('&','&amp;')
        str = str.replace('<','&lt;')
        str = str.replace('>','&gt;')
        str = str.replace('=','&#61;')
        return str
    def lookup(self,val):
        """Return entry val and remember it as the current position.

        Prints an error and exits with status -1 when val is out of range.
        """
        if ((val >= 0) and (val < self.size)) :
            self.pos = val
            return self.stable[self.pos]
        else:
            print("Error - %d outside of string table limits" % val)
            sys.exit(-1)
    def getSize(self):
        """Return the number of entries."""
        return self.size
    def getPos(self):
        """Return the index of the entry looked up last."""
        return self.pos
    def dumpDict(self):
        """Print index, encoded index (hex) and text of every entry."""
        for i in xrange(self.size):
            print("%d %s %s" % (i, convert(i), self.stable[i]))
        return
 # parses the xml snippets that are represented by each page*.dat file.
 # also parses the other0.dat file - the main stylesheet
 # and information used to inject the xml snippets into page*.dat files
 class PageParser(object):
    def __init__(self, filename, dict, debug, flat_xml):
        self.fo = file(filename,'rb')
        self.id = os.path.basename(filename).replace('.dat','')
        self.dict = dict
        self.debug = debug
        self.flat_xml = flat_xml
        self.tagpath = []
        self.doc = []
        self.snippetList = []
    # hash table used to enable the decoding process
    # This has all been developed by trial and error so it may still have omissions or
    # contain errors
    # Format:
    # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
    # Maps a tag name (or dotted tag path such as 'glyph.h') to the 4-tuple
    # (number of arguments, argument type, subtags present, special case flag).
    token_tags = {
        'book'         : (1, 'snippets', 1, 0),
        'version'      : (1, 'snippets', 1, 0),
        'stylesheet'   : (1, 'snippets', 1, 0),
        'links'        : (0, 'number', 0, 1),
        'pages'        : (0, 'number', 0, 1),
        'page'         : (1, 'snippets', 1, 0),
        'group'        : (1, 'snippets', 1, 0),
        'region'       : (1, 'snippets', 1, 0),
        'reflow'       : (1, 'number', 1, 0),
        'img'          : (1, 'snippets', 1, 0),
        'paragraph'    : (1, 'snippets', 1, 0),
        'extratokens'  : (1, 'snippets', 1, 0),
        'style'        : (1, 'snippets', 1, 0),
        'rule'         : (1, 'snippets', 1, 0),
        '_span'        : (1, 'snippets', 1, 0),
        'word_semantic': (1, 'snippets', 1, 1),
        'value'        : (1, 'scalar_text', 0, 0),
        'h'            : (1, 'scalar_number', 0, 0),
        'w'            : (1, 'scalar_number', 0, 0),
        'firstWord'    : (1, 'scalar_number', 0, 0),
        'lastWord'     : (1, 'scalar_number', 0, 0),
        'x'            : (1, 'number', 0, 0),
        'y'            : (1, 'number', 0, 0),
        'links.page'   : (1, 'number', 0, 0),
        'link_id'      : (1, 'number', 0, 0),
        'glyph'        : (0, 'number', 1, 1),
        'glyph.h'      : (1, 'number', 0, 0),
        'glyph.w'      : (1, 'number', 0, 0),
        'sh'           : (1, 'number', 0, 0),
        'word'         : (0, 'number', 1, 1),
        'src'          : (1, 'scalar_number', 0, 0),
        'rel'          : (1, 'number', 0, 0),
        'row'          : (1, 'number', 0, 0),
        'startID'      : (1, 'number', 0, 1),
        'startID.page' : (1, 'number', 0, 0),
        'glyphID'      : (1, 'number', 0, 0),
        'rootID'       : (1, 'number', 0, 0),
        'stemID'       : (1, 'number', 0, 0),
        'margin-top'   : (1, 'number', 0, 0),
        'stemPage'     : (1, 'number', 0, 0),
        'dehyphen'     : (1, 'number', 1, 1),
        # NOTE: 'rootID' is listed a second time here with the same value;
        # in a dict literal the later entry wins, so this has no effect.
        'rootID'       : (1, 'number', 0, 0),
        'paraCont'     : (1, 'number', 1, 1),
        'paraStems'    : (1, 'number', 1, 1),
        'wordStems'    : (1, 'number', 1, 1),
        'original'     : (0, 'number', 0, 1),
        'use'          : (1, 'number', 0, 0),
        'vtx'          : (1, 'number', 0, 1),
        'len'          : (1, 'number', 0, 1),
        'dpi'          : (1, 'number', 0, 0),
        'n'            : (1, 'number', 0, 0),
        'id'           : (1, 'number', 0, 0),
        'ref'          : (1, 'number', 0, 0),
        'pnum'         : (1, 'number', 0, 0),
        'pid'          : (1, 'text', 0, 0),
        'info'         : (0, 'number', 1, 0),
        'bl'           : (1, 'raw', 0, 0),
        'firstGlyph'   : (1, 'raw', 0, 0),
        'lastGlyph'    : (1, 'raw', 0, 0),
        'ocrText'      : (1, 'text', 0, 0),
        'title'        : (1, 'text', 0, 0),
        'href'         : (1, 'text', 0, 0),
        '_parent_type' : (1, 'text', 0, 0),
        'attr'         : (1, 'scalar_text', 0, 0),
        'justify'      : (1, 'scalar_text', 0, 0),
        'align'        : (1, 'scalar_text', 0, 0),
        'layout'       : (1, 'scalar_text', 0, 0),
        'pageid'       : (1, 'scalar_text', 0, 0),
        'pagelabel'    : (1, 'scalar_text', 0, 0),
        'type'         : (1, 'text', 0, 0),
        'class'        : (1, 'scalar_text', 0, 0),
        'container'    : (1, 'scalar_text', 0, 0),
        '_after_class' : (1, 'scalar_text', 0, 0),
        '_tag'         : (1, 'scalar_text', 0, 0),
        'pos'          : (1, 'scalar_text', 0, 0),
        'page_num'     : (1, 'scalar_text', 0, 0),
        'page_type'    : (1, 'scalar_text', 0, 0),
        'findlists'    : (1, 'scalar_text', 0, 0),
        'FlowEdit_1_id'            : (1, 'scalar_text', 0, 0),
        'FlowEdit_1_version'       : (1, 'scalar_text', 0, 0),
        'Schema_id'                : (1, 'scalar_text', 0, 0),
        'Schema_version'           : (1, 'scalar_text', 0, 0),
        'Topaz_version'            : (1, 'scalar_text', 0, 0),
        'WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0),
        'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
        'ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0),
        'ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0),
        'chapterheaders'           : (1, 'scalar_text', 0, 0),
        'creation_date'            : (1, 'scalar_text', 0, 0),
        'header_footer'            : (1, 'scalar_text', 0, 0),
        'init_from_ocr'            : (1, 'scalar_text', 0, 0),
        'letter_insertion'         : (1, 'scalar_text', 0, 0),
        'xmlinj_convert'           : (1, 'scalar_text', 0, 0),
        'xmlinj_reflow'            : (1, 'scalar_text', 0, 0),
        'xmlinj_transform'         : (1, 'scalar_text', 0, 0),
     }
    # full tag path record keeping routines
    def tag_push(self, token):
        """Append *token* as the new innermost element of the tag path."""
        self.tagpath += [token]
    def tag_pop(self):
        """Remove the innermost element of the tag path; do nothing when it is empty."""
        if len(self.tagpath) == 0:
            return
        self.tagpath.pop()
    def tagpath_len(self):
        """Return the number of elements currently on the tag path."""
        depth = len(self.tagpath)
        return depth
    def get_tagpath(self, i):
        """Return the dotted tag path starting at depth *i*.

        The elements self.tagpath[i:] are joined with '.'.  *i* is expected
        to be a non-negative depth.  When *i* is at or beyond the end of the
        path an empty string is returned; the previous implementation left
        its result variable unassigned in that case and raised
        UnboundLocalError.
        """
        return '.'.join(self.tagpath[i:])
    # list of absolute command byte values that indicate
    # various types of loop mechanisms typically used to generate vectors
    # NOTE(review): 0x76 appears twice; the second entry is redundant for the
    # membership test done in procToken -- confirm whether another command
    # byte was intended here
    cmd_list = (0x76, 0x76)
    # peek at and return 1 byte that is ahead by i bytes 
    def peek(self, aheadi):
        """Return the byte that is *aheadi* bytes ahead without consuming input.

        Reads *aheadi* bytes from self.fo and then seeks back by the number
        of bytes that were actually read, so the file position is the same
        on return as it was on entry.

        Returns the ordinal value of the last byte read, or None when fewer
        than *aheadi* bytes remain.  The previous implementation always
        seeked back by *aheadi*, which after a short read moved the position
        to before where it started.
        """
        chunk = self.fo.read(aheadi)
        got = len(chunk)
        if got > 0:
            # rewind by what was really consumed, not by what was requested
            self.fo.seek(-got, 1)
        if got == 0 or got < aheadi:
            return None
        return ord(chunk[-1:])
    # get the next value from the file being processed
    def getNext(self):
        """Return the next encoded number from self.fo, or None at end of input."""
        # look ahead one byte first so end of file is detected before decoding
        if self.peek(1) is None:
            return None
        return readEncodedNumber(self.fo)
    # format an arg by argtype
    def formatArg(self, arg, argtype):
        """Convert one raw argument value according to *argtype*.

        'text' and 'scalar_text' values are looked up in self.dict; 'raw',
        'number', 'scalar_number' and 'snippets' values are returned as is.
        Any other argtype prints an error and exits the process with -2.
        """
        if argtype in ('text', 'scalar_text'):
            return self.dict.lookup(arg)
        if argtype in ('raw', 'number', 'scalar_number', 'snippets'):
            return arg
        print("Error Unknown argtype %s" % argtype)
        sys.exit(-2)
    # process the next tag token, recursively handling subtags, 
    # arguments, and commands
    def procToken(self, token):
        """Parse one tag named *token* from self.fo and return its tree node.

        The token is pushed onto the tag path and looked up in
        self.token_tags, first by its full dotted path and then by
        progressively shorter trailing paths.  Each table entry supplies
        (number of args, argument type, subtags flag, special-case flag).

        Returns [full tag path, list of subtag nodes, argtype, list of args]
        for a known token, or an empty list for an unknown one.  The token
        is popped from the tag path again before returning.
        """
        known_token = False
        self.tag_push(token)
        if self.debug : print 'Processing: ', self.get_tagpath(0)
        cnt = self.tagpath_len()
        # try the longest path first, then drop leading elements one at a time
        for j in xrange(cnt):
            tkn = self.get_tagpath(j)
            if tkn in self.token_tags :
                num_args = self.token_tags[tkn][0]
                argtype = self.token_tags[tkn][1]
                subtags = self.token_tags[tkn][2]
                splcase = self.token_tags[tkn][3]
                ntags = -1
                known_token = True
                break
        if known_token :
            # handle subtags if present
            subtagres = []
            if (splcase == 1):
                # this type of tag uses the escape marker 0x74 to indicate
                # that a subtag count follows instead of arguments
                if self.peek(1) == 0x74:
                    # consume the marker itself; its value is not used
                    skip = readEncodedNumber(self.fo)
                    subtags = 1
                    num_args = 0
            if (subtags == 1):
                ntags = readEncodedNumber(self.fo)
                if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
                # each subtag is a dictionary index followed by its own content
                for j in xrange(ntags):
                    val = readEncodedNumber(self.fo)
                    subtagres.append(self.procToken(self.dict.lookup(val)))
            # arguments can be scalars or vectors of text or numbers
            argres = []
            if num_args > 0 :
                firstarg = self.peek(1)
                # scalar argument types never take the loop-command form
                if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
                    # single argument is a variable length vector of data
                    arg = readEncodedNumber(self.fo)
                    argres = self.decodeCMD(arg,argtype)
                else :
                    # num_args scalar arguments
                    for i in xrange(num_args):
                        argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))
            # build the return tag
            result = []
            tkn = self.get_tagpath(0)
            result.append(tkn)
            result.append(subtagres)
            result.append(argtype)
            result.append(argres)
            self.tag_pop()
            return result
        # all tokens that need to be processed should be in the hash
        # table; a miss may indicate a problem, either a new token
        # or an out of sync condition
        else:
            result = []
            if (self.debug):
                print 'Unknown Token:', token
            self.tag_pop()
            return result
    # special loop used to process code snippets
    # it is NEVER used to format arguments.
    # builds the snippetList
    def doLoop72(self, argtype):
        cnt = readEncodedNumber(self.fo)
        if self.debug :
            result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print result
        for i in xrange(cnt):
            if self.debug: print 'Snippet:',str(i)
            snippet = []
            snippet.append(i)
            val = readEncodedNumber(self.fo)
            snippet.append(self.procToken(self.dict.lookup(val)))
            self.snippetList.append(snippet)
        return
    # loop: pass though values unchanged
    # DO NOT CHANGE - this has proven to be correct
    def doLoop76Mode0(self, argtype, cnt):
        """Read *cnt* values and return them, unchanged, formatted as *argtype*."""
        return [self.formatArg(readEncodedNumber(self.fo), argtype)
                for _ in xrange(cnt)]
    # loop generating values relative to the *negative* 
    # of the offset - don't ask why - it just is
    # DO NOT CHANGE - this has proven to be correct
    def doLoop76Mode1(self, argtype, cnt):
        """Read an offset, then *cnt* values, returning each value minus the offset.

        The offset is negated once as it is read and then added to every
        value that follows.
        """
        negated = -readEncodedNumber(self.fo)
        return [self.formatArg(readEncodedNumber(self.fo) + negated, argtype)
                for _ in xrange(cnt)]
    # loop generating values with starting value and accumulation
    # DO NOT CHANGE - this has proven to be the correct
    def doLoop76Mode2(self, argtype, cnt):
        """Read a start value followed by cnt-1 increments; return the running totals.

        The first element is the start value itself; each later element is
        the previous element plus the next value read.
        """
        current = readEncodedNumber(self.fo)
        values = [self.formatArg(current, argtype)]
        for _ in xrange(cnt-1):
            current += readEncodedNumber(self.fo)
            values.append(self.formatArg(current, argtype))
        return values
    # loop generating values with starting value and accumulation
    # **after** subtracting adjustment value from each
    # DO NOT CHANGE - this has been proven to be correct
    def doLoop76Mode3(self, argtype, cnt):
        """Accumulate values after subtracting an adjustment from each one.

        Reads the adjustment, then a start value, then cnt-1 increments.
        The first element is start - adjustment; each later element is the
        previous element plus (increment - adjustment).
        """
        adjust = readEncodedNumber(self.fo)
        current = readEncodedNumber(self.fo) - adjust
        values = [self.formatArg(current, argtype)]
        for _ in xrange(cnt-1):
            current = current + readEncodedNumber(self.fo) - adjust
            values.append(self.formatArg(current, argtype))
        return values
    # loop using a running sum of the data values and a starting value,
    # with accumulation to get each new value
    # Again, don't ask; it took me forever to figure this out
    # DO NOT CHANGE - this has been proven to be correct
    def doLoop76Mode4(self, argtype, cnt):
        """Accumulate using a running sum of the values read.

        The first value read is both the first element and the initial
        running sum.  For each of the cnt-1 values that follow, the value
        is added to the running sum and the running sum is added to the
        previous element to give the next element.
        """
        start = readEncodedNumber(self.fo)
        step = start
        position = start
        values = [self.formatArg(position, argtype)]
        for _ in xrange(cnt-1):
            step += readEncodedNumber(self.fo)
            position += step
            values.append(self.formatArg(position, argtype))
        return values
    # loop using an extra value as an adjustment
    # and a running sum of the values after subtracting
    # the adjustment, added to a ptr to get a new pointer
    def doLoop76Mode5(self, argtype, cnt):
        """Accumulate using a running sum of adjusted values.

        Reads an adjustment, then *cnt* values.  Each value minus the
        adjustment is added to a running sum (starting at 0), and the
        running sum is added to a pointer (starting at 0) to give the next
        element.
        """
        adjust = readEncodedNumber(self.fo)
        position = 0
        step = 0
        values = []
        for _ in xrange(cnt):
            raw = readEncodedNumber(self.fo)
            step += (raw - adjust)
            position += step
            values.append(self.formatArg(position, argtype))
        return values
    # FIXME:  I have only 4 points to work this out with inside my book
    # So may be wrong but it is correct for my 4 points
    def doLoop76Mode6(self, argtype, cnt):
        """Read *cnt* values; each element is 3 * previous raw value + value + 1.

        The previous raw value starts at 0.
        """
        values = []
        previous = 0
        for _ in xrange(cnt):
            raw = readEncodedNumber(self.fo)
            values.append(self.formatArg((3 * previous) + raw + 1, argtype))
            previous = raw
        return values
    # dispatches loop commands bytes with various modes
    # The 0x76 style loops are used to build vectors
    # This was all derived by trial and error and 
    # new loop types may exist that are not handled here
    # since they did not appear in the test cases
    def decodeCMD(self, cmd, argtype):
        """Decode a loop command into a list of formatted argument values.

        cmd     -- command value already read from the stream; only 0x76 is
                   handled here (0x72 is dispatched to doLoop72 by process)
        argtype -- argument type handed through to the loop decoders

        For 0x76 the element count and the loop mode are read from self.fo
        and the matching doLoop76ModeN method builds the vector.  An unknown
        mode or an unknown command returns an empty list.
        """
        if cmd == 0x76:
            # loop with cnt, and mode to control loop styles
            cnt = readEncodedNumber(self.fo)
            mode = readEncodedNumber(self.fo)
            if self.debug:
                print('Loop for ' + str(cnt) + ' with  mode ' + str(mode) + ' :  ')
            loops = {
                0x00: self.doLoop76Mode0,
                0x01: self.doLoop76Mode1,
                0x02: self.doLoop76Mode2,
                0x03: self.doLoop76Mode3,
                0x04: self.doLoop76Mode4,
                0x05: self.doLoop76Mode5,
                0x06: self.doLoop76Mode6,
            }
            handler = loops.get(mode)
            if handler is not None:
                return handler(argtype, cnt)
            # NOTE(review): the values of an unknown loop are only consumed
            # when debug is on, so the stream position after an unknown mode
            # depends on the debug flag -- confirm whether that is intended
            if self.debug:
                # try to mark any unknown loop commands
                # if they exist, unless they are used to process
                # text or some other known list, we won't be able to prove them correct
                print('*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode))
                for i in xrange(cnt):
                    val = readEncodedNumber(self.fo)
                    print(' 0x%x  ' % val)
            return []
        # this test used to read "self.dbug", which raised AttributeError
        # for every command other than 0x76
        if self.debug:
            print("Unknown command " + str(cmd))
        return []
    # add full tag path to injected snippets
    def updateName(self, tag, prefix):
        """Return a new tag whose name, and every subtag name, gains *prefix*.

        Each name becomes prefix + '.' + name; the same *prefix* is applied
        at every nesting level.  The argument type and argument list are
        carried over unchanged.
        """
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        renamed = prefix + '.' + name
        children = [self.updateName(child, prefix) for child in subtagList]
        return [renamed, children, argtype, argList]
    # perform depth first injection of specified snippets into this one
    def injectSnippets(self, snippet):
        """Expand the snippet references of *snippet*, depth first.

        *snippet* is [snippet number, tag].  Every entry of the tag's
        argument list is used as an index into self.snippetList; the
        referenced snippet is expanded recursively, renamed with this tag's
        name as prefix and appended to this tag's subtag list (the existing
        list object is extended in place).

        Returns [snippet number, [name, subtags, 'number', []]].
        """
        snipno, tag = snippet
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        injected = []
        for ref in argList:
            expanded = self.injectSnippets(self.snippetList[ref])
            injected.append(self.updateName(expanded[1], name))
        if len(injected) > 0:
            subtagList.extend(injected)
        # the references are now subtags, so the argument list is emptied
        return [snipno, [name, subtagList, 'number', []]]
    # format the tag for output
    def formatTag(self, node):
        """Render *node* and its subtags as indented xml-like text.

        The last element of the dotted node name is used as the tag name and
        the number of leading elements sets the indent (3 spaces per level).
        Text arguments are joined with '|', all others with ','; 'snippets'
        arguments get a 'snippets:' prefix.  Empty subtag entries are skipped.
        """
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        path = name.split('.')
        nodename = path.pop()
        indent = ' ' * (3 * len(path))
        closing = '</' + nodename + '>\n'
        result = indent + '<' + nodename + '>'
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = ','.join([str(item) for item in argList])
            if argtype == 'snippets':
                result += 'snippets:'
            result += argres
        if len(subtagList) == 0:
            return result + closing
        result += '\n'
        for child in subtagList:
            if len(child) > 0:
                result += self.formatTag(child)
        return result + indent + closing
   # flatten tag
    def flattenTag(self, node):
        """Render *node* and its subtags as flat 'name=args' lines.

        Each tag produces one line holding its full name, followed by
        '=' and its '|'-joined arguments when it has any ('.snippets=' for
        the 'snippets' argument type).  Subtag lines follow their parent;
        empty subtag entries are skipped.
        """
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        line = name
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = '|'.join([str(item) for item in argList])
            if argtype == 'snippets':
                line += '.snippets=' + argres
            else:
                line += '=' + argres
        pieces = [line, '\n']
        for child in subtagList:
            if len(child) > 0:
                pieces.append(self.flattenTag(child))
        return ''.join(pieces)
    # render the document tree as xml text (flattened or indented)
    def formatDoc(self, flat_xml):
        """Return the text for every non-empty tag in self.doc.

        Tags are rendered with flattenTag when *flat_xml* is true and with
        formatTag otherwise.  The result is also printed when self.debug is
        set.
        """
        chunks = []
        for node in self.doc:
            if len(node) == 0:
                continue
            if flat_xml:
                chunks.append(self.flattenTag(node))
            else:
                chunks.append(self.formatTag(node))
        result = ''.join(chunks)
        if self.debug:
            print(result)
        return result
    # main loop - parse the page.dat files
    # to create structured document and snippets
    # FIXME: the value at the end of magic appears to be a subtags count,
    # but for what?  For now, inject an 'info' tag, as it is in
    # every dictionary and seems close to what is meant.
    # The alternative is to special case the last '_' (0x5f) to mean something
    def process(self):
        """Parse the .dat file in self.fo and return its xml description.

        The first 11 bytes are inspected: page and glyph files start with a
        '__PAGE__' / '__GLYPH__' marker and get an inserted 'info' token;
        any other file is parsed from its first byte.  Top level values are
        then read until end of input: 0x72 starts a snippet set (doLoop72),
        a value inside the dictionary range is parsed as a tag, anything
        else is skipped.  Finally snippet 0, if present, is expanded and
        appended to the document, and the document is rendered with
        formatDoc according to self.flat_xml.
        """
        # peek at the first bytes to see what type of file it is
        magic = self.fo.read(11)
        if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
            first_token = 'info'
        elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
            # glyph files carry one more byte before the data starts
            self.fo.read(1)
            first_token = 'info'
        else :
            # other0.dat file
            first_token = None
            # rewind by what was actually read: a file shorter than 11
            # bytes would otherwise be seeked to before its start
            if len(magic) > 0:
                self.fo.seek(-len(magic),1)
        # use "inserted" first token 'info' for page and glyph files
        if first_token is not None:
            tag = self.procToken(first_token)
            if len(tag) > 0 :
                self.doc.append(tag)
        # main loop to read and build the document tree
        while True:
            v = self.getNext()
            if v is None:
                break
            if (v == 0x72):
                self.doLoop72('number')
            elif (v > 0) and (v < self.dict.getSize()) :
                tag = self.procToken(self.dict.lookup(v))
                if len(tag) > 0 :
                    self.doc.append(tag)
            else:
                if self.debug:
                    print("Main Loop:  Unknown value: %x" % v)
        # now do snippet injection
        if len(self.snippetList) > 0 :
            if self.debug:
                print('Injecting Snippets:')
            snippet = self.injectSnippets(self.snippetList[0])
            tag_add = snippet[1]
            if self.debug:
                print(self.formatTag(tag_add))
            if len(tag_add) > 0:
                self.doc.append(tag_add)
        # handle generation of xml output
        return self.formatDoc(self.flat_xml)
 def usage():
    """Print the command line help for convert2xml.py."""
    help_lines = (
        'Usage: ',
        '    convert2xml.py dict0000.dat infile.dat ',
        ' ',
        ' Options:',
        '   -h            print this usage help message ',
        '   -d            turn on debug output to check for potential errors ',
        '   --flat-xml    output the flattened xml page description only ',
        ' ',
        '     This program will attempt to convert a page*.dat file or ',
        ' glyphs*.dat file, using the dict0000.dat file, to its xml description. ',
        ' ',
        ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ',
        ' the *.dat files from a Topaz format e-book.',
    )
    for text in help_lines:
        print(text)
 #
 # Main
 #   
 def main(argv):
    dictFile = ""
    pageFile = ""
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv
    else :
        argv = argv.split()
    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2) 
    for o, a in opts:
        if o =="-d":
            debug=True
        if o =="-h":
            usage()
            sys.exit(0)
        if o =="--flat-xml":
            flat_xml = True
    dictFile, pageFile = args[0], args[1]
    # read in the string table dictionary
    dict = Dictionary(dictFile)
    # create a page parser
    pp = PageParser(pageFile, dict, debug, flat_xml)
    xmlpage = pp.process()
    if printOutput:
        print xmlpage
        return 0
    return xmlpage
 # script entry point: the empty string makes main() read sys.argv and print
 # the resulting xml; its return value becomes the process exit status
 if __name__ == '__main__':
    sys.exit(main(''))
--- a/Topaz_Tools/lib/decode_meta.py
+++ b/Topaz_Tools/lib/decode_meta.py
@ -0,0 +1,109 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 from struct import pack
 from struct import unpack
 #
 # Get a 7 bit encoded number from string
 #
 def readEncodedNumber(file):
    """Read one 7-bit encoded number from *file*.

    A leading 0xFF byte marks a negative number.  A byte with the high bit
    set starts a multi-byte value: the low 7 bits of each byte are
    accumulated, most significant group first, until a byte with the high
    bit clear has been read.

    Returns the decoded integer, or None when the input ends before a
    complete number has been read.
    """
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)
    negative = (data == 0xFF)
    if negative:
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)
    if data >= 0x80:
        value = data & 0x7F
        while True:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            value = (value << 7) + (data & 0x7F)
            if data < 0x80:
                break
        data = value
    if negative:
        return -data
    return data
 #
 # Encode a number in 7 bit format
 #
 def encodeNumber(number):
    """Return *number* as a 7-bit encoded string.

    The value is emitted in 7-bit groups, most significant first, with the
    high bit set on every byte except the last.  A negative number is
    encoded as 0xFF followed by the encoding of (-number + 1).
    NOTE(review): readEncodedNumber decodes 0xFF n as -n, so a negative
    value does not round-trip through this pair -- confirm which side
    matches the file format.
    """
    negative = number < 0
    if negative:
        number = -number + 1
    # the bytes are collected least significant group first, then reversed
    chunks = []
    flag = 0
    while True:
        chunks.append(chr((number & 0x7F) + flag))
        number = number >> 7
        flag = 0x80
        if number == 0:
            break
    if negative:
        chunks.append(chr(0xFF))
    chunks.reverse()
    return "".join(chunks)
 #
 # Length prefixed strings: build one from data, or read one from the file
 #
 def lengthPrefixString(data):
    """Return *data* preceded by its length as encoded by encodeNumber."""
    prefix = encodeNumber(len(data))
    return prefix + data
 def readString(file):
    """Read one length-prefixed string from *file*.

    Returns None when no length can be read, an empty string when fewer
    bytes than the announced length are available, and the string itself
    otherwise.
    """
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return None
    sv = file.read(stringLength)
    if len(sv) == stringLength:
        return unpack(str(stringLength)+"s",sv)[0]
    return ""
 def getMetaArray(metaFile):
    """Parse the meta file into a Python dictionary (associative array).

    The file holds an encoded entry count followed by that many pairs of
    length-prefixed strings; each pair becomes one key/value entry.  An
    empty file yields an empty dictionary.  The file is closed even when
    parsing raises.
    """
    result = {}
    with open(metaFile,'rb') as fo:
        size = readEncodedNumber(fo)
        # readEncodedNumber returns None for an empty file
        if size is None:
            size = 0
        for i in xrange(size):
            # the key is read first, then its value
            key = readString(fo)
            result[key] = readString(fo)
    return result
 def getMetaData(metaFile):
    """Parse the meta file into text, one 'key|value' line per entry.

    The file holds an encoded entry count followed by that many pairs of
    length-prefixed strings.  An empty file yields an empty string, and
    parsing stops at the first pair that cannot be read because the input
    ended (previously this raised TypeError).  The file is closed even
    when parsing raises.
    """
    lines = []
    with open(metaFile,'rb') as fo:
        size = readEncodedNumber(fo)
        # readEncodedNumber returns None for an empty file
        if size is None:
            size = 0
        for i in xrange(size):
            key = readString(fo)
            value = readString(fo)
            if key is None or value is None:
                # input ended in the middle of the table
                break
            lines.append(key + '|' + value + '\n')
    return ''.join(lines)
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@ -0,0 +1,299 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 from struct import pack
 from struct import unpack
 class DocParser(object):
    """Convert the flattened xml description of one page into html.

    flatxml -- text with one 'tag.path=value' (or bare 'tag.path') per line
    fileid  -- name of the page file; its basename without '.dat' is used
               as the id of the page anchor div
    """
    def __init__(self, flatxml, fileid):
        self.id = os.path.basename(fileid).replace('.dat','')
        self.flatdoc = flatxml.split('\n')
        self.ocrtext = []
        self.link_id = []
        self.link_title = []
        self.link_page = []
        self.dehyphen_rootid = []
        self.paracont_stemid = []
        self.parastems_stemid = []
    # find tag within pos (inclusive) to end (exclusive)
    def findinDoc(self, tagpath, pos, end) :
        """Find the first line in [pos, end) whose name ends with *tagpath*.

        end == -1 means "to the end of the document".  Returns
        (line index, value text) or (-1, None) when nothing matches.  A
        line without '=' has the empty string as its value.
        """
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1 :
            end = cnt
        else:
            end = min(cnt,end)
        for j in range(pos, end):
            # split on the first '=' only: the value itself (ocr text for
            # example) may contain further '=' characters
            (name, sep, argres) = docList[j].partition('=')
            if name.endswith(tagpath) :
                return j, argres
        return -1, None
    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        """Return the line index of every line whose name ends with *tagpath*."""
        startpos = []
        pos = 0
        while True:
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res is None :
                break
            startpos.append(foundpos)
            pos = foundpos + 1
        return startpos
    # get a description of the paragraph
    def getParaDescription(self, start, end):
        """Return (css class, first word index, last word index) for a region.

        *start* and *end* bound the region's lines; end == -1 means "to the
        end of the document".
        """
        # the fallback scan below iterates over explicit line numbers, so
        # resolve the "to the end" marker first; leaving it at -1 made that
        # scan empty for the last region of a page
        if end == -1 :
            end = len(self.flatdoc)
        # normal paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
        # class names are an issue given topaz starts them with numerals (not allowed)
        # use a mix of cases, (which cause some browsers problems), and actually
        # attach numbers after "reclustered*" to the end to deal with reflow issues
        # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered
        pclass = pclass.lower()
        pclass = 'cl_' + pclass
        p = pclass.find('reclustered')
        if p > 0 : pclass = pclass[0:p+11]
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            return pclass, int(sfirst), int(slast)
        # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well
        # so walk through this region keeping track of the first firstword, and the last lastWord
        # on any items that have it
        (pos, sfirst) = self.findinDoc('firstWord',start, end)
        first = int(sfirst)
        last = -1
        for i in range(pos+1,end):
            (pos, slast) = self.findinDoc('lastWord',i,i+1)
            if slast != None:
                last = int(slast)
        return pclass, first, last
    def buildParagraph(self, cname, first, last, type, regtype) :
        """Build the html text for the words first..last-1 of the page.

        type    -- 'full' or 'begin' emit the opening <p>, 'full' or 'end'
                   emit the closing </p>; 'end' also starts with a space;
                   any other value emits the words only
        regtype -- 'fixed' and 'chapterheading' regions turn '_lb_' words
                   into <br /> line breaks
        """
        parares = ''
        sep =''
        br_lb = False
        if (regtype == 'fixed') or (regtype == 'chapterheading') :
            br_lb = True
        handle_links = False
        if len(self.link_id) > 0:
            handle_links = True
        if (type == 'full') or (type == 'begin') :
            parares += '<p class="' + cname + '">'
        if (type == 'end'):
            parares += ' '
        for j in range(first, last) :
            word = self.ocrtext[j]
            sep = ' '
            if handle_links:
                link = self.link_id[j]
                if (link > 0):
                    title = self.link_title[link-1]
                    if title == "": title='_link_'
                    ptarget = self.link_page[link-1] - 1
                    linkhtml = '<a href="#page%04d">' % ptarget
                    linkhtml += title + '</a>'
                    # wrap the most recent occurrence of the title text in
                    # the anchor, or append the anchor when it is not there
                    pos = parares.rfind(title)
                    if pos >= 0:
                        parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                    else :
                        parares += linkhtml
                    if word == '_link_' : word = ''
                elif (link < 0) :
                    if word == '_link_' : word = ''
            if word == '_lb_':
                if (j-1) in self.dehyphen_rootid :
                    word = ''
                    sep = ''
                elif handle_links :
                    word = ''
                    sep = ''
                elif br_lb :
                    word = '<br />\n'
                    sep = ''
                else :
                    word = '\n'
                    sep = ''
            if j in self.dehyphen_rootid :
                # drop the last character of the word and join it to the next
                word = word[0:-1]
                sep = ''
            parares += word + sep
        # remove the separator left behind by the last word
        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '</p>'
        return parares
    # anchor div emitted once per page ahead of its first content
    def _pageAnchor(self, pagetype):
        return '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp;</div>\n'
    # walk the document tree collecting the information needed
    # to build an html page using the ocrText
    def process(self):
        """Return the html fragment for this page."""
        htmlpage = ''
        # first collect information from the xml doc that describes this page
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres :  self.ocrtext = argres.split('|')
        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
        if argres:
            argList = argres.split('|')
            self.dehyphen_rootid = [ int(strval) for strval in argList]
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
        if self.parastems_stemid == None : self.parastems_stemid = []
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
        if self.paracont_stemid == None : self.paracont_stemid = []
        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
        if argres:
            argList = argres.split('|')
            self.link_id = [ int(strval) for strval in argList]
        (pos, argres) = self.findinDoc('info.links.page',0,-1)
        if argres :
            argList = argres.split('|')
            self.link_page = [ int(strval) for strval in argList]
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')
        (pos, pagetype) = self.findinDoc('page.type',0,-1)
        # generate a list of each region starting point
        # each region has one paragraph, or one image, or one chapterheading
        regionList= self.posinDoc('region')
        regcnt = len(regionList)
        # -1 ends the last region at the end of the document
        regionList.append(-1)
        anchorSet = False
        breakSet = False
        # process each region tag and convert what you can to html
        for j in range(regcnt):
            start = regionList[j]
            end = regionList[j+1]
            (pos, regtype) = self.findinDoc('region.type',start,end)
            if regtype == 'graphic' :
                if not anchorSet:
                    htmlpage += self._pageAnchor(pagetype)
                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
            elif regtype == 'chapterheading' :
                (pclass, first, last) = self.getParaDescription(start,end)
                if not breakSet:
                    htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
                    breakSet = True
                if not anchorSet:
                    htmlpage += self._pageAnchor(pagetype)
                    anchorSet = True
                tag = 'h1'
                if pclass[3:7] == 'ch1-' : tag = 'h1'
                if pclass[3:7] == 'ch2-' : tag = 'h2'
                if pclass[3:7] == 'ch3-' : tag = 'h3'
                htmlpage += '<' + tag + ' class="' + pclass + '">'
                htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype)
                htmlpage += '</' + tag + '>'
            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
                ptype = 'full'
                # check to see if this is a continuation from the previous page
                if (len(self.parastems_stemid) > 0):
                    ptype = 'end'
                    self.parastems_stemid=[]
                else:
                    if not anchorSet:
                        htmlpage += self._pageAnchor(pagetype)
                        anchorSet = True
                (pclass, first, last) = self.getParaDescription(start,end)
                if ptype == 'full' :
                    tag = 'p'
                    if pclass[3:6] == 'h1-' : tag = 'h4'
                    if pclass[3:6] == 'h2-' : tag = 'h5'
                    if pclass[3:6] == 'h3-' : tag = 'h6'
                    htmlpage += '<' + tag + ' class="' + pclass + '">'
                    htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype)
                    htmlpage += '</' + tag + '>'
                else :
                    htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
            elif (regtype == 'tocentry') :
                ptype = 'full'
                # check to see if this is a continuation from the previous page
                if (len(self.parastems_stemid) > 0) and (j == 0):
                    # process the first paragraph as a continuation from the last page
                    ptype = 'end'
                    self.parastems_stemid = []
                else:
                    if not anchorSet:
                        htmlpage += self._pageAnchor(pagetype)
                        anchorSet = True
                (pclass, first, last) = self.getParaDescription(start,end)
                htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
            else :
                print('Unknown region type %s' % (regtype,))
                print('Warning: skipping this region')
        # a paragraph that continues on the next page is left open
        if len(self.paracont_stemid) > 0 :
            if htmlpage[-4:] == '</p>':
                htmlpage = htmlpage[0:-4]
        return htmlpage
 def convert2HTML(flatxml, fileid):
    """Return the html for the page described by *flatxml*.

    A DocParser is created for the flattened xml text and the file id, and
    the result of its process() method is returned.
    """
    return DocParser(flatxml, fileid).process()
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@ -0,0 +1,125 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 import os, sys, getopt
 # local routines
 import convert2xml
 import flatxml2html
 import decode_meta
 import stylexml2css
 def usage():
    print 'Usage: '
    print ' '
    print '   genhtml.py unencryptedBookDir'
    print '  '
 def main(argv):
    bookDir = ''
    if len(argv) == 0:
        argv = sys.argv
    else :
        argv = argv.split()
    try:
        opts, args = getopt.getopt(argv[1:], "h:")
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2) 
    for o, a in opts:
        if o =="-h":
            usage()
            sys.exit(0)
    bookDir = args[0]
    if not os.path.exists(bookDir) :
        print "Can not find directory with unencrypted book"
        sys.exit(-1)
    dictFile = os.path.join(bookDir,'dict0000.dat')
    if not os.path.exists(dictFile) :
        print "Can not find dict0000.dat file"
        sys.exit(-1)
    pageDir = os.path.join(bookDir,'page')
    if not os.path.exists(pageDir) :
        print "Can not find page directory in unencrypted book"
        sys.exit(-1)
    imgDir = os.path.join(bookDir,'img')
    if not os.path.exists(imgDir) :
        print "Can not find image directory in unencrypted book"
        sys.exit(-1)
    otherFile = os.path.join(bookDir,'other0000.dat')
    if not os.path.exists(otherFile) :
        print "Can not find other0000.dat in unencrypted book"
        sys.exit(-1)
    metaFile = os.path.join(bookDir,'metadata0000.dat')
    if not os.path.exists(metaFile) :
        print "Can not find metadata0000.dat in unencrypted book"
        sys.exit(-1)
    htmlFileName = "book.html"
    htmlstr = '<html>\n'
    filenames = os.listdir(pageDir)
    filenames = sorted(filenames)
    print 'Processing ... '
    htmlstr += '<head>\n'
    print '     ', 'metadata0000.dat'
    fname = os.path.join(bookDir,'metadata0000.dat')
    xname = os.path.join(bookDir, 'metadata.txt')
    metastr = decode_meta.getMetaData(fname)
    file(xname, 'wb').write(metastr)
    meta_array = decode_meta.getMetaArray(fname)
    htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
    htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
    print '     ', 'other0000.dat'
    fname = os.path.join(bookDir,'other0000.dat')
    xname = os.path.join(bookDir, 'style.css')
    xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
    cssstr = '<style>\n'
    cssstr += stylexml2css.convert2CSS(xmlstr)
    cssstr += '</style>\n'
    file(xname, 'wb').write(cssstr)
    htmlstr += cssstr
    htmlstr += '</head>\n<body>\n'
    for filename in filenames:
        print '     ', filename
        fname = os.path.join(pageDir,filename)
        flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
        htmlstr += flatxml2html.convert2HTML(flat_xml, fname)
    htmlstr += '</body>\n</html>\n'
    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
    print 'Processing Complete'
    return 0
 # script entry point: the empty string tells main() to read sys.argv
 if __name__ == '__main__':
    exitcode = main('')
    sys.exit(exitcode)
--- a/Topaz_Tools/lib/gensvg.py
+++ b/Topaz_Tools/lib/gensvg.py
@ -0,0 +1,295 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 import os, sys, getopt
 # local routines
 import convert2xml
 import flatxml2html
 import decode_meta
 class GParser(object):
   """Glyph outline data read from the flattened xml of one glyphs file.
   The constructor splits the text into lines and loads the integer lists
   stored under the info.glyph.*, info.vtx.* and info.len.n names.
   """
   def __init__(self, flatxml):
       self.flatdoc = flatxml.split('\n')
       # getPath() multiplies every coordinate by dpi / gdpi[glyph]
       self.dpi = 1440
       self.gh = self.getData('info.glyph.h')
       self.gw = self.getData('info.glyph.w')
       self.guse = self.getData('info.glyph.use')
       self.count = len(self.guse)
       self.gvtx = self.getData('info.glyph.vtx')
       self.glen = self.getData('info.glyph.len')
       self.gdpi = self.getData('info.glyph.dpi')
       self.vx = self.getData('info.vtx.x')
       self.vy = self.getData('info.vtx.y')
       self.vlen = self.getData('info.len.n')
       # closing entries so that index gly+1 is valid for the last glyph in getPath()
       self.glen.append(len(self.vlen))
       self.gvtx.append(len(self.vx))
   def getData(self, path):
       """Return the values of the first line whose name equals path.
       A line has the form name=v1|v2|... and its values are returned as a
       list of ints.  A matching line without '=' gives [].  None is
       returned when no line matches.  Raises ValueError when a value of
       the matching line is not an integer.
       """
       for item in self.flatdoc:
           # partition keeps any further '=' inside the value part
           name, sep, argt = item.partition('=')
           if name != path:
               continue
           if not sep:
               return []
           # only the matching line is converted; unrelated lines are never touched
           return [int(arg) for arg in argt.split('|')]
       return None
   def getPath(self, gly):
       """Return the svg path data for glyph number gly.
       The result is a run of 'M x y ' / 'L x y ' commands, one 'M' at the
       start of every contour, followed by a single closing 'z'.  An index
       outside 0..count-1 gives ''.
       """
       if (gly < 0) or (gly >= self.count):
           return ''
       first = self.gvtx[gly]
       last = self.gvtx[gly+1]-1
       tx = self.vx[first:last]
       ty = self.vy[first:last]
       gdpi = self.gdpi[gly]
       commands = []
       begin = 0
       for k in range(self.glen[gly], self.glen[gly+1]):
           # vlen[k] is the index, within this glyph, of the contour's last vertex
           stop = self.vlen[k]+1
           letter = 'M'
           for (x, y) in zip(tx[begin:stop], ty[begin:stop]):
               # '//' spells out the integer division the coordinates rely on
               commands.append('%s %d %d ' % (letter, x * self.dpi // gdpi, y * self.dpi // gdpi))
               letter = 'L'
           begin = stop
       commands.append('z')
       return ''.join(commands)
 class PParser(object):
   """Page level data read from the flattened xml of one page file.
   Holds the page height and width, the glyph placement lists (gx, gy,
   gid) and offers helpers to list the page's images and to collect the
   glyph definitions the page uses.
   """
   def __init__(self, flatxml):
       self.flatdoc = flatxml.split('\n')
       # scratch copy of the document that getDataTemp() consumes
       self.temp = []
       self.ph = self.getData('page.h')[0]
       self.pw = self.getData('page.w')[0]
       self.gx = self.getData('info.glyph.x')
       self.gy = self.getData('info.glyph.y')
       self.gid = self.getData('info.glyph.glyphID')
   def _lookup(self, lines, path, consume):
       # Shared search behind getData() and getDataTemp(): find the first
       # line whose name ends with path, optionally remove it from lines,
       # and return its '|' separated values as ints ([] without '=',
       # None when nothing matches).
       for index, item in enumerate(lines):
           # partition keeps any further '=' inside the value part
           name, sep, argt = item.partition('=')
           if not name.endswith(path):
               continue
           if consume:
               lines.pop(index)
           if not sep:
               return []
           # only the matching line is converted; unrelated lines are never touched
           return [int(arg) for arg in argt.split('|')]
       return None
   def getData(self, path):
       """Return the int values of the first line whose name ends with path.
       Gives [] for a matching line without '=' and None when no line
       matches.  Raises ValueError when a value is not an integer.
       """
       return self._lookup(self.flatdoc, path, False)
   def getDataTemp(self, path):
       """Like getData() but searches self.temp and removes the line it returns."""
       return self._lookup(self.temp, path, True)
   def getImages(self):
       """Return one svg <image> element string per region.img entry of the page."""
       result = []
       # work on a copy: getDataTemp() deletes every line it hands back and
       # the parsed document itself must stay intact
       self.temp = list(self.flatdoc)
       while self.getDataTemp('region.img') is not None:
           h = self.getDataTemp('region.img.h')[0]
           w = self.getDataTemp('region.img.w')[0]
           x = self.getDataTemp('region.img.x')[0]
           y = self.getDataTemp('region.img.y')[0]
           src = self.getDataTemp('region.img.src')[0]
           result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
       return result
   def getGlyphs(self,glyfname):
       """Return the lines of the file glyfname that define the glyphs used on this page.
       The distinct glyph ids are searched for in ascending order in a
       single pass, so a definition is only found if it appears after the
       definitions of all smaller ids.  Gives [] when the page has no glyphs.
       """
       result = []
       if (self.gid != None) and (len(self.gid) > 0):
           glyphs = sorted(set(self.gid))
           gfile = open(glyfname, 'r')
           try:
               wanted = 0
               for inp in gfile:
                   marker = 'id="gl%d"' % glyphs[wanted]
                   if (inp.find(marker) > 0):
                       result.append(inp)
                       wanted += 1
                       if (wanted == len(glyphs)):
                           break
           finally:
               # release the handle even if reading fails part way through
               gfile.close()
       return result
 def usage():
   print 'Usage: '
   print ' '
   print '   gensvg.py unencryptedBookDir'
   print '  '
 def main(argv):
   bookDir = ''
   if len(argv) == 0:
       argv = sys.argv
   else :
       argv = argv.split()
   try:
       opts, args = getopt.getopt(argv[1:], "h:")
   except getopt.GetoptError, err:
       print str(err)
       usage()
       sys.exit(2)
   if len(opts) == 0 and len(args) == 0 :
       usage()
       sys.exit(2) 
   for o, a in opts:
       if o =="-h":
           usage()
           sys.exit(0)
   bookDir = args[0]
   if not os.path.exists(bookDir) :
       print "Can not find directory with unencrypted book"
       sys.exit(-1)
   dictFile = os.path.join(bookDir,'dict0000.dat')
   if not os.path.exists(dictFile) :
       print "Can not find dict0000.dat file"
       sys.exit(-1)
   pageDir = os.path.join(bookDir,'page')
   if not os.path.exists(pageDir) :
       print "Can not find page directory in unencrypted book"
       sys.exit(-1)
   imgDir = os.path.join(bookDir,'img')
   if not os.path.exists(imgDir) :
       print "Can not find image directory in unencrypted book"
       sys.exit(-1)
   glyphsDir = os.path.join(bookDir,'glyphs')
   if not os.path.exists(glyphsDir) :
       print "Can not find glyphs directory in unencrypted book"
       sys.exit(-1)
   metaFile = os.path.join(bookDir,'metadata0000.dat')
   if not os.path.exists(metaFile) :
       print "Can not find metadata0000.dat in unencrypted book"
       sys.exit(-1)
   svgDir = os.path.join(bookDir,'svg')
   if not os.path.exists(svgDir) :
       os.makedirs(svgDir)
   print 'Processing Meta Data ... '
   print '     ', 'metadata0000.dat'
   fname = os.path.join(bookDir,'metadata0000.dat')
   metadata = decode_meta.getMetaArray(fname)
   print 'Processing Glyphs ... '
   filenames = os.listdir(glyphsDir)
   filenames = sorted(filenames)
   glyfname = os.path.join(svgDir,'glyphs.svg')
   glyfile = open(glyfname, 'w')
   glyfile.write('<?xml version="1.0" standalone="no"?>\n')
   glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
   glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
   glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
   glyfile.write('<defs>\n')
   counter = 0
   for filename in filenames:
       print '     ', filename
       fname = os.path.join(glyphsDir,filename)
       flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
       gp = GParser(flat_xml)
       for i in xrange(0, gp.count):
           path = gp.getPath(i)
           glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
       counter += 1
   glyfile.write('</defs>\n')
   glyfile.write('</svg>\n')
   glyfile.close()
   print 'Processing Pages ... '
   scaledpi = 720
   filenames = os.listdir(pageDir)
   filenames = sorted(filenames)
   counter = 0
   for filename in filenames:
       print '     ', filename
       fname = os.path.join(pageDir,filename)
       flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
       pp = PParser(flat_xml)
       pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
       pfile.write('<?xml version="1.0" standalone="no"?>\n')
       pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
       pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
       pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
       if (pp.gid != None): 
           pfile.write('<defs>\n')
           gdefs = pp.getGlyphs(glyfname)
           for j in xrange(0,len(gdefs)):
               pfile.write(gdefs[j])
           pfile.write('</defs>\n')
           for j in xrange(0,len(pp.gid)):
               pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
       img = pp.getImages()
       if (img != None):
           for j in xrange(0,len(img)):
               pfile.write(img[j])
       pfile.write('</svg>')
       pfile.close()
       counter += 1
   print 'Processing Complete'
   return 0
 # script entry point: the empty string tells main() to read sys.argv
 if __name__ == '__main__':
   exitcode = main('')
   sys.exit(exitcode)
--- a/Topaz_Tools/lib/genxml.py
+++ b/Topaz_Tools/lib/genxml.py
@ -0,0 +1,121 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 import os, sys, getopt
 # local routines
 import convert2xml
 import flatxml2html
 import decode_meta
 def usage():
    print 'Usage: '
    print ' '
    print '   genxml.py dict0000.dat unencryptedBookDir'
    print '  '
 def main(argv):
    bookDir = ''
    if len(argv) == 0:
        argv = sys.argv
    else :
        argv = argv.split()
    try:
        opts, args = getopt.getopt(argv[1:], "h:")
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2) 
    for o, a in opts:
        if o =="-h":
            usage()
            sys.exit(0)
    bookDir = args[0]
    if not os.path.exists(bookDir) :
        print "Can not find directory with unencrypted book"
        sys.exit(-1)
    dictFile = os.path.join(bookDir,'dict0000.dat')
    if not os.path.exists(dictFile) :
        print "Can not find dict0000.dat file"
        sys.exit(-1)
    pageDir = os.path.join(bookDir,'page')
    if not os.path.exists(pageDir) :
        print "Can not find page directory in unencrypted book"
        sys.exit(-1)
    glyphsDir = os.path.join(bookDir,'glyphs')
    if not os.path.exists(glyphsDir) :
        print "Can not find glyphs directory in unencrypted book"
        sys.exit(-1)
    otherFile = os.path.join(bookDir,'other0000.dat')
    if not os.path.exists(otherFile) :
        print "Can not find other0000.dat in unencrypted book"
        sys.exit(-1)
    metaFile = os.path.join(bookDir,'metadata0000.dat')
    if not os.path.exists(metaFile) :
        print "Can not find metadata0000.dat in unencrypted book"
        sys.exit(-1)
    xmlDir = os.path.join(bookDir,'xml')
    if not os.path.exists(xmlDir):
        os.makedirs(xmlDir)
    print 'Processing ... '
    print '     ', 'metadata0000.dat'
    fname = os.path.join(bookDir,'metadata0000.dat')
    xname = os.path.join(xmlDir, 'metadata.txt')
    metastr = decode_meta.getMetaData(fname)
    file(xname, 'wb').write(metastr)
    print '     ', 'other0000.dat'
    fname = os.path.join(bookDir,'other0000.dat')
    xname = os.path.join(xmlDir, 'stylesheet.xml')
    xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
    file(xname, 'wb').write(xmlstr)
    filenames = os.listdir(pageDir)
    filenames = sorted(filenames)
    for filename in filenames:
        print '     ', filename
        fname = os.path.join(pageDir,filename)
        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
        xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
        file(xname, 'wb').write(xmlstr)
    filenames = os.listdir(glyphsDir)
    filenames = sorted(filenames)
    for filename in filenames:
        print '     ', filename
        fname = os.path.join(glyphsDir,filename)
        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
        xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
        file(xname, 'wb').write(xmlstr)
    print 'Processing Complete'
    return 0
 # script entry point: the empty string tells main() to read sys.argv
 if __name__ == '__main__':
    exitcode = main('')
    sys.exit(exitcode)
--- a/Topaz_Tools/lib/readme.txt
+++ b/Topaz_Tools/lib/readme.txt
@ -0,0 +1,75 @@
 This is experimental and it will probably not work for you but...
 ALSO:  Please do not use any of this to steal.  Theft is wrong. 
       This is meant to allow conversion of Topaz books for other book readers you own
 Here are the steps:
 1. Unzip the topazscripts.zip file to get the full set of python scripts.
 The files you should have after unzipping are:
 cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
 decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
 convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
 flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
 stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
 genxml.py - main program to convert everything to xml
 genhtml.py - main program to generate "book.html"
 gensvg.py - (author: clarknova) main program to create an svg graphic of each page
 Please note, gensvg.py, genhtml.py, and genxml.py import and use
 decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py 
 so please keep all of these python scripts together in the same place.
 2. Remove the DRM from the Topaz book and build a directory 
 of its contents as files
 All thanks go to CMBTC, who broke the DRM for Topaz - without that work 
 nothing else would be possible
   cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE
 This should create a directory called "TARGETDIR" in your current directory.  
 It should have the following files in it:
 metadata0000.dat - metadata info
 other0000.dat - information used to create a style sheet
 dict0000.dat - dictionary of words used to build page descriptions
 page - directory filled with page*.dat files
 glyphs - directory filled with glyphs*.dat files
 3. Convert the files in "TARGETDIR" to their xml descriptions
 which can be found in TARGETDIR/xml/ upon completion.
   genxml.py TARGETDIR
 4. Create book.html which can be found in "TARGETDIR" after 
 completion.  This html conversion can not fully capture 
 all of the layouts actually used in the book and needs to 
 be edited to include special font handling such as bold 
 or italics that can not be determined from the ocrText
 information or the style information.  If you want to 
 see things exactly as they were, see step 5 below.
   genhtml.py TARGETDIR
 5. Create an svg description of each page which can
 be found in TARGETDIR/svg/ upon completion.
 All thanks go to CLARKNOVA for this program.  This program is 
 needed to actually see the true image of each page so that hand
 editing of the html created by step 4 can be done.  
 Or use the resulting svg files to read each page of the book
 exactly as it has been laid out originally.
   gensvg.py TARGETDIR
--- a/Topaz_Tools/lib/stylexml2css.py
+++ b/Topaz_Tools/lib/stylexml2css.py
@ -0,0 +1,221 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 from __future__ import with_statement
 import csv
 import sys
 import os
 import getopt
 from struct import pack
 from struct import unpack
 class DocParser(object):
    """Translate the flattened xml of a Topaz stylesheet into css rules.
    The document is a list of lines of the form name or name=value;
    process() walks every book.stylesheet.style entry and emits css for
    the ones it knows how to convert.
    """
    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
    # style tags that can be converted, mapped to the css selector prefix they become
    stags = {
        'paragraph' : 'p',
        'graphic'   : '.graphic'
    }
    # numeric attributes: name -> (css property text, divisor turning the raw value into ems)
    attr_val_map = {
        'hang'            : ('text-indent: ', 135),
        'indent'          : ('text-indent: ', 135),
        'line-space'      : ('line-height: ', 190),
        'margin-bottom'   : ('margin-bottom: ', 135),
        'margin-left'     : ('margin-left: ', 135),
        'margin-right'    : ('margin-right: ', 135),
        'margin-top'      : ('margin-top: ', 135),
        'space-after'     : ('padding-bottom: ', 135),
    }
    # keyword attributes: "<attr>-<value>" -> literal css declarations
    attr_str_map = {
        'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
        'align-left'   : 'text-align: left;',
        'align-right'  : 'text-align: right;',
        'align-justify' : 'text-align: justify;',
        'display-inline' : 'display: inline;',
        'pos-left' : 'text-align: left;',
        'pos-right' : 'text-align: right;',
        'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
    }
    # first three characters of a class name -> heading element that gets an extra rule
    heading_tags = {
        'ch1' : 'h1',
        'ch2' : 'h2',
        'ch3' : 'h3',
        'h1-' : 'h4',
        'h2-' : 'h5',
        'h3-' : 'h6',
    }
    def findinDoc(self, tagpath, pos, end) :
        """Find the first line in [pos, end) whose name ends with tagpath.
        end == -1 means "to the end of the document".  Returns the pair
        (line index, value); the value is '' for a line without '='.
        Returns (-1, None) when nothing matches.
        """
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1 :
            end = cnt
        else:
            end = min(cnt,end)
        j = pos
        while j < end:
            # partition keeps any further '=' inside the value
            (name, sep, argres) = docList[j].partition('=')
            if name.endswith(tagpath) :
                return j, argres
            j += 1
        return -1, None
    def posinDoc(self, tagpath):
        """Return the line index of every line whose name ends with tagpath."""
        startpos = []
        pos = 0
        while True :
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res is None :
                break
            startpos.append(foundpos)
            pos = foundpos + 1
        return startpos
    def process(self):
        """Return the css text for all convertible styles of the document.
        A style is converted when its tag is listed in stags, it has no
        "after class" specifier and at least one of its rules maps to css.
        Each converted style produces one rule line, preceded by an extra
        heading rule when its class name starts with a heading_tags key.
        """
        csspage = ''
        # generate a list of each <style> starting point in the stylesheet
        styleList= self.posinDoc('book.stylesheet.style')
        stylecnt = len(styleList)
        styleList.append(-1)
        # process each style converting what you can
        for j in range(stylecnt):
            start = styleList[j]
            end = styleList[j+1]
            (pos, tag) = self.findinDoc('style._tag',start,end)
            if tag is None :
                (pos, tag) = self.findinDoc('style.type',start,end)
            # Is this something we know how to convert to css
            if tag not in self.stags :
                continue
            # get the style class
            (pos, sclass) = self.findinDoc('style.class',start,end)
            if sclass is not None:
                sclass = '.cl_' + sclass.lower()
            else :
                sclass = ''
            # check for any "after class" specifiers
            (pos, aftclass) = self.findinDoc('style._after_class',start,end)
            # decided afresh for every style so that nothing carries over from the previous one
            keep = False
            cssargs = {}
            while True :
                (apos, attr) = self.findinDoc('style.rule.attr', start, end)
                (vpos, val) = self.findinDoc('style.rule.value', start, end)
                # a rule needs both parts; stop at the first incomplete one
                if attr is None or val is None : break
                if attr in ('display', 'pos', 'align'):
                    # handle text based attributes
                    attr = attr + '-' + val
                    if attr in self.attr_str_map :
                        cssargs[attr] = (self.attr_str_map[attr], '')
                        keep = True
                elif attr in self.attr_val_map :
                    # handle value based attributes
                    (name, scale) = self.attr_val_map[attr]
                    if not ((attr == 'hang') and (int(val) == 0)) :
                        # float division: the result is printed with one decimal place
                        cssargs[attr] = (name, int(val) / float(scale))
                        keep = True
                # continue after this rule whichever of its two lines comes last
                start = max(apos, vpos) + 1
            # disable all of the after class tags until I figure out how to handle them
            if aftclass is not None : keep = False
            if not keep :
                continue
            # remove all numerals after the "reclustered"
            p = sclass.find('reclustered')
            if p >= 0:
                sclass = sclass[0:p+11]
            # make sure line-space does not go below 1em
            if 'line-space' in cssargs:
                (seg, val) = cssargs['line-space']
                if val < 1.0: val = 1.0
                cssargs['line-space'] = (seg, val)
            # handle modifications for css style hanging indents
            if 'hang' in cssargs:
                (hseg, hval) = cssargs['hang']
                cssargs['hang'] = (hseg, -hval)
                if 'margin-left' in cssargs:
                    (mseg, mval) = cssargs['margin-left']
                    cssargs['margin-left'] = (mseg, hval + mval)
                if 'indent' in cssargs:
                    del cssargs['indent']
            cssline = sclass + ' { '
            for key in cssargs:
                (mseg, mval) = cssargs[key]
                if mval == '':
                    cssline += mseg + ' '
                else :
                    cssline += mseg + '%.1fem;' % mval + ' '
            cssline += '}'
            # handle special case of paragraph class used inside chapter heading
            # and non-chapter headings
            if sclass != '' :
                # sclass starts with '.cl_', so [4:7] is the start of the class name
                ctype = sclass[4:7]
                if ctype in self.heading_tags :
                    csspage += self.heading_tags[ctype] + cssline + '\n'
            csspage += self.stags[tag] + cssline + '\n'
        return csspage
 def convert2CSS(flatxml):
    """Convert a flattened-xml stylesheet description to css text.
    Builds a DocParser from flatxml and returns the value of its
    process() method.
    """
    return DocParser(flatxml).process()