diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py new file mode 100644 index 0000000..9cd32de --- /dev/null +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -0,0 +1,865 @@ +#! /usr/bin/python + +""" + +Comprehensive Mazama Book DRM with Topaz Cryptography V2.0 + +-----BEGIN PUBLIC KEY----- +MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6 +M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC +B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx +y2/pHuYme7U1TsgSjwIDAQAB +-----END PUBLIC KEY----- + +""" + +from __future__ import with_statement + +import csv +import sys +import os +import getopt +import zlib +from struct import pack +from struct import unpack +from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \ + create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \ + string_at, Structure, c_void_p, cast +import _winreg as winreg +import Tkinter +import Tkconstants +import tkMessageBox +import traceback +import hashlib + +MAX_PATH = 255 + +kernel32 = windll.kernel32 +advapi32 = windll.advapi32 +crypt32 = windll.crypt32 + +global kindleDatabase +global bookFile +global bookPayloadOffset +global bookHeaderRecords +global bookMetadata +global bookKey +global command + +# +# Various character maps used to decrypt books. 
# Probably supposed to act as obfuscation
#

# Four distinct alphabets; which one is used depends on what is being
# encoded (hashes, DSN material, PID characters, ...).
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"

#
# Exceptions for all the problems that might happen during the script
#

class CMBDTCError(Exception):
    # Recoverable problem (e.g. a dkey record that does not match a tried PID).
    pass

class CMBDTCFatal(Exception):
    # Unrecoverable problem: bad command line, corrupt book file, crypto failure.
    pass

#
# Stolen stuff
#

# win32 CRYPTOAPI DATA_BLOB: cbData is the byte count, pbData the buffer pointer.
class DataBlob(Structure):
    _fields_ = [('cbData', c_uint),
                ('pbData', c_void_p)]
DataBlob_p = POINTER(DataBlob)

def GetSystemDirectory():
    """Factory: binds kernel32.GetSystemDirectoryW once and returns a
    zero-argument wrapper that yields the Windows system directory path."""
    GetSystemDirectoryW = kernel32.GetSystemDirectoryW
    GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
    GetSystemDirectoryW.restype = c_uint
    def GetSystemDirectory():
        buffer = create_unicode_buffer(MAX_PATH + 1)
        GetSystemDirectoryW(buffer, len(buffer))
        return buffer.value
    return GetSystemDirectory
# Replace the factory with the bound wrapper at import time.
GetSystemDirectory = GetSystemDirectory()


def GetVolumeSerialNumber():
    """Factory: returns a wrapper that reads the volume serial number of the
    filesystem mounted at `path` (other volume information is discarded)."""
    GetVolumeInformationW = kernel32.GetVolumeInformationW
    GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
                                      POINTER(c_uint), POINTER(c_uint),
                                      POINTER(c_uint), c_wchar_p, c_uint]
    GetVolumeInformationW.restype = c_uint
    def GetVolumeSerialNumber(path):
        vsn = c_uint(0)
        GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
        return vsn.value
    return GetVolumeSerialNumber
GetVolumeSerialNumber = GetVolumeSerialNumber()


def GetUserName():
    """Factory: returns a wrapper that yields the current Windows user name
    as a byte string (UTF-16-LE low bytes, i.e. effectively Latin-1)."""
    GetUserNameW = advapi32.GetUserNameW
    GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
    GetUserNameW.restype = c_uint
    def GetUserName():
        buffer = create_unicode_buffer(32)
        size = c_uint(len(buffer))
        # Grow the buffer until the API call succeeds.
        while not GetUserNameW(buffer, byref(size)):
            buffer = create_unicode_buffer(len(buffer) * 2)
            size.value = len(buffer)
        return buffer.value.encode('utf-16-le')[::2]
    return GetUserName
GetUserName = GetUserName()


def CryptUnprotectData():
    """Factory: returns a wrapper around the DPAPI CryptUnprotectData call,
    used to decrypt values stored in kindle.info with optional entropy."""
    _CryptUnprotectData = crypt32.CryptUnprotectData
    _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
                                    c_void_p, c_void_p, c_uint, DataBlob_p]
    _CryptUnprotectData.restype = c_uint
    def CryptUnprotectData(indata, entropy):
        indatab = create_string_buffer(indata)
        indata = DataBlob(len(indata), cast(indatab, c_void_p))
        entropyb = create_string_buffer(entropy)
        entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
        outdata = DataBlob()
        if not _CryptUnprotectData(byref(indata), None, byref(entropy),
                                   None, None, 0, byref(outdata)):
            raise CMBDTCFatal("Failed to Unprotect Data")
        return string_at(outdata.pbData, outdata.cbData)
    return CryptUnprotectData
CryptUnprotectData = CryptUnprotectData()

#
# Returns the MD5 digest of "message"
#

def MD5(message):
    ctx = hashlib.md5()
    ctx.update(message)
    return ctx.digest()

#
# Returns the SHA-1 digest of "message"
#

def SHA1(message):
    ctx = hashlib.sha1()
    ctx.update(message)
    return ctx.digest()

#
# Open the book file at path
#

def openBook(path):
    try:
        return open(path,'rb')
    except:
        raise CMBDTCFatal("Could not open book file: " + path)

#
# Encode the bytes in data with the characters in map
#

def encode(data, map):
    # Each input byte becomes two map characters: the XOR-0x80 quotient
    # and the plain remainder of the byte by len(map).
    result = ""
    for char in data:
        value = ord(char)
        Q = (value ^ 0x80) // len(map)
        R = value % len(map)
        result += map[Q]
        result += map[R]
    return result

#
# Hash the bytes in data and then encode the digest with the characters in map
#

def encodeHash(data,map):
    return encode(MD5(data),map)

#
# Decode the string in data with the characters in map.
Returns the decoded bytes +# + +def decode(data,map): + result = "" + for i in range (0,len(data),2): + high = map.find(data[i]) + low = map.find(data[i+1]) + value = (((high * 0x40) ^ 0x80) & 0xFF) + low + result += pack("B",value) + return result + +# +# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application) +# + +def openKindleInfo(): + regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\") + path = winreg.QueryValueEx(regkey, 'Local AppData')[0] + return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r') + +# +# Parse the Kindle.info file and return the records as a list of key-values +# + +def parseKindleInfo(): + DB = {} + infoReader = openKindleInfo() + infoReader.read(1) + data = infoReader.read() + items = data.split('{') + + for item in items: + splito = item.split(':') + DB[splito[0]] =splito[1] + return DB + +# +# Find if the original string for a hashed/encoded string is known. If so return the original string othwise return an empty string. (Totally not optimal) +# + +def findNameForHash(hash): + names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"] + result = "" + for name in names: + if hash == encodeHash(name, charMap2): + result = name + break + return name + +# +# Print all the records from the kindle.info file (option -i) +# + +def printKindleInfo(): + for record in kindleDatabase: + name = findNameForHash(record) + if name != "" : + print (name) + print ("--------------------------\n") + else : + print ("Unknown Record") + print getKindleInfoValueForHash(record) + print "\n" +# +# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). 
# Return the decoded and decrypted record
#

def getKindleInfoValueForHash(hashedKey):
    """Look up an already-hashed key in the kindle.info store, undo the
    charMap2 encoding and decrypt the value with DPAPI (no extra entropy)."""
    global kindleDatabase
    raw = kindleDatabase[hashedKey]
    return CryptUnprotectData(decode(raw, charMap2), "")

#
# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
#

def getKindleInfoValueForKey(key):
    """Plaintext-key variant: hash/encode the key first, then look it up."""
    return getKindleInfoValueForHash(encodeHash(key, charMap2))

#
# Get a 7 bit encoded number from the book file
#

def bookReadEncodedNumber():
    """Read one variable-length integer from bookFile.

    Format: optional 0xFF prefix marks a negative value; then big-endian
    groups of 7 bits, every byte except the last having its high bit set.
    """
    negative = False
    value = ord(bookFile.read(1))

    if value == 0xFF:
        negative = True
        value = ord(bookFile.read(1))

    if value >= 0x80:
        # Multi-byte case: accumulate 7 bits per byte until a byte < 0x80.
        accum = value & 0x7F
        while value >= 0x80:
            value = ord(bookFile.read(1))
            accum = (accum << 7) + (value & 0x7F)
        value = accum

    return -value if negative else value

#
# Encode a number in 7 bit format
#

def encodeNumber(number):
    """Encode an integer into the 7-bit big-endian variable-length format.

    NOTE(review): for negative input the magnitude is biased by +1
    (`-number + 1`) - kept exactly as the original; only non-negative
    lengths are encoded by this script.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    chunks = []
    continuation = 0
    while True:
        chunks.append(chr((number & 0x7F) + continuation))
        number = number >> 7
        continuation = 0x80
        if number == 0:
            break

    if negative:
        # Negative marker; ends up first after the final reversal.
        chunks.append(chr(0xFF))

    return "".join(chunks)[::-1]

#
# Get a length prefixed string from the file
#

def bookReadString():
    """Read a string whose encoded length precedes its bytes."""
    size = bookReadEncodedNumber()
    return unpack(str(size) + "s", bookFile.read(size))[0]

#
# Returns a length prefixed string
#

def lengthPrefixString(data):
    """Prepend the encoded length of `data` to `data` itself."""
    return "%s%s" % (encodeNumber(len(data)), data)


#
# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
+# + +def bookReadHeaderRecordData(): + nbValues = bookReadEncodedNumber() + values = [] + for i in range (0,nbValues): + values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()]) + return values + +# +# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...] +# + +def parseTopazHeaderRecord(): + if ord(bookFile.read(1)) != 0x63: + raise CMBDTCFatal("Parse Error : Invalid Header") + + tag = bookReadString() + record = bookReadHeaderRecordData() + return [tag,record] + +# +# Parse the header of a Topaz file, get all the header records and the offset for the payload +# + +def parseTopazHeader(): + global bookHeaderRecords + global bookPayloadOffset + magic = unpack("4s",bookFile.read(4))[0] + + if magic != 'TPZ0': + raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file") + + nbRecords = bookReadEncodedNumber() + bookHeaderRecords = {} + + for i in range (0,nbRecords): + result = parseTopazHeaderRecord() + print result[0], result[1] + bookHeaderRecords[result[0]] = result[1] + + if ord(bookFile.read(1)) != 0x64 : + raise CMBDTCFatal("Parse Error : Invalid Header") + + bookPayloadOffset = bookFile.tell() + +# +# Get a record in the book payload, given its name and index. If necessary the record is decrypted. 
# The record is not decompressed
# Correction, the record is correctly decompressed too
#

def getBookPayloadRecord(name, index):
    """Return payload record `index` of section `name`, decrypted (with the
    global bookKey) and zlib-decompressed as the header flags dictate.

    Raises CMBDTCFatal on any structural mismatch.
    """
    encrypted = False
    compressed = False

    try:
        recordOffset = bookHeaderRecords[name][index][0]
    except:
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")

    bookFile.seek(bookPayloadOffset + recordOffset)

    tag = bookReadString()
    if tag != name :
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")

    recordIndex = bookReadEncodedNumber()

    # A negative stored index flags an encrypted record; the real index is -(i+1).
    if recordIndex < 0 :
        encrypted = True
        recordIndex = -recordIndex -1

    if recordIndex != index :
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")

    # A non-zero compressed length means the stored bytes are zlib-compressed.
    if (bookHeaderRecords[name][index][2] > 0):
        compressed = True
        record = bookFile.read(bookHeaderRecords[name][index][2])
    else:
        record = bookFile.read(bookHeaderRecords[name][index][1])

    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record,ctx)

    if compressed:
        record = zlib.decompress(record)

    return record

#
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
#

def extractBookPayloadRecord(name, index, filename):
    """Fetch one payload record and write it to `filename`, or print it
    when `filename` is empty.

    BUG FIX: the original caught the lookup failure, printed a message and
    then fell through to use the unbound local `record` (NameError). We now
    return early instead. The superseded commented-out decompression code
    was removed (getBookPayloadRecord already decompresses).
    """
    try:
        record = getBookPayloadRecord(name,index)
    except:
        print("Could not find record")
        return

    if filename != "":
        try:
            # `out` instead of `file`: avoid shadowing the builtin.
            out = open(filename,"wb")
            out.write(record)
            out.close()
        except:
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)

#
# return next record [key,value] from the book metadata from the current book position
#

def readMetadataRecord():
    """Read one [key, value] pair of length-prefixed strings."""
    return [bookReadString(),bookReadString()]

#
# Parse the metadata record from the book payload and return a list of
[key,values] +# + +def parseMetadata(): + global bookHeaderRecords + global bookPayloadAddress + global bookMetadata + bookMetadata = {} + bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0]) + tag = bookReadString() + if tag != "metadata" : + raise CMBDTCFatal("Parse Error : Record Names Don't Match") + + flags = ord(bookFile.read(1)) + nbRecords = ord(bookFile.read(1)) + + for i in range (0,nbRecords) : + record =readMetadataRecord() + bookMetadata[record[0]] = record[1] + +# +# Returns two bit at offset from a bit field +# + +def getTwoBitsFromBitField(bitField,offset): + byteNumber = offset // 4 + bitPosition = 6 - 2*(offset % 4) + + return ord(bitField[byteNumber]) >> bitPosition & 3 + +# +# Returns the six bits at offset from a bit field +# + +def getSixBitsFromBitField(bitField,offset): + offset *= 3 + value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2) + return value + +# +# 8 bits to six bits encoding from hash to generate PID string +# + +def encodePID(hash): + global charMap3 + PID = "" + for position in range (0,8): + PID += charMap3[getSixBitsFromBitField(hash,position)] + return PID + +# +# Context initialisation for the Topaz Crypto +# + +def topazCryptoInit(key): + ctx1 = 0x0CAFFE19E + + for keyChar in key: + keyByte = ord(keyChar) + ctx2 = ctx1 + ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF ) + return [ctx1,ctx2] + +# +# decrypt data with the context prepared by topazCryptoInit() +# + +def topazCryptoDecrypt(data, ctx): + ctx1 = ctx[0] + ctx2 = ctx[1] + + plainText = "" + + for dataChar in data: + dataByte = ord(dataChar) + m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF + ctx2 = ctx1 + ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF) + plainText += chr(m) + + return plainText + +# +# Decrypt a payload record with the PID +# + +def 
decryptRecord(data,PID): + ctx = topazCryptoInit(PID) + return topazCryptoDecrypt(data, ctx) + +# +# Try to decrypt a dkey record (contains the book PID) +# + +def decryptDkeyRecord(data,PID): + record = decryptRecord(data,PID) + fields = unpack("3sB8sB8s3s",record) + + if fields[0] != "PID" or fields[5] != "pid" : + raise CMBDTCError("Didn't find PID magic numbers in record") + elif fields[1] != 8 or fields[3] != 8 : + raise CMBDTCError("Record didn't contain correct length fields") + elif fields[2] != PID : + raise CMBDTCError("Record didn't contain PID") + + return fields[4] + +# +# Decrypt all the book's dkey records (contain the book PID) +# + +def decryptDkeyRecords(data,PID): + nbKeyRecords = ord(data[0]) + records = [] + data = data[1:] + for i in range (0,nbKeyRecords): + length = ord(data[0]) + try: + key = decryptDkeyRecord(data[1:length+1],PID) + records.append(key) + except CMBDTCError: + pass + data = data[1+length:] + + return records + +# +# Encryption table used to generate the device PID +# + +def generatePidEncryptionTable() : + table = [] + for counter1 in range (0,0x100): + value = counter1 + for counter2 in range (0,8): + if (value & 1 == 0) : + value = value >> 1 + else : + value = value >> 1 + value = value ^ 0xEDB88320 + table.append(value) + return table + +# +# Seed value used to generate the device PID +# + +def generatePidSeed(table,dsn) : + value = 0 + for counter in range (0,4) : + index = (ord(dsn[counter]) ^ value) &0xFF + value = (value >> 8) ^ table[index] + return value + +# +# Generate the device PID +# + +def generateDevicePID(table,dsn,nbRoll): + seed = generatePidSeed(table,dsn) + pidAscii = "" + pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] + index = 0 + + for counter in range (0,nbRoll): + pid[index] = pid[index] ^ ord(dsn[counter]) + index = (index+1) %8 + + for counter in range (0,8): + index = ((((pid[counter] >>5) & 3) ^ 
pid[counter]) & 0x1f) + (pid[counter] >> 7) + pidAscii += charMap4[index] + return pidAscii + +# +# Create decrypted book payload +# + +def createDecryptedPayload(payload): + for headerRecord in bookHeaderRecords: + name = headerRecord + if name != "dkey" : + ext = '.dat' + if name == 'img' : ext = '.jpg' + for index in range (0,len(bookHeaderRecords[name])) : + fnum = "%04d" % index + fname = name + fnum + ext + destdir = payload + if name == 'img': + destdir = os.path.join(payload,'img') + if name == 'page': + destdir = os.path.join(payload,'page') + if name == 'glyphs': + destdir = os.path.join(payload,'glyphs') + outputFile = os.path.join(destdir,fname) + file(outputFile, 'wb').write(getBookPayloadRecord(name, index)) + + +# Create decrypted book +# + +def createDecryptedBook(outdir): + if not os.path.exists(outdir): + os.makedirs(outdir) + + destdir = os.path.join(outdir,'img') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'page') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'glyphs') + if not os.path.exists(destdir): + os.makedirs(destdir) + + createDecryptedPayload(outdir) + + +# +# Set the command to execute by the programm according to cmdLine parameters +# + +def setCommand(name) : + global command + if command != "" : + raise CMBDTCFatal("Invalid command line parameters") + else : + command = name + +# +# Program usage +# + +def usage(): + print("\nUsage:") + print("\ncmbtc_dump.py [options] bookFileName\n") + print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)") + print("-d Dumps the unencrypted book as files to outdir") + print("-o Output directory to save book files to") + print("-v Verbose (can be used several times)") + print("-i Prints kindle.info database") + +# +# Main +# + +def main(argv=sys.argv): + global kindleDatabase + global bookMetadata + global bookKey + global bookFile + global command + + 
progname = os.path.basename(argv[0]) + + verbose = 0 + recordName = "" + recordIndex = 0 + outdir = "" + PIDs = [] + kindleDatabase = None + command = "" + + + try: + opts, args = getopt.getopt(sys.argv[1:], "vi:o:p:d") + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o == "-v": + verbose+=1 + if o == "-i": + setCommand("printInfo") + if o =="-o": + if a == None : + raise CMBDTCFatal("Invalid parameter for -o") + outdir = a + if o =="-p": + PIDs.append(a) + if o =="-d": + setCommand("doit") + + if command == "" : + raise CMBDTCFatal("No action supplied on command line") + + # + # Read the encrypted database + # + + try: + kindleDatabase = parseKindleInfo() + except Exception as message: + if verbose>0: + print(message) + + if kindleDatabase != None : + if command == "printInfo" : + printKindleInfo() + + # + # Compute the DSN + # + + # Get the Mazama Random number + MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber") + + # Get the HDD serial + encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1) + + # Get the current user name + encodedUsername = encodeHash(GetUserName(),charMap1) + + # concat, hash and encode + DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1) + + if verbose >1: + print("DSN: " + DSN) + + # + # Compute the device PID + # + + table = generatePidEncryptionTable() + devicePID = generateDevicePID(table,DSN,4) + PIDs.append(devicePID) + + if verbose > 0: + print("Device PID: " + devicePID) + + # + # Open book and parse metadata + # + + if len(args) == 1: + + bookFile = openBook(args[0]) + parseTopazHeader() + parseMetadata() + + # + # Compute book PID + # + + # Get the account token + + if kindleDatabase != None: 
+ kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens") + + if verbose >1: + print("Account Token: " + kindleAccountToken) + + keysRecord = bookMetadata["keys"] + keysRecordRecord = bookMetadata[keysRecord] + + pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord) + + bookPID = encodePID(pidHash) + PIDs.append(bookPID) + + if verbose > 0: + print ("Book PID: " + bookPID ) + + # + # Decrypt book key + # + + dkey = getBookPayloadRecord('dkey', 0) + + bookKeys = [] + for PID in PIDs : + bookKeys+=decryptDkeyRecords(dkey,PID) + + if len(bookKeys) == 0 : + if verbose > 0 : + print ("Book key could not be found. Maybe this book is not registered with this device.") + else : + bookKey = bookKeys[0] + if verbose > 0: + print("Book key: " + bookKey.encode('hex')) + + + + if command == "printRecord" : + extractBookPayloadRecord(recordName,int(recordIndex),outputFile) + if outputFile != "" and verbose>0 : + print("Wrote record to file: "+outputFile) + elif command == "doit" : + if outdir != "" : + createDecryptedBook(outdir) + if verbose >0 : + print ("Decrypted book saved. Don't pirate!") + elif verbose > 0: + print("Output directory name was not supplied.") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py new file mode 100644 index 0000000..86d08d4 --- /dev/null +++ b/Topaz_Tools/lib/convert2xml.py @@ -0,0 +1,821 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +# Get a 7 bit encoded number from string. 
The most +# significant byte comes first and has the high bit (8th) set + +def readEncodedNumber(file): + flag = False + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data == 0xFF: + flag = True + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data >= 0x80: + datax = (data & 0x7F) + while data >= 0x80 : + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + datax = (datax <<7) + (data & 0x7F) + data = datax + + if flag: + data = -data + return data + + +# returns a binary string that encodes a number into 7 bits +# most significant byte first which has the high bit set + +def encodeNumber(number): + result = "" + negative = False + flag = 0 + + if number < 0 : + number = -number + 1 + negative = True + + while True: + byte = number & 0x7F + number = number >> 7 + byte += flag + result += chr(byte) + flag = 0x80 + if number == 0 : break + + if negative: + result += chr(0xFF) + + return result[::-1] + + + +# create / read a length prefixed string from the file + +def lengthPrefixString(data): + return encodeNumber(len(data))+data + +def readString(file): + stringLength = readEncodedNumber(file) + if (stringLength == None): + return "" + sv = file.read(stringLength) + if (len(sv) != stringLength): + return "" + return unpack(str(stringLength)+"s",sv)[0] + + +# convert a binary string generated by encodeNumber (7 bit encoded number) +# to the value you would find inside the page*.dat files to be processed + +def convert(i): + result = '' + val = encodeNumber(i) + for j in xrange(len(val)): + c = ord(val[j:j+1]) + result += '%02x' % c + return result + + + +# the complete string table used to store all book text content +# as well as the xml tokens and values that make sense out of it + +class Dictionary(object): + def __init__(self, dictFile): + self.filename = dictFile + self.size = 0 + self.fo = file(dictFile,'rb') + self.stable = [] + self.size = readEncodedNumber(self.fo) + for i in 
xrange(self.size): + self.stable.append(self.escapestr(readString(self.fo))) + self.pos = 0 + + def escapestr(self, str): + str = str.replace('&','&') + str = str.replace('<','<') + str = str.replace('>','>') + str = str.replace('=','=') + return str + + def lookup(self,val): + if ((val >= 0) and (val < self.size)) : + self.pos = val + return self.stable[self.pos] + else: + print "Error - %d outside of string table limits" % val + sys.exit(-1) + + def getSize(self): + return self.size + + def getPos(self): + return self.pos + + def dumpDict(self): + for i in xrange(self.size): + print "%d %s %s" % (i, convert(i), self.stable[i]) + return + +# parses the xml snippets that are represented by each page*.dat file. +# also parses the other0.dat file - the main stylesheet +# and information used to inject the xml snippets into page*.dat files + +class PageParser(object): + def __init__(self, filename, dict, debug, flat_xml): + self.fo = file(filename,'rb') + self.id = os.path.basename(filename).replace('.dat','') + self.dict = dict + self.debug = debug + self.flat_xml = flat_xml + self.tagpath = [] + self.doc = [] + self.snippetList = [] + + + # hash table used to enable the decoding process + # This has all been developed by trial and error so it may still have omissions or + # contain errors + # Format: + # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) + + token_tags = { + 'book' : (1, 'snippets', 1, 0), + 'version' : (1, 'snippets', 1, 0), + 'stylesheet' : (1, 'snippets', 1, 0), + 'links' : (0, 'number', 0, 1), + 'pages' : (0, 'number', 0, 1), + 'page' : (1, 'snippets', 1, 0), + 'group' : (1, 'snippets', 1, 0), + 'region' : (1, 'snippets', 1, 0), + 'reflow' : (1, 'number', 1, 0), + 'img' : (1, 'snippets', 1, 0), + 'paragraph' : (1, 'snippets', 1, 0), + 'extratokens' : (1, 'snippets', 1, 0), + 'style' : (1, 'snippets', 1, 0), + 'rule' : (1, 'snippets', 1, 0), + '_span' : (1, 'snippets', 1, 0), + 
'word_semantic': (1, 'snippets', 1, 1), + 'value' : (1, 'scalar_text', 0, 0), + 'h' : (1, 'scalar_number', 0, 0), + 'w' : (1, 'scalar_number', 0, 0), + 'firstWord' : (1, 'scalar_number', 0, 0), + 'lastWord' : (1, 'scalar_number', 0, 0), + 'x' : (1, 'number', 0, 0), + 'y' : (1, 'number', 0, 0), + 'links.page' : (1, 'number', 0, 0), + 'link_id' : (1, 'number', 0, 0), + 'glyph' : (0, 'number', 1, 1), + 'glyph.h' : (1, 'number', 0, 0), + 'glyph.w' : (1, 'number', 0, 0), + 'sh' : (1, 'number', 0, 0), + 'word' : (0, 'number', 1, 1), + 'src' : (1, 'scalar_number', 0, 0), + 'rel' : (1, 'number', 0, 0), + 'row' : (1, 'number', 0, 0), + 'startID' : (1, 'number', 0, 1), + 'startID.page' : (1, 'number', 0, 0), + 'glyphID' : (1, 'number', 0, 0), + 'rootID' : (1, 'number', 0, 0), + 'stemID' : (1, 'number', 0, 0), + 'margin-top' : (1, 'number', 0, 0), + 'stemPage' : (1, 'number', 0, 0), + 'dehyphen' : (1, 'number', 1, 1), + 'rootID' : (1, 'number', 0, 0), + 'paraCont' : (1, 'number', 1, 1), + 'paraStems' : (1, 'number', 1, 1), + 'wordStems' : (1, 'number', 1, 1), + 'original' : (0, 'number', 0, 1), + 'use' : (1, 'number', 0, 0), + 'vtx' : (1, 'number', 0, 1), + 'len' : (1, 'number', 0, 1), + 'dpi' : (1, 'number', 0, 0), + 'n' : (1, 'number', 0, 0), + 'id' : (1, 'number', 0, 0), + 'ref' : (1, 'number', 0, 0), + 'pnum' : (1, 'number', 0, 0), + 'pid' : (1, 'text', 0, 0), + 'info' : (0, 'number', 1, 0), + 'bl' : (1, 'raw', 0, 0), + 'firstGlyph' : (1, 'raw', 0, 0), + 'lastGlyph' : (1, 'raw', 0, 0), + 'ocrText' : (1, 'text', 0, 0), + 'title' : (1, 'text', 0, 0), + 'href' : (1, 'text', 0, 0), + '_parent_type' : (1, 'text', 0, 0), + 'attr' : (1, 'scalar_text', 0, 0), + 'justify' : (1, 'scalar_text', 0, 0), + 'align' : (1, 'scalar_text', 0, 0), + 'layout' : (1, 'scalar_text', 0, 0), + 'pageid' : (1, 'scalar_text', 0, 0), + 'pagelabel' : (1, 'scalar_text', 0, 0), + 'type' : (1, 'text', 0, 0), + 'class' : (1, 'scalar_text', 0, 0), + 'container' : (1, 'scalar_text', 0, 0), + '_after_class' : 
(1, 'scalar_text', 0, 0), + '_tag' : (1, 'scalar_text', 0, 0), + 'pos' : (1, 'scalar_text', 0, 0), + 'page_num' : (1, 'scalar_text', 0, 0), + 'page_type' : (1, 'scalar_text', 0, 0), + 'findlists' : (1, 'scalar_text', 0, 0), + 'FlowEdit_1_id' : (1, 'scalar_text', 0, 0), + 'FlowEdit_1_version' : (1, 'scalar_text', 0, 0), + 'Schema_id' : (1, 'scalar_text', 0, 0), + 'Schema_version' : (1, 'scalar_text', 0, 0), + 'Topaz_version' : (1, 'scalar_text', 0, 0), + 'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), + 'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), + 'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), + 'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), + 'chapterheaders' : (1, 'scalar_text', 0, 0), + 'creation_date' : (1, 'scalar_text', 0, 0), + 'header_footer' : (1, 'scalar_text', 0, 0), + 'init_from_ocr' : (1, 'scalar_text', 0, 0), + 'letter_insertion' : (1, 'scalar_text', 0, 0), + 'xmlinj_convert' : (1, 'scalar_text', 0, 0), + 'xmlinj_reflow' : (1, 'scalar_text', 0, 0), + 'xmlinj_transform' : (1, 'scalar_text', 0, 0), + } + + + # full tag path record keeping routines + def tag_push(self, token): + self.tagpath.append(token) + def tag_pop(self): + if len(self.tagpath) > 0 : + self.tagpath.pop() + def tagpath_len(self): + return len(self.tagpath) + def get_tagpath(self, i): + cnt = len(self.tagpath) + if i < cnt : result = self.tagpath[i] + for j in xrange(i+1, cnt) : + result += '.' 
+ self.tagpath[j] + return result + + + # list of absolute command byte values values that indicate + # various types of loop meachanisms typically used to generate vectors + + cmd_list = (0x76, 0x76) + + # peek at and return 1 byte that is ahead by i bytes + def peek(self, aheadi): + c = self.fo.read(aheadi) + if (len(c) == 0): + return None + self.fo.seek(-aheadi,1) + c = c[-1:] + return ord(c) + + + # get the next value from the file being processed + def getNext(self): + nbyte = self.peek(1); + if (nbyte == None): + return None + val = readEncodedNumber(self.fo) + return val + + + # format an arg by argtype + def formatArg(self, arg, argtype): + if (argtype == 'text') or (argtype == 'scalar_text') : + result = self.dict.lookup(arg) + elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') : + result = arg + elif (argtype == 'snippets') : + result = arg + else : + print "Error Unknown argtype %s" % argtype + sys.exit(-2) + return result + + + # process the next tag token, recursively handling subtags, + # arguments, and commands + def procToken(self, token): + + known_token = False + self.tag_push(token) + + if self.debug : print 'Processing: ', self.get_tagpath(0) + cnt = self.tagpath_len() + for j in xrange(cnt): + tkn = self.get_tagpath(j) + if tkn in self.token_tags : + num_args = self.token_tags[tkn][0] + argtype = self.token_tags[tkn][1] + subtags = self.token_tags[tkn][2] + splcase = self.token_tags[tkn][3] + ntags = -1 + known_token = True + break + + if known_token : + + # handle subtags if present + subtagres = [] + if (splcase == 1): + # this type of tag uses of escape marker 0x74 indicate subtag count + if self.peek(1) == 0x74: + skip = readEncodedNumber(self.fo) + subtags = 1 + num_args = 0 + + if (subtags == 1): + ntags = readEncodedNumber(self.fo) + if self.debug : print 'subtags: ' + token + ' has ' + str(ntags) + for j in xrange(ntags): + val = readEncodedNumber(self.fo) + 
subtagres.append(self.procToken(self.dict.lookup(val))) + + # arguments can be scalars or vectors of text or numbers + argres = [] + if num_args > 0 : + firstarg = self.peek(1) + if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'): + # single argument is a variable length vector of data + arg = readEncodedNumber(self.fo) + argres = self.decodeCMD(arg,argtype) + else : + # num_arg scalar arguments + for i in xrange(num_args): + argres.append(self.formatArg(readEncodedNumber(self.fo), argtype)) + + # build the return tag + result = [] + tkn = self.get_tagpath(0) + result.append(tkn) + result.append(subtagres) + result.append(argtype) + result.append(argres) + self.tag_pop() + return result + + # all tokens that need to be processed should be in the hash + # table if it may indicate a problem, either new token + # or an out of sync condition + else: + result = [] + if (self.debug): + print 'Unknown Token:', token + self.tag_pop() + return result + + + # special loop used to process code snippets + # it is NEVER used to format arguments. + # builds the snippetList + def doLoop72(self, argtype): + cnt = readEncodedNumber(self.fo) + if self.debug : + result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n' + result += 'of the document is indicated by snippet number sets at the\n' + result += 'end of each snippet. 
\n' + print result + for i in xrange(cnt): + if self.debug: print 'Snippet:',str(i) + snippet = [] + snippet.append(i) + val = readEncodedNumber(self.fo) + snippet.append(self.procToken(self.dict.lookup(val))) + self.snippetList.append(snippet) + return + + + # loop: pass though values unchanged + # DO NOT CHANGE - this has proven to be correct + def doLoop76Mode0(self, argtype, cnt): + result = [] + for i in xrange(cnt): + result.append(self.formatArg(readEncodedNumber(self.fo), argtype)) + return result + + + # loop generating values relative to the *negative* + # of the offset - don't ask why - it just is + # DO NOT CHANGE - this has proven to be correct + def doLoop76Mode1(self, argtype, cnt): + result = [] + offset = -readEncodedNumber(self.fo) + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + offset + result.append(self.formatArg(val, argtype)) + return result + + + # loop generating values with starting value and accumulation + # DO NOT CHANGE - this has proven to be the correct + def doLoop76Mode2(self, argtype, cnt): + result = [] + ptr = readEncodedNumber(self.fo) + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + ptr = ptr + readEncodedNumber(self.fo) + result.append(self.formatArg(ptr, argtype)) + return result + + + # loop generating values with starting value and accumulation + # **after** subtracting adjustment value from each + # DO NOT CHANGE - this has been proven to be correct + def doLoop76Mode3(self, argtype, cnt): + result = [] + adj = readEncodedNumber(self.fo) + ptr = readEncodedNumber(self.fo) + ptr = ptr - adj + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + ptr = ptr + readEncodedNumber(self.fo) - adj + result.append(self.formatArg(ptr,argtype)) + return result + + + # loop using runing sum of data values and starting value + # with accumulation to get new value + # Again, don't ask it took me forever to figure this out + # DO NOT CHANGE - this has been proven to be correct + def 
doLoop76Mode4(self, argtype, cnt): + result = [] + val = readEncodedNumber(self.fo) + runsum = val + ptr = val + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + runsum += readEncodedNumber(self.fo) + ptr = ptr + runsum + result.append(self.formatArg(ptr,argtype)) + return result + + + # loop using and extra value as an adjustment + # and a running sum of the values after subtracting + # the adjustment, added to a ptr to get a new pointer + def doLoop76Mode5(self, argtype, cnt): + result = [] + adj = readEncodedNumber(self.fo) + ptr = 0 + runsum = 0 + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + runsum += (val - adj) + ptr = ptr +runsum + result.append(self.formatArg(ptr,argtype)) + return result + + + # FIXME: I have only 4 points to work this out with inside my book + # So may be wrong but it is correct for my 4 points + def doLoop76Mode6(self, argtype, cnt): + result = [] + oldval = 0 + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + ptr= (3 * oldval) + val + 1 + result.append(self.formatArg(ptr,argtype)) + oldval = val + return result + + + + # dispatches loop commands bytes with various modes + # The 0x76 style loops are used to build vectors + + # This was all derived by trial and error and + # new loop types may exist that are not handled here + # since they did not appear in the test cases + + def decodeCMD(self, cmd, argtype): + + # if (cmd == 0x72): + # self.doLoop72(argtype) + # result =[] + # return result + + if (cmd == 0x76): + # loop with cnt, and mode to control loop styles + cnt = readEncodedNumber(self.fo) + mode = readEncodedNumber(self.fo) + + if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' + + if (mode == 0x00): + return self.doLoop76Mode0(argtype, cnt) + + elif (mode == 0x01): + return self.doLoop76Mode1(argtype, cnt) + + elif (mode == 0x02): + return self.doLoop76Mode2(argtype, cnt) + + elif (mode == 0x03): + return self.doLoop76Mode3(argtype, cnt) + + elif (mode == 0x04): + 
return self.doLoop76Mode4(argtype, cnt) + + elif (mode == 0x05): + return self.doLoop76Mode5(argtype, cnt) + + elif (mode == 0x06): + return self.doLoop76Mode6(argtype, cnt) + + else: + + if self.debug : + # try to mark any unknown loop comands + # if they exist, unless they are used to process + # text or some other known list, we won't be able to prove them correct + print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode) + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + print ' 0x%x' % val, + print ' ' + result = [] + return result + + if self.dbug: print "Unknown command", cmd + result = [] + return result + + # add full tag path to injected snippets + def updateName(self, tag, prefix): + name = tag[0] + subtagList = tag[1] + argtype = tag[2] + argList = tag[3] + nname = prefix + '.' + name + nsubtaglist = [] + for j in subtagList: + nsubtaglist.append(self.updateName(j,prefix)) + ntag = [] + ntag.append(nname) + ntag.append(nsubtaglist) + ntag.append(argtype) + ntag.append(argList) + return ntag + + + + # perform depth first injection of specified snippets into this one + def injectSnippets(self, snippet): + snipno, tag = snippet + name = tag[0] + subtagList = tag[1] + argtype = tag[2] + argList = tag[3] + nsubtagList = [] + if len(argList) > 0 : + for j in argList: + asnip = self.snippetList[j] + aso, atag = self.injectSnippets(asnip) + atag = self.updateName(atag, name) + nsubtagList.append(atag) + argtype='number' + argList=[] + if len(nsubtagList) > 0 : + subtagList.extend(nsubtagList) + tag = [] + tag.append(name) + tag.append(subtagList) + tag.append(argtype) + tag.append(argList) + snippet = [] + snippet.append(snipno) + snippet.append(tag) + return snippet + + + + # format the tag for output + def formatTag(self, node): + name = node[0] + subtagList = node[1] + argtype = node[2] + argList = node[3] + fullpathname = name.split('.') + nodename = fullpathname.pop() + ilvl = len(fullpathname) + indent = ' ' * (3 * ilvl) + result = indent + '<' 
+ nodename + '>' + if len(argList) > 0: + argres = '' + for j in argList: + if (argtype == 'text') or (argtype == 'scalar_text') : + argres += j + '|' + else : + argres += str(j) + ',' + argres = argres[0:-1] + if argtype == 'snippets' : + result += 'snippets:' + argres + else : + result += argres + if len(subtagList) > 0 : + result += '\n' + for j in subtagList: + if len(j) > 0 : + result += self.formatTag(j) + result += indent + '\n' + else: + result += '\n' + return result + + + # flatten tag + def flattenTag(self, node): + name = node[0] + subtagList = node[1] + argtype = node[2] + argList = node[3] + result = name + if (len(argList) > 0): + argres = '' + for j in argList: + if (argtype == 'text') or (argtype == 'scalar_text') : + argres += j + '|' + else : + argres += str(j) + '|' + argres = argres[0:-1] + if argtype == 'snippets' : + result += '.snippets=' + argres + else : + result += '=' + argres + result += '\n' + for j in subtagList: + if len(j) > 0 : + result += self.flattenTag(j) + return result + + + # reduce create xml output + def formatDoc(self, flat_xml): + result = '' + for j in self.doc : + if len(j) > 0: + if flat_xml: + result += self.flattenTag(j) + else: + result += self.formatTag(j) + if self.debug : print result + return result + + + + # main loop - parse the page.dat files + # to create structured document and snippets + + # FIXME: value at end of magic appears to be a subtags count + # but for what? 
For now, inject an 'info" tag as it is in + # every dictionary and seems close to what is meant + # The alternative is to special case the last _ "0x5f" to mean something + + def process(self): + + # peek at the first bytes to see what type of file it is + magic = self.fo.read(11) + if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'): + first_token = 'info' + elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'): + skip = self.fo.read(1) + first_token = 'info' + else : + # other0.dat file + first_token = None + self.fo.seek(-11,1) + + + # main loop to read and build the document tree + while True: + + if first_token != None : + # use "inserted" first token 'info' for page and glyph files + tag = self.procToken(first_token) + if len(tag) > 0 : + self.doc.append(tag) + first_token = None + + v = self.getNext() + if (v == None): + break + + if (v == 0x72): + self.doLoop72('number') + elif (v > 0) and (v < self.dict.getSize()) : + tag = self.procToken(self.dict.lookup(v)) + if len(tag) > 0 : + self.doc.append(tag) + else: + if self.debug: + print "Mina Loop: Unknown value: %x" % v + + + # now do snippet injection + if len(self.snippetList) > 0 : + if self.debug : print 'Injecting Snippets:' + snippet = self.injectSnippets(self.snippetList[0]) + snipno = snippet[0] + tag_add = snippet[1] + if self.debug : print self.formatTag(tag_add) + if len(tag_add) > 0: + self.doc.append(tag_add) + + # handle generation of xml output + xmlpage = self.formatDoc(self.flat_xml) + + return xmlpage + + + +def usage(): + print 'Usage: ' + print ' convert2xml.py dict0000.dat infile.dat ' + print ' ' + print ' Options:' + print ' -h print this usage help message ' + print ' -d turn on debug output to check for potential errors ' + print ' --flat-xml output the flattened xml page description only ' + print ' ' + print ' This program will attempt to convert a page*.dat file or ' + print ' glyphs*.dat file, using the dict0000.dat file, to its xml description. 
' + print ' ' + print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ' + print ' the *.dat files from a Topaz format e-book.' + +# +# Main +# + +def main(argv): + dictFile = "" + pageFile = "" + debug = False + flat_xml = False + printOutput = False + if len(argv) == 0: + printOutput = True + argv = sys.argv + else : + argv = argv.split() + + try: + opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"]) + + except getopt.GetoptError, err: + + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o =="-d": + debug=True + if o =="-h": + usage() + sys.exit(0) + if o =="--flat-xml": + flat_xml = True + + dictFile, pageFile = args[0], args[1] + + # read in the string table dictionary + dict = Dictionary(dictFile) + + # create a page parser + pp = PageParser(pageFile, dict, debug, flat_xml) + + xmlpage = pp.process() + + if printOutput: + print xmlpage + return 0 + + return xmlpage + +if __name__ == '__main__': + sys.exit(main('')) diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py new file mode 100644 index 0000000..f038310 --- /dev/null +++ b/Topaz_Tools/lib/decode_meta.py @@ -0,0 +1,109 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + +# +# Get a 7 bit encoded number from string +# + +def readEncodedNumber(file): + flag = False + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data == 0xFF: + flag = True + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data >= 0x80: + datax = (data & 0x7F) + while data >= 0x80 : + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + datax = (datax <<7) + (data & 0x7F) + data = datax + + if flag: + data = -data + return data + +# +# Encode a number in 7 bit format +# + +def encodeNumber(number): + result = "" + negative = False + flag = 0 + + if number < 0 : + number = -number + 1 + negative = True + + while True: + byte = number & 0x7F + number = number >> 7 + byte += flag + result += chr(byte) + flag = 0x80 + if number == 0 : break + + if negative: + result += chr(0xFF) + + return result[::-1] + +# +# Get a length prefixed string from the file +# +def lengthPrefixString(data): + return encodeNumber(len(data))+data + +def readString(file): + stringLength = readEncodedNumber(file) + if (stringLength == None): + return None + sv = file.read(stringLength) + if (len(sv) != stringLength): + return "" + return unpack(str(stringLength)+"s",sv)[0] + + + +def getMetaArray(metaFile): + # parse the meta file into a Python dictionary (associative array) + result = {} + fo = file(metaFile,'rb') + size = readEncodedNumber(fo) + for i in xrange(size): + temp = readString(fo) + result[temp] = readString(fo) + fo.close() + return result + + + +def getMetaData(metaFile): + # parse the meta file + result = '' + fo = file(metaFile,'rb') + size = readEncodedNumber(fo) + for i in xrange(size): + result += readString(fo) + '|' + result += readString(fo) + '\n' + fo.close() + return result diff --git 
a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py new file mode 100644 index 0000000..1a800e8 --- /dev/null +++ b/Topaz_Tools/lib/flatxml2html.py @@ -0,0 +1,299 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +class DocParser(object): + def __init__(self, flatxml, fileid): + self.id = os.path.basename(fileid).replace('.dat','') + self.flatdoc = flatxml.split('\n') + self.ocrtext = [] + self.link_id = [] + self.link_title = [] + self.link_page = [] + self.dehyphen_rootid = [] + self.paracont_stemid = [] + self.parastems_stemid = [] + + + + # find tag if within pos to end inclusive + def findinDoc(self, tagpath, pos, end) : + result = None + docList = self.flatdoc + cnt = len(docList) + if end == -1 : + end = cnt + else: + end = min(cnt,end) + foundat = -1 + for j in xrange(pos, end): + item = docList[j] + if item.find('=') >= 0: + (name, argres) = item.split('=') + else : + name = item + argres = '' + if name.endswith(tagpath) : + result = argres + foundat = j + break + return foundat, result + + + # return list of start positions for the tagpath + def posinDoc(self, tagpath): + startpos = [] + pos = 0 + res = "" + while res != None : + (foundpos, res) = self.findinDoc(tagpath, pos, -1) + if res != None : + startpos.append(foundpos) + pos = foundpos + 1 + return startpos + + + # get a description of the paragraph + def getParaDescription(self, start, end): + # normal paragraph + (pos, pclass) = self.findinDoc('paragraph.class',start,end) + + # class names are an issue given topaz starts them with numerals (not allowed) + # use a mix of cases, (which cause some browsers problems), and actually + # attach numbers after "reclustered*" to the end to deal with reflow issues + # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered + pclass = 
pclass.lower() + pclass = 'cl_' + pclass + p = pclass.find('reclustered') + if p > 0 : pclass = pclass[0:p+11] + + (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) + (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) + if (sfirst != None) and (slast != None) : + return pclass, int(sfirst), int(slast) + + # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well + # so walk through this region keeping track of the first firstword, and the last lastWord + # on any items that have it + (pos, sfirst) = self.findinDoc('firstWord',start, end) + first = int(sfirst) + last = -1 + for i in xrange(pos+1,end): + (pos, slast) = self.findinDoc('lastWord',i,i+1) + if slast != None: + last = int(slast) + return pclass, first, last + + + def buildParagraph(self, cname, first, last, type, regtype) : + parares = '' + sep ='' + br_lb = False + if (regtype == 'fixed') or (regtype == 'chapterheading') : + br_lb = True + handle_links = False + if len(self.link_id) > 0: + handle_links = True + if (type == 'full') or (type == 'begin') : + parares += '

' + if (type == 'end'): + parares += ' ' + for j in xrange(first, last) : + word = self.ocrtext[j] + sep = ' ' + + if handle_links: + link = self.link_id[j] + if (link > 0): + title = self.link_title[link-1] + if title == "": title='_link_' + ptarget = self.link_page[link-1] - 1 + linkhtml = '' % ptarget + linkhtml += title + '' + pos = parares.rfind(title) + if pos >= 0: + parares = parares[0:pos] + linkhtml + parares[pos+len(title):] + else : + parares += linkhtml + if word == '_link_' : word = '' + elif (link < 0) : + if word == '_link_' : word = '' + + if word == '_lb_': + if (j-1) in self.dehyphen_rootid : + word = '' + sep = '' + elif handle_links : + word = '' + sep = '' + elif br_lb : + word = '
\n' + sep = '' + else : + word = '\n' + sep = '' + + if j in self.dehyphen_rootid : + word = word[0:-1] + sep = '' + + parares += word + sep + + if len(sep) > 0 : parares = parares[0:-1] + if (type == 'full') or (type == 'end') : + parares += '

' + return parares + + + + # walk the document tree collecting the information needed + # to build an html page using the ocrText + + def process(self): + + htmlpage = '' + + # first collect information from the xml doc that describes this page + (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) + if argres : self.ocrtext = argres.split('|') + + (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1) + if argres: + argList = argres.split('|') + self.dehyphen_rootid = [ int(strval) for strval in argList] + + (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) + if self.parastems_stemid == None : self.parastems_stemid = [] + + (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) + if self.paracont_stemid == None : self.paracont_stemid = [] + + + (pos, argres) = self.findinDoc('info.word.link_id',0,-1) + if argres: + argList = argres.split('|') + self.link_id = [ int(strval) for strval in argList] + + (pos, argres) = self.findinDoc('info.links.page',0,-1) + if argres : + argList = argres.split('|') + self.link_page = [ int(strval) for strval in argList] + + (pos, argres) = self.findinDoc('info.links.title',0,-1) + if argres : + self.link_title = argres.split('|') + else: + self.link_title.append('') + + (pos, pagetype) = self.findinDoc('page.type',0,-1) + + + # generate a list of each region starting point + # each region has one paragraph,, or one image, or one chapterheading + regionList= self.posinDoc('region') + regcnt = len(regionList) + regionList.append(-1) + + anchorSet = False + breakSet = False + + # process each region tag and convert what you can to html + + for j in xrange(regcnt): + start = regionList[j] + end = regionList[j+1] + + (pos, regtype) = self.findinDoc('region.type',start,end) + + if regtype == 'graphic' : + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pos, simgsrc) = self.findinDoc('img.src',start,end) + if simgsrc: + htmlpage += '
' % int(simgsrc) + + elif regtype == 'chapterheading' : + (pclass, first, last) = self.getParaDescription(start,end) + if not breakSet: + htmlpage += '
 
\n' + breakSet = True + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + tag = 'h1' + if pclass[3:7] == 'ch1-' : tag = 'h1' + if pclass[3:7] == 'ch2-' : tag = 'h2' + if pclass[3:7] == 'ch3-' : tag = 'h3' + htmlpage += '<' + tag + ' class="' + pclass + '">' + htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype) + htmlpage += '' + + elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') : + ptype = 'full' + # check to see if this is a continution from the previous page + if (len(self.parastems_stemid) > 0): + ptype = 'end' + self.parastems_stemid=[] + else: + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pclass, first, last) = self.getParaDescription(start,end) + if ptype == 'full' : + tag = 'p' + if pclass[3:6] == 'h1-' : tag = 'h4' + if pclass[3:6] == 'h2-' : tag = 'h5' + if pclass[3:6] == 'h3-' : tag = 'h6' + htmlpage += '<' + tag + ' class="' + pclass + '">' + htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype) + htmlpage += '' + else : + htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) + + + elif (regtype == 'tocentry') : + ptype = 'full' + # check to see if this is a continution from the previous page + if (len(self.parastems_stemid) > 0) and (j == 0): + # process the first paragraph as a continuation from the last page + ptype = 'end' + self.parastems_stemid = [] + else: + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pclass, first, last) = self.getParaDescription(start,end) + htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) + + else : + print 'Unknown region type', regtype + print 'Warning: skipping this region' + + if len(self.paracont_stemid) > 0 : + if htmlpage[-4:] == '

': + htmlpage = htmlpage[0:-4] + + return htmlpage + + + return self.convert2HTML() + + + +def convert2HTML(flatxml, fileid): + + # create a document parser + dp = DocParser(flatxml, fileid) + + htmlpage = dp.process() + + return htmlpage diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py new file mode 100644 index 0000000..be50aae --- /dev/null +++ b/Topaz_Tools/lib/genhtml.py @@ -0,0 +1,125 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +import os, sys, getopt + +# local routines +import convert2xml +import flatxml2html +import decode_meta +import stylexml2css + + +def usage(): + print 'Usage: ' + print ' ' + print ' genhtml.py unencryptedBookDir' + print ' ' + + + +def main(argv): + bookDir = '' + + if len(argv) == 0: + argv = sys.argv + else : + argv = argv.split() + + try: + opts, args = getopt.getopt(argv[1:], "h:") + + except getopt.GetoptError, err: + print str(err) + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o =="-h": + usage() + sys.exit(0) + + bookDir = args[0] + + if not os.path.exists(bookDir) : + print "Can not find directory with unencrypted book" + sys.exit(-1) + + dictFile = os.path.join(bookDir,'dict0000.dat') + + if not os.path.exists(dictFile) : + print "Can not find dict0000.dat file" + sys.exit(-1) + + pageDir = os.path.join(bookDir,'page') + if not os.path.exists(pageDir) : + print "Can not find page directory in unencrypted book" + sys.exit(-1) + + imgDir = os.path.join(bookDir,'img') + if not os.path.exists(imgDir) : + print "Can not find image directory in unencrypted book" + sys.exit(-1) + + otherFile = os.path.join(bookDir,'other0000.dat') + if not os.path.exists(otherFile) : + print "Can not find other0000.dat in unencrypted book" + sys.exit(-1) + + metaFile = os.path.join(bookDir,'metadata0000.dat') + if not os.path.exists(metaFile) : + print "Can not find metadata0000.dat in unencrypted book" + sys.exit(-1) + + + 
htmlFileName = "book.html" + htmlstr = '\n' + + filenames = os.listdir(pageDir) + filenames = sorted(filenames) + + print 'Processing ... ' + + htmlstr += '\n' + + print ' ', 'metadata0000.dat' + fname = os.path.join(bookDir,'metadata0000.dat') + xname = os.path.join(bookDir, 'metadata.txt') + metastr = decode_meta.getMetaData(fname) + file(xname, 'wb').write(metastr) + meta_array = decode_meta.getMetaArray(fname) + htmlstr += '\n' + htmlstr += '\n' + + print ' ', 'other0000.dat' + fname = os.path.join(bookDir,'other0000.dat') + xname = os.path.join(bookDir, 'style.css') + xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + cssstr = '\n' + file(xname, 'wb').write(cssstr) + htmlstr += cssstr + htmlstr += '\n\n' + + for filename in filenames: + print ' ', filename + fname = os.path.join(pageDir,filename) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + htmlstr += flatxml2html.convert2HTML(flat_xml, fname) + + htmlstr += '\n\n' + + file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr) + print 'Processing Complete' + + return 0 + +if __name__ == '__main__': + sys.exit(main('')) + + diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py new file mode 100644 index 0000000..7df8043 --- /dev/null +++ b/Topaz_Tools/lib/gensvg.py @@ -0,0 +1,295 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +import os, sys, getopt + +# local routines +import convert2xml +import flatxml2html +import decode_meta + + +class GParser(object): + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + self.dpi = 1440 + self.gh = self.getData('info.glyph.h') + self.gw = self.getData('info.glyph.w') + self.guse = self.getData('info.glyph.use') + self.count = len(self.guse) + self.gvtx = self.getData('info.glyph.vtx') + self.glen = self.getData('info.glyph.len') + self.gdpi = self.getData('info.glyph.dpi') + self.vx = self.getData('info.vtx.x') + self.vy = self.getData('info.vtx.y') + self.vlen = self.getData('info.len.n') + self.glen.append(len(self.vlen)) + self.gvtx.append(len(self.vx)) + + def getData(self, path): + result = None + cnt = len(self.flatdoc) + for j in xrange(cnt): + item = self.flatdoc[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name == path): + result = argres + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result + + def getPath(self, gly): + path = '' + if (gly < 0) or (gly >= self.count): + return path + tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1] + ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1] + p = 0 + for k in xrange(self.glen[gly], self.glen[gly+1]): + if (p == 0): + zx = tx[0:self.vlen[k]+1] + zy = ty[0:self.vlen[k]+1] + else: + zx = tx[self.vlen[k-1]+1:self.vlen[k]+1] + zy = ty[self.vlen[k-1]+1:self.vlen[k]+1] + p += 1 + for j in xrange(0, len(zx)): + if (j == 0): + path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) + else: + path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) + path += 'z' + return path + +class PParser(object): + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + self.temp = [] + self.ph = 
self.getData('page.h')[0] + self.pw = self.getData('page.w')[0] + self.gx = self.getData('info.glyph.x') + self.gy = self.getData('info.glyph.y') + self.gid = self.getData('info.glyph.glyphID') + + def getData(self, path): + result = None + cnt = len(self.flatdoc) + for j in xrange(cnt): + item = self.flatdoc[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name.endswith(path)): + result = argres + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result + + def getDataTemp(self, path): + result = None + cnt = len(self.temp) + for j in xrange(cnt): + item = self.temp[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name.endswith(path)): + result = argres + self.temp.pop(j) + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result + + def getImages(self): + result = [] + self.temp = self.flatdoc + while (self.getDataTemp('region.img') != None): + h = self.getDataTemp('region.img.h')[0] + w = self.getDataTemp('region.img.w')[0] + x = self.getDataTemp('region.img.x')[0] + y = self.getDataTemp('region.img.y')[0] + src = self.getDataTemp('region.img.src')[0] + result.append('\n' % (src, x, y, w, h)) + return result + + def getGlyphs(self,glyfname): + result = [] + if (self.gid != None) and (len(self.gid) > 0): + glyphs = [] + for j in set(self.gid): + glyphs.append(j) + glyphs.sort() + gfile = open(glyfname, 'r') + j = 0 + while True : + inp = gfile.readline() + if (inp == ''): + break + id='id="gl%d"' % glyphs[j] + if (inp.find(id) > 0): + result.append(inp) + j += 1 + if (j == len(glyphs)): + break + gfile.close() + return result + + + + +def usage(): + print 'Usage: ' + print ' ' + print ' gensvg.py unencryptedBookDir' + print ' ' + + +def main(argv): + bookDir = '' + + if len(argv) == 0: + argv = 
sys.argv + else : + argv = argv.split() + + try: + opts, args = getopt.getopt(argv[1:], "h:") + + except getopt.GetoptError, err: + print str(err) + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o =="-h": + usage() + sys.exit(0) + + bookDir = args[0] + + if not os.path.exists(bookDir) : + print "Can not find directory with unencrypted book" + sys.exit(-1) + + dictFile = os.path.join(bookDir,'dict0000.dat') + + if not os.path.exists(dictFile) : + print "Can not find dict0000.dat file" + sys.exit(-1) + + pageDir = os.path.join(bookDir,'page') + if not os.path.exists(pageDir) : + print "Can not find page directory in unencrypted book" + sys.exit(-1) + + imgDir = os.path.join(bookDir,'img') + if not os.path.exists(imgDir) : + print "Can not find image directory in unencrypted book" + sys.exit(-1) + + glyphsDir = os.path.join(bookDir,'glyphs') + if not os.path.exists(glyphsDir) : + print "Can not find glyphs directory in unencrypted book" + sys.exit(-1) + + metaFile = os.path.join(bookDir,'metadata0000.dat') + if not os.path.exists(metaFile) : + print "Can not find metadata0000.dat in unencrypted book" + sys.exit(-1) + + svgDir = os.path.join(bookDir,'svg') + if not os.path.exists(svgDir) : + os.makedirs(svgDir) + + + print 'Processing Meta Data ... ' + + print ' ', 'metadata0000.dat' + fname = os.path.join(bookDir,'metadata0000.dat') + metadata = decode_meta.getMetaArray(fname) + + print 'Processing Glyphs ... 
' + + filenames = os.listdir(glyphsDir) + filenames = sorted(filenames) + + glyfname = os.path.join(svgDir,'glyphs.svg') + glyfile = open(glyfname, 'w') + glyfile.write('\n') + glyfile.write('\n') + glyfile.write('\n') + glyfile.write('Glyphs for %s\n' % metadata['Title']) + glyfile.write('\n') + counter = 0 + for filename in filenames: + print ' ', filename + fname = os.path.join(glyphsDir,filename) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + gp = GParser(flat_xml) + for i in xrange(0, gp.count): + path = gp.getPath(i) + glyfile.write('\n' % (counter * 256 + i, path)) + counter += 1 + glyfile.write('\n') + glyfile.write('\n') + glyfile.close() + + print 'Processing Pages ... ' + + scaledpi = 720 + filenames = os.listdir(pageDir) + filenames = sorted(filenames) + counter = 0 + for filename in filenames: + print ' ', filename + fname = os.path.join(pageDir,filename) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + pp = PParser(flat_xml) + pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') + pfile.write('\n') + pfile.write('\n') + pfile.write('\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) + pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) + if (pp.gid != None): + pfile.write('\n') + gdefs = pp.getGlyphs(glyfname) + for j in xrange(0,len(gdefs)): + pfile.write(gdefs[j]) + pfile.write('\n') + for j in xrange(0,len(pp.gid)): + pfile.write('\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) + img = pp.getImages() + if (img != None): + for j in xrange(0,len(img)): + pfile.write(img[j]) + pfile.write('') + pfile.close() + counter += 1 + + print 'Processing Complete' + + return 0 + +if __name__ == '__main__': + sys.exit(main('')) diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py new file mode 100644 index 0000000..c335e88 --- /dev/null +++ b/Topaz_Tools/lib/genxml.py @@ -0,0 +1,121 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +import os, sys, getopt + +# local routines +import convert2xml +import flatxml2html +import decode_meta + + +def usage(): + print 'Usage: ' + print ' ' + print ' genxml.py dict0000.dat unencryptedBookDir' + print ' ' + + + +def main(argv): + bookDir = '' + + if len(argv) == 0: + argv = sys.argv + else : + argv = argv.split() + + try: + opts, args = getopt.getopt(argv[1:], "h:") + + except getopt.GetoptError, err: + print str(err) + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o =="-h": + usage() + sys.exit(0) + + bookDir = args[0] + + if not os.path.exists(bookDir) : + print "Can not find directory with unencrypted book" + sys.exit(-1) + + dictFile = os.path.join(bookDir,'dict0000.dat') + if not os.path.exists(dictFile) : + print "Can not find dict0000.dat file" + sys.exit(-1) + + pageDir = os.path.join(bookDir,'page') + if not os.path.exists(pageDir) : + print "Can not find page directory in unencrypted book" + sys.exit(-1) + + glyphsDir = os.path.join(bookDir,'glyphs') + if not os.path.exists(glyphsDir) : + print "Can not find glyphs directory in unencrypted book" + sys.exit(-1) + + otherFile = os.path.join(bookDir,'other0000.dat') + if not os.path.exists(otherFile) : + print "Can not find other0000.dat in unencrypted book" + sys.exit(-1) + + metaFile = os.path.join(bookDir,'metadata0000.dat') + if not os.path.exists(metaFile) : + print "Can not find metadata0000.dat in unencrypted book" + sys.exit(-1) + + xmlDir = os.path.join(bookDir,'xml') + if not os.path.exists(xmlDir): + os.makedirs(xmlDir) + + + print 'Processing ... 
' + + print ' ', 'metadata0000.dat' + fname = os.path.join(bookDir,'metadata0000.dat') + xname = os.path.join(xmlDir, 'metadata.txt') + metastr = decode_meta.getMetaData(fname) + file(xname, 'wb').write(metastr) + + print ' ', 'other0000.dat' + fname = os.path.join(bookDir,'other0000.dat') + xname = os.path.join(xmlDir, 'stylesheet.xml') + xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname) + file(xname, 'wb').write(xmlstr) + + filenames = os.listdir(pageDir) + filenames = sorted(filenames) + + for filename in filenames: + print ' ', filename + fname = os.path.join(pageDir,filename) + xname = os.path.join(xmlDir, filename.replace('.dat','.xml')) + xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname) + file(xname, 'wb').write(xmlstr) + + filenames = os.listdir(glyphsDir) + filenames = sorted(filenames) + + for filename in filenames: + print ' ', filename + fname = os.path.join(glyphsDir,filename) + xname = os.path.join(xmlDir, filename.replace('.dat','.xml')) + xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname) + file(xname, 'wb').write(xmlstr) + + + print 'Processing Complete' + + return 0 + +if __name__ == '__main__': + sys.exit(main('')) diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt new file mode 100644 index 0000000..4a79d20 --- /dev/null +++ b/Topaz_Tools/lib/readme.txt @@ -0,0 +1,75 @@ +This is experimental and it will probably not work for you but... + +ALSO: Please do not use any of this to steal. Theft is wrong. + This is meant to allow conversion of Topaz books for other book readers you own + +Here are the steps: + +1. Unzip the topazscripts.zip file to get the full set of python scripts. 
The files you should have after unzipping are:

cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg graphic of each page

Please note, gensvg.py, genhtml.py, and genxml.py import and use
decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py,
so please keep all of these python scripts together in the same place.



2. Remove the DRM from the Topaz book and build a directory
of its contents as files

All thanks go to CMBTC, who broke the DRM for Topaz - without that, nothing else
would be possible

   cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE

This should create a directory called "TARGETDIR" in your current directory.
It should have the following files in it:

metadata0000.dat - metadata info
other0000.dat - information used to create a style sheet
dict0000.dat - dictionary of words used to build page descriptions
page - directory filled with page*.dat files
glyphs - directory filled with glyphs*.dat files



3. Convert the files in "TARGETDIR" to their xml descriptions,
which can be found in TARGETDIR/xml/ upon completion.

   genxml.py TARGETDIR



4. Create book.html, which can be found in "TARGETDIR" after
completion. 
This html conversion can not fully capture +all of the layouts actually used in the book and needs to +be edited to include special font handling such as bold +or italics that can not be determined from the ocrText +information or the style information. If you want to +see things exactly as they were, see step 5 below. + + genhtml.py TARGETDIR + + + +5. Create an svg description of each page which can +be found in TARGETDIR/svg/ upon completion. + +All thanks go to CLARKNOVA for this program. This program is +needed to actually see the true image of each page so that hand +editing of the html created by step 4 can be done. + +Or use the resulting svg files to read each page of the book +exactly as it has been laid out originally. + + gensvg.py TARGETDIR + diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py new file mode 100644 index 0000000..cf02984 --- /dev/null +++ b/Topaz_Tools/lib/stylexml2css.py @@ -0,0 +1,221 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +class DocParser(object): + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + + stags = { + 'paragraph' : 'p', + 'graphic' : '.graphic' + } + + attr_val_map = { + 'hang' : ('text-indent: ', 135), + 'indent' : ('text-indent: ', 135), + 'line-space' : ('line-height: ', 190), + 'margin-bottom' : ('margin-bottom: ', 135), + 'margin-left' : ('margin-left: ', 135), + 'margin-right' : ('margin-right: ', 135), + 'margin-top' : ('margin-top: ', 135), + 'space-after' : ('padding-bottom: ', 135), + } + + attr_str_map = { + 'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', + 'align-left' : 'text-align: left;', + 'align-right' : 'text-align: right;', + 'align-justify' : 'text-align: justify;', + 'display-inline' : 'display: inline;', + 'pos-left' : 'text-align: left;', + 
'pos-right' : 'text-align: right;', + 'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', + } + + + # find tag if within pos to end inclusive + def findinDoc(self, tagpath, pos, end) : + result = None + docList = self.flatdoc + cnt = len(docList) + if end == -1 : + end = cnt + else: + end = min(cnt,end) + foundat = -1 + for j in xrange(pos, end): + item = docList[j] + if item.find('=') >= 0: + (name, argres) = item.split('=') + else : + name = item + argres = '' + if name.endswith(tagpath) : + result = argres + foundat = j + break + return foundat, result + + + # return list of start positions for the tagpath + def posinDoc(self, tagpath): + startpos = [] + pos = 0 + res = "" + while res != None : + (foundpos, res) = self.findinDoc(tagpath, pos, -1) + if res != None : + startpos.append(foundpos) + pos = foundpos + 1 + return startpos + + + def process(self): + + csspage = '' + + # generate a list of each