From 0a437510f60534ddb860eb29fe4f80474cfe8ec5 Mon Sep 17 00:00:00 2001
From: some_updates
Date: Sun, 17 Jan 2010 12:10:35 +0000
Subject: [PATCH] topaz tools 1.0 (I think)
---
Topaz_Tools/lib/cmbtc_dump.py | 865 ++++++++++++++++++++++++++++++++
Topaz_Tools/lib/convert2xml.py | 821 ++++++++++++++++++++++++++++++
Topaz_Tools/lib/decode_meta.py | 109 ++++
Topaz_Tools/lib/flatxml2html.py | 299 +++++++++++
Topaz_Tools/lib/genhtml.py | 125 +++++
Topaz_Tools/lib/gensvg.py | 295 +++++++++++
Topaz_Tools/lib/genxml.py | 121 +++++
Topaz_Tools/lib/readme.txt | 75 +++
Topaz_Tools/lib/stylexml2css.py | 221 ++++++++
9 files changed, 2931 insertions(+)
create mode 100644 Topaz_Tools/lib/cmbtc_dump.py
create mode 100644 Topaz_Tools/lib/convert2xml.py
create mode 100644 Topaz_Tools/lib/decode_meta.py
create mode 100644 Topaz_Tools/lib/flatxml2html.py
create mode 100644 Topaz_Tools/lib/genhtml.py
create mode 100644 Topaz_Tools/lib/gensvg.py
create mode 100644 Topaz_Tools/lib/genxml.py
create mode 100644 Topaz_Tools/lib/readme.txt
create mode 100644 Topaz_Tools/lib/stylexml2css.py
diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py
new file mode 100644
index 0000000..9cd32de
--- /dev/null
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@@ -0,0 +1,865 @@
+#! /usr/bin/python
+
+"""
+
+Comprehensive Mazama Book DRM with Topaz Cryptography V2.0
+
+-----BEGIN PUBLIC KEY-----
+MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6
+M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC
+B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx
+y2/pHuYme7U1TsgSjwIDAQAB
+-----END PUBLIC KEY-----
+
+"""
+
+from __future__ import with_statement
+
+import csv
+import sys
+import os
+import getopt
+import zlib
+from struct import pack
+from struct import unpack
+from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
+ create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
+ string_at, Structure, c_void_p, cast
+import _winreg as winreg
+import Tkinter
+import Tkconstants
+import tkMessageBox
+import traceback
+import hashlib
+
+MAX_PATH = 255
+
+kernel32 = windll.kernel32
+advapi32 = windll.advapi32
+crypt32 = windll.crypt32
+
+global kindleDatabase
+global bookFile
+global bookPayloadOffset
+global bookHeaderRecords
+global bookMetadata
+global bookKey
+global command
+
+#
+# Various character maps used to decrypt books. Probably supposed to act as obfuscation
+#
+
+charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
+charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
+charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
+
+#
+# Exceptions for all the problems that might happen during the script
+#
+
+class CMBDTCError(Exception):
+ pass
+
+class CMBDTCFatal(Exception):
+ pass
+
+#
+# Stolen stuff
+#
+
+class DataBlob(Structure):
+ _fields_ = [('cbData', c_uint),
+ ('pbData', c_void_p)]
+DataBlob_p = POINTER(DataBlob)
+
+def GetSystemDirectory():
+ GetSystemDirectoryW = kernel32.GetSystemDirectoryW
+ GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
+ GetSystemDirectoryW.restype = c_uint
+ def GetSystemDirectory():
+ buffer = create_unicode_buffer(MAX_PATH + 1)
+ GetSystemDirectoryW(buffer, len(buffer))
+ return buffer.value
+ return GetSystemDirectory
+GetSystemDirectory = GetSystemDirectory()
+
+
+def GetVolumeSerialNumber():
+ GetVolumeInformationW = kernel32.GetVolumeInformationW
+ GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
+ POINTER(c_uint), POINTER(c_uint),
+ POINTER(c_uint), c_wchar_p, c_uint]
+ GetVolumeInformationW.restype = c_uint
+ def GetVolumeSerialNumber(path):
+ vsn = c_uint(0)
+ GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
+ return vsn.value
+ return GetVolumeSerialNumber
+GetVolumeSerialNumber = GetVolumeSerialNumber()
+
+
+def GetUserName():
+ GetUserNameW = advapi32.GetUserNameW
+ GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
+ GetUserNameW.restype = c_uint
+ def GetUserName():
+ buffer = create_unicode_buffer(32)
+ size = c_uint(len(buffer))
+ while not GetUserNameW(buffer, byref(size)):
+ buffer = create_unicode_buffer(len(buffer) * 2)
+ size.value = len(buffer)
+ return buffer.value.encode('utf-16-le')[::2]
+ return GetUserName
+GetUserName = GetUserName()
+
+
+def CryptUnprotectData():
+ _CryptUnprotectData = crypt32.CryptUnprotectData
+ _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
+ c_void_p, c_void_p, c_uint, DataBlob_p]
+ _CryptUnprotectData.restype = c_uint
+ def CryptUnprotectData(indata, entropy):
+ indatab = create_string_buffer(indata)
+ indata = DataBlob(len(indata), cast(indatab, c_void_p))
+ entropyb = create_string_buffer(entropy)
+ entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
+ outdata = DataBlob()
+ if not _CryptUnprotectData(byref(indata), None, byref(entropy),
+ None, None, 0, byref(outdata)):
+ raise CMBDTCFatal("Failed to Unprotect Data")
+ return string_at(outdata.pbData, outdata.cbData)
+ return CryptUnprotectData
+CryptUnprotectData = CryptUnprotectData()
+
+#
+# Returns the MD5 digest of "message"
+#
+
+def MD5(message):
+ ctx = hashlib.md5()
+ ctx.update(message)
+ return ctx.digest()
+
+#
+# Returns the SHA-1 digest of "message"
+#
+
+def SHA1(message):
+ ctx = hashlib.sha1()
+ ctx.update(message)
+ return ctx.digest()
+
+#
+# Open the book file at path
+#
+
+def openBook(path):
+ try:
+ return open(path,'rb')
+ except:
+ raise CMBDTCFatal("Could not open book file: " + path)
+#
+# Encode the bytes in data with the characters in map
+#
+
+def encode(data, map):
+ result = ""
+ for char in data:
+ value = ord(char)
+ Q = (value ^ 0x80) // len(map)
+ R = value % len(map)
+ result += map[Q]
+ result += map[R]
+ return result
+
+#
+# Hash the bytes in data and then encode the digest with the characters in map
+#
+
+def encodeHash(data,map):
+ return encode(MD5(data),map)
+
+#
+# Decode the string in data with the characters in map. Returns the decoded bytes
+#
+
+def decode(data,map):
+ result = ""
+ for i in range (0,len(data),2):
+ high = map.find(data[i])
+ low = map.find(data[i+1])
+ value = (((high * 0x40) ^ 0x80) & 0xFF) + low
+ result += pack("B",value)
+ return result
+
+#
+# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
+#
+
+def openKindleInfo():
+ regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
+ path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
+ return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
+
+#
+# Parse the Kindle.info file and return the records as a list of key-values
+#
+
+def parseKindleInfo():
+ DB = {}
+ infoReader = openKindleInfo()
+ infoReader.read(1)
+ data = infoReader.read()
+ items = data.split('{')
+
+ for item in items:
+ splito = item.split(':')
+ DB[splito[0]] =splito[1]
+ return DB
+
+#
+# Find if the original string for a hashed/encoded string is known. If so, return the original string; otherwise return an empty string. (Totally not optimal) NOTE(review): the body returns `name` rather than `result`, so a failed lookup returns the last candidate instead of "" — confirm and fix.
+#
+
+def findNameForHash(hash):
+ names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
+ result = ""
+ for name in names:
+ if hash == encodeHash(name, charMap2):
+ result = name
+ break
+ return name
+
+#
+# Print all the records from the kindle.info file (option -i)
+#
+
+def printKindleInfo():
+ for record in kindleDatabase:
+ name = findNameForHash(record)
+ if name != "" :
+ print (name)
+ print ("--------------------------\n")
+ else :
+ print ("Unknown Record")
+ print getKindleInfoValueForHash(record)
+ print "\n"
+#
+# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
+#
+
+def getKindleInfoValueForHash(hashedKey):
+ global kindleDatabase
+ encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
+ return CryptUnprotectData(encryptedValue,"")
+
+#
+# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
+#
+
+def getKindleInfoValueForKey(key):
+ return getKindleInfoValueForHash(encodeHash(key,charMap2))
+
+#
+# Get a 7 bit encoded number from the book file
+#
+
+def bookReadEncodedNumber():
+ flag = False
+ data = ord(bookFile.read(1))
+
+ if data == 0xFF:
+ flag = True
+ data = ord(bookFile.read(1))
+
+ if data >= 0x80:
+ datax = (data & 0x7F)
+ while data >= 0x80 :
+ data = ord(bookFile.read(1))
+ datax = (datax <<7) + (data & 0x7F)
+ data = datax
+
+ if flag:
+ data = -data
+ return data
+
+#
+# Encode a number in 7 bit format
+#
+
+def encodeNumber(number):
+ result = ""
+ negative = False
+ flag = 0
+
+ if number < 0 :
+ number = -number + 1
+ negative = True
+
+ while True:
+ byte = number & 0x7F
+ number = number >> 7
+ byte += flag
+ result += chr(byte)
+ flag = 0x80
+ if number == 0 : break
+
+ if negative:
+ result += chr(0xFF)
+
+ return result[::-1]
+
+#
+# Get a length prefixed string from the file
+#
+
+def bookReadString():
+ stringLength = bookReadEncodedNumber()
+ return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
+
+#
+# Returns a length prefixed string
+#
+
+def lengthPrefixString(data):
+ return encodeNumber(len(data))+data
+
+
+#
+# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
+#
+
+def bookReadHeaderRecordData():
+ nbValues = bookReadEncodedNumber()
+ values = []
+ for i in range (0,nbValues):
+ values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
+ return values
+
+#
+# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
+#
+
+def parseTopazHeaderRecord():
+ if ord(bookFile.read(1)) != 0x63:
+ raise CMBDTCFatal("Parse Error : Invalid Header")
+
+ tag = bookReadString()
+ record = bookReadHeaderRecordData()
+ return [tag,record]
+
+#
+# Parse the header of a Topaz file, get all the header records and the offset for the payload
+#
+
+def parseTopazHeader():
+ global bookHeaderRecords
+ global bookPayloadOffset
+ magic = unpack("4s",bookFile.read(4))[0]
+
+ if magic != 'TPZ0':
+ raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
+
+ nbRecords = bookReadEncodedNumber()
+ bookHeaderRecords = {}
+
+ for i in range (0,nbRecords):
+ result = parseTopazHeaderRecord()
+ print result[0], result[1]
+ bookHeaderRecords[result[0]] = result[1]
+
+ if ord(bookFile.read(1)) != 0x64 :
+ raise CMBDTCFatal("Parse Error : Invalid Header")
+
+ bookPayloadOffset = bookFile.tell()
+
+#
+# Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
+# Correction, the record is correctly decompressed too
+#
+
+def getBookPayloadRecord(name, index):
+ encrypted = False
+ compressed = False
+
+ try:
+ recordOffset = bookHeaderRecords[name][index][0]
+ except:
+ raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
+
+ bookFile.seek(bookPayloadOffset + recordOffset)
+
+ tag = bookReadString()
+ if tag != name :
+ raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
+
+ recordIndex = bookReadEncodedNumber()
+
+ if recordIndex < 0 :
+ encrypted = True
+ recordIndex = -recordIndex -1
+
+ if recordIndex != index :
+ raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
+
+ if (bookHeaderRecords[name][index][2] > 0):
+ compressed = True
+ record = bookFile.read(bookHeaderRecords[name][index][2])
+ else:
+ record = bookFile.read(bookHeaderRecords[name][index][1])
+
+ if encrypted:
+ ctx = topazCryptoInit(bookKey)
+ record = topazCryptoDecrypt(record,ctx)
+
+ if compressed:
+ record = zlib.decompress(record)
+
+ return record
+
+#
+# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
+#
+
+def extractBookPayloadRecord(name, index, filename):
+ compressed = False
+
+ try:
+ compressed = bookHeaderRecords[name][index][2] != 0
+ record = getBookPayloadRecord(name,index)
+ except:
+ print("Could not find record")
+
+ # if compressed:
+ # try:
+ # record = zlib.decompress(record)
+ # except:
+ # raise CMBDTCFatal("Could not decompress record")
+
+ if filename != "":
+ try:
+ file = open(filename,"wb")
+ file.write(record)
+ file.close()
+ except:
+ raise CMBDTCFatal("Could not write to destination file")
+ else:
+ print(record)
+
+#
+# return next record [key,value] from the book metadata from the current book position
+#
+
+def readMetadataRecord():
+ return [bookReadString(),bookReadString()]
+
+#
+# Parse the metadata record from the book payload and return a list of [key,values]
+#
+
+def parseMetadata():
+ global bookHeaderRecords
+ global bookPayloadAddress
+ global bookMetadata
+ bookMetadata = {}
+ bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
+ tag = bookReadString()
+ if tag != "metadata" :
+ raise CMBDTCFatal("Parse Error : Record Names Don't Match")
+
+ flags = ord(bookFile.read(1))
+ nbRecords = ord(bookFile.read(1))
+
+ for i in range (0,nbRecords) :
+ record =readMetadataRecord()
+ bookMetadata[record[0]] = record[1]
+
+#
+# Returns the two bits at the given offset from a bit field
+#
+
+def getTwoBitsFromBitField(bitField,offset):
+ byteNumber = offset // 4
+ bitPosition = 6 - 2*(offset % 4)
+
+ return ord(bitField[byteNumber]) >> bitPosition & 3
+
+#
+# Returns the six bits at offset from a bit field
+#
+
+def getSixBitsFromBitField(bitField,offset):
+ offset *= 3
+ value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2)
+ return value
+
+#
+# 8-bit to 6-bit encoding of the hash to generate the PID string
+#
+
+def encodePID(hash):
+ global charMap3
+ PID = ""
+ for position in range (0,8):
+ PID += charMap3[getSixBitsFromBitField(hash,position)]
+ return PID
+
+#
+# Context initialisation for the Topaz Crypto
+#
+
+def topazCryptoInit(key):
+ ctx1 = 0x0CAFFE19E
+
+ for keyChar in key:
+ keyByte = ord(keyChar)
+ ctx2 = ctx1
+ ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
+ return [ctx1,ctx2]
+
+#
+# decrypt data with the context prepared by topazCryptoInit()
+#
+
+def topazCryptoDecrypt(data, ctx):
+ ctx1 = ctx[0]
+ ctx2 = ctx[1]
+
+ plainText = ""
+
+ for dataChar in data:
+ dataByte = ord(dataChar)
+ m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
+ ctx2 = ctx1
+ ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
+ plainText += chr(m)
+
+ return plainText
+
+#
+# Decrypt a payload record with the PID
+#
+
+def decryptRecord(data,PID):
+ ctx = topazCryptoInit(PID)
+ return topazCryptoDecrypt(data, ctx)
+
+#
+# Try to decrypt a dkey record (contains the book PID)
+#
+
+def decryptDkeyRecord(data,PID):
+ record = decryptRecord(data,PID)
+ fields = unpack("3sB8sB8s3s",record)
+
+ if fields[0] != "PID" or fields[5] != "pid" :
+ raise CMBDTCError("Didn't find PID magic numbers in record")
+ elif fields[1] != 8 or fields[3] != 8 :
+ raise CMBDTCError("Record didn't contain correct length fields")
+ elif fields[2] != PID :
+ raise CMBDTCError("Record didn't contain PID")
+
+ return fields[4]
+
+#
+# Decrypt all the book's dkey records (contain the book PID)
+#
+
+def decryptDkeyRecords(data,PID):
+ nbKeyRecords = ord(data[0])
+ records = []
+ data = data[1:]
+ for i in range (0,nbKeyRecords):
+ length = ord(data[0])
+ try:
+ key = decryptDkeyRecord(data[1:length+1],PID)
+ records.append(key)
+ except CMBDTCError:
+ pass
+ data = data[1+length:]
+
+ return records
+
+#
+# Encryption table used to generate the device PID
+#
+
+def generatePidEncryptionTable() :
+ table = []
+ for counter1 in range (0,0x100):
+ value = counter1
+ for counter2 in range (0,8):
+ if (value & 1 == 0) :
+ value = value >> 1
+ else :
+ value = value >> 1
+ value = value ^ 0xEDB88320
+ table.append(value)
+ return table
+
+#
+# Seed value used to generate the device PID
+#
+
+def generatePidSeed(table,dsn) :
+ value = 0
+ for counter in range (0,4) :
+ index = (ord(dsn[counter]) ^ value) &0xFF
+ value = (value >> 8) ^ table[index]
+ return value
+
+#
+# Generate the device PID
+#
+
+def generateDevicePID(table,dsn,nbRoll):
+ seed = generatePidSeed(table,dsn)
+ pidAscii = ""
+ pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
+ index = 0
+
+ for counter in range (0,nbRoll):
+ pid[index] = pid[index] ^ ord(dsn[counter])
+ index = (index+1) %8
+
+ for counter in range (0,8):
+ index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
+ pidAscii += charMap4[index]
+ return pidAscii
+
+#
+# Create decrypted book payload
+#
+
+def createDecryptedPayload(payload):
+ for headerRecord in bookHeaderRecords:
+ name = headerRecord
+ if name != "dkey" :
+ ext = '.dat'
+ if name == 'img' : ext = '.jpg'
+ for index in range (0,len(bookHeaderRecords[name])) :
+ fnum = "%04d" % index
+ fname = name + fnum + ext
+ destdir = payload
+ if name == 'img':
+ destdir = os.path.join(payload,'img')
+ if name == 'page':
+ destdir = os.path.join(payload,'page')
+ if name == 'glyphs':
+ destdir = os.path.join(payload,'glyphs')
+ outputFile = os.path.join(destdir,fname)
+ file(outputFile, 'wb').write(getBookPayloadRecord(name, index))
+
+
+# Create decrypted book
+#
+
+def createDecryptedBook(outdir):
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ destdir = os.path.join(outdir,'img')
+ if not os.path.exists(destdir):
+ os.makedirs(destdir)
+
+ destdir = os.path.join(outdir,'page')
+ if not os.path.exists(destdir):
+ os.makedirs(destdir)
+
+ destdir = os.path.join(outdir,'glyphs')
+ if not os.path.exists(destdir):
+ os.makedirs(destdir)
+
+ createDecryptedPayload(outdir)
+
+
+#
+# Set the command to execute by the program according to command-line parameters
+#
+
+def setCommand(name) :
+ global command
+ if command != "" :
+ raise CMBDTCFatal("Invalid command line parameters")
+ else :
+ command = name
+
+#
+# Program usage
+#
+
+def usage():
+ print("\nUsage:")
+ print("\ncmbtc_dump.py [options] bookFileName\n")
+ print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
+ print("-d Dumps the unencrypted book as files to outdir")
+ print("-o Output directory to save book files to")
+ print("-v Verbose (can be used several times)")
+ print("-i Prints kindle.info database")
+
+#
+# Main
+#
+
+def main(argv=sys.argv):
+ global kindleDatabase
+ global bookMetadata
+ global bookKey
+ global bookFile
+ global command
+
+ progname = os.path.basename(argv[0])
+
+ verbose = 0
+ recordName = ""
+ recordIndex = 0
+ outdir = ""
+ PIDs = []
+ kindleDatabase = None
+ command = ""
+
+
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "vi:o:p:d")
+ except getopt.GetoptError, err:
+ # print help information and exit:
+ print str(err) # will print something like "option -a not recognized"
+ usage()
+ sys.exit(2)
+
+ if len(opts) == 0 and len(args) == 0 :
+ usage()
+ sys.exit(2)
+
+ for o, a in opts:
+ if o == "-v":
+ verbose+=1
+ if o == "-i":
+ setCommand("printInfo")
+ if o =="-o":
+ if a == None :
+ raise CMBDTCFatal("Invalid parameter for -o")
+ outdir = a
+ if o =="-p":
+ PIDs.append(a)
+ if o =="-d":
+ setCommand("doit")
+
+ if command == "" :
+ raise CMBDTCFatal("No action supplied on command line")
+
+ #
+ # Read the encrypted database
+ #
+
+ try:
+ kindleDatabase = parseKindleInfo()
+ except Exception as message:
+ if verbose>0:
+ print(message)
+
+ if kindleDatabase != None :
+ if command == "printInfo" :
+ printKindleInfo()
+
+ #
+ # Compute the DSN
+ #
+
+ # Get the Mazama Random number
+ MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
+
+ # Get the HDD serial
+ encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1)
+
+ # Get the current user name
+ encodedUsername = encodeHash(GetUserName(),charMap1)
+
+ # concat, hash and encode
+ DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
+
+ if verbose >1:
+ print("DSN: " + DSN)
+
+ #
+ # Compute the device PID
+ #
+
+ table = generatePidEncryptionTable()
+ devicePID = generateDevicePID(table,DSN,4)
+ PIDs.append(devicePID)
+
+ if verbose > 0:
+ print("Device PID: " + devicePID)
+
+ #
+ # Open book and parse metadata
+ #
+
+ if len(args) == 1:
+
+ bookFile = openBook(args[0])
+ parseTopazHeader()
+ parseMetadata()
+
+ #
+ # Compute book PID
+ #
+
+ # Get the account token
+
+ if kindleDatabase != None:
+ kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
+
+ if verbose >1:
+ print("Account Token: " + kindleAccountToken)
+
+ keysRecord = bookMetadata["keys"]
+ keysRecordRecord = bookMetadata[keysRecord]
+
+ pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord)
+
+ bookPID = encodePID(pidHash)
+ PIDs.append(bookPID)
+
+ if verbose > 0:
+ print ("Book PID: " + bookPID )
+
+ #
+ # Decrypt book key
+ #
+
+ dkey = getBookPayloadRecord('dkey', 0)
+
+ bookKeys = []
+ for PID in PIDs :
+ bookKeys+=decryptDkeyRecords(dkey,PID)
+
+ if len(bookKeys) == 0 :
+ if verbose > 0 :
+ print ("Book key could not be found. Maybe this book is not registered with this device.")
+ else :
+ bookKey = bookKeys[0]
+ if verbose > 0:
+ print("Book key: " + bookKey.encode('hex'))
+
+
+
+ if command == "printRecord" :
+ extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
+ if outputFile != "" and verbose>0 :
+ print("Wrote record to file: "+outputFile)
+ elif command == "doit" :
+ if outdir != "" :
+ createDecryptedBook(outdir)
+ if verbose >0 :
+ print ("Decrypted book saved. Don't pirate!")
+ elif verbose > 0:
+ print("Output directory name was not supplied.")
+
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py
new file mode 100644
index 0000000..86d08d4
--- /dev/null
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -0,0 +1,821 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+# Get a 7 bit encoded number from string. The most
+# significant byte comes first and has the high bit (8th) set
+
+def readEncodedNumber(file):
+ flag = False
+ c = file.read(1)
+ if (len(c) == 0):
+ return None
+ data = ord(c)
+
+ if data == 0xFF:
+ flag = True
+ c = file.read(1)
+ if (len(c) == 0):
+ return None
+ data = ord(c)
+
+ if data >= 0x80:
+ datax = (data & 0x7F)
+ while data >= 0x80 :
+ c = file.read(1)
+ if (len(c) == 0):
+ return None
+ data = ord(c)
+ datax = (datax <<7) + (data & 0x7F)
+ data = datax
+
+ if flag:
+ data = -data
+ return data
+
+
+# returns a binary string that encodes a number into 7 bits
+# most significant byte first which has the high bit set
+
+def encodeNumber(number):
+ result = ""
+ negative = False
+ flag = 0
+
+ if number < 0 :
+ number = -number + 1
+ negative = True
+
+ while True:
+ byte = number & 0x7F
+ number = number >> 7
+ byte += flag
+ result += chr(byte)
+ flag = 0x80
+ if number == 0 : break
+
+ if negative:
+ result += chr(0xFF)
+
+ return result[::-1]
+
+
+
+# create / read a length prefixed string from the file
+
+def lengthPrefixString(data):
+ return encodeNumber(len(data))+data
+
+def readString(file):
+ stringLength = readEncodedNumber(file)
+ if (stringLength == None):
+ return ""
+ sv = file.read(stringLength)
+ if (len(sv) != stringLength):
+ return ""
+ return unpack(str(stringLength)+"s",sv)[0]
+
+
+# convert a binary string generated by encodeNumber (7 bit encoded number)
+# to the value you would find inside the page*.dat files to be processed
+
+def convert(i):
+ result = ''
+ val = encodeNumber(i)
+ for j in xrange(len(val)):
+ c = ord(val[j:j+1])
+ result += '%02x' % c
+ return result
+
+
+
+# the complete string table used to store all book text content
+# as well as the xml tokens and values that make sense out of it
+
+class Dictionary(object):
+ def __init__(self, dictFile):
+ self.filename = dictFile
+ self.size = 0
+ self.fo = file(dictFile,'rb')
+ self.stable = []
+ self.size = readEncodedNumber(self.fo)
+ for i in xrange(self.size):
+ self.stable.append(self.escapestr(readString(self.fo)))
+ self.pos = 0
+
+ def escapestr(self, str):
+ str = str.replace('&','&')
+ str = str.replace('<','<')
+ str = str.replace('>','>')
+ str = str.replace('=','=')
+ return str
+
+ def lookup(self,val):
+ if ((val >= 0) and (val < self.size)) :
+ self.pos = val
+ return self.stable[self.pos]
+ else:
+ print "Error - %d outside of string table limits" % val
+ sys.exit(-1)
+
+ def getSize(self):
+ return self.size
+
+ def getPos(self):
+ return self.pos
+
+ def dumpDict(self):
+ for i in xrange(self.size):
+ print "%d %s %s" % (i, convert(i), self.stable[i])
+ return
+
+# parses the xml snippets that are represented by each page*.dat file.
+# also parses the other0.dat file - the main stylesheet
+# and information used to inject the xml snippets into page*.dat files
+
+class PageParser(object):
+ def __init__(self, filename, dict, debug, flat_xml):
+ self.fo = file(filename,'rb')
+ self.id = os.path.basename(filename).replace('.dat','')
+ self.dict = dict
+ self.debug = debug
+ self.flat_xml = flat_xml
+ self.tagpath = []
+ self.doc = []
+ self.snippetList = []
+
+
+ # hash table used to enable the decoding process
+ # This has all been developed by trial and error so it may still have omissions or
+ # contain errors
+ # Format:
+ # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
+
+ token_tags = {
+ 'book' : (1, 'snippets', 1, 0),
+ 'version' : (1, 'snippets', 1, 0),
+ 'stylesheet' : (1, 'snippets', 1, 0),
+ 'links' : (0, 'number', 0, 1),
+ 'pages' : (0, 'number', 0, 1),
+ 'page' : (1, 'snippets', 1, 0),
+ 'group' : (1, 'snippets', 1, 0),
+ 'region' : (1, 'snippets', 1, 0),
+ 'reflow' : (1, 'number', 1, 0),
+ 'img' : (1, 'snippets', 1, 0),
+ 'paragraph' : (1, 'snippets', 1, 0),
+ 'extratokens' : (1, 'snippets', 1, 0),
+ 'style' : (1, 'snippets', 1, 0),
+ 'rule' : (1, 'snippets', 1, 0),
+ '_span' : (1, 'snippets', 1, 0),
+ 'word_semantic': (1, 'snippets', 1, 1),
+ 'value' : (1, 'scalar_text', 0, 0),
+ 'h' : (1, 'scalar_number', 0, 0),
+ 'w' : (1, 'scalar_number', 0, 0),
+ 'firstWord' : (1, 'scalar_number', 0, 0),
+ 'lastWord' : (1, 'scalar_number', 0, 0),
+ 'x' : (1, 'number', 0, 0),
+ 'y' : (1, 'number', 0, 0),
+ 'links.page' : (1, 'number', 0, 0),
+ 'link_id' : (1, 'number', 0, 0),
+ 'glyph' : (0, 'number', 1, 1),
+ 'glyph.h' : (1, 'number', 0, 0),
+ 'glyph.w' : (1, 'number', 0, 0),
+ 'sh' : (1, 'number', 0, 0),
+ 'word' : (0, 'number', 1, 1),
+ 'src' : (1, 'scalar_number', 0, 0),
+ 'rel' : (1, 'number', 0, 0),
+ 'row' : (1, 'number', 0, 0),
+ 'startID' : (1, 'number', 0, 1),
+ 'startID.page' : (1, 'number', 0, 0),
+ 'glyphID' : (1, 'number', 0, 0),
+ 'rootID' : (1, 'number', 0, 0),
+ 'stemID' : (1, 'number', 0, 0),
+ 'margin-top' : (1, 'number', 0, 0),
+ 'stemPage' : (1, 'number', 0, 0),
+ 'dehyphen' : (1, 'number', 1, 1),
+ 'rootID' : (1, 'number', 0, 0),
+ 'paraCont' : (1, 'number', 1, 1),
+ 'paraStems' : (1, 'number', 1, 1),
+ 'wordStems' : (1, 'number', 1, 1),
+ 'original' : (0, 'number', 0, 1),
+ 'use' : (1, 'number', 0, 0),
+ 'vtx' : (1, 'number', 0, 1),
+ 'len' : (1, 'number', 0, 1),
+ 'dpi' : (1, 'number', 0, 0),
+ 'n' : (1, 'number', 0, 0),
+ 'id' : (1, 'number', 0, 0),
+ 'ref' : (1, 'number', 0, 0),
+ 'pnum' : (1, 'number', 0, 0),
+ 'pid' : (1, 'text', 0, 0),
+ 'info' : (0, 'number', 1, 0),
+ 'bl' : (1, 'raw', 0, 0),
+ 'firstGlyph' : (1, 'raw', 0, 0),
+ 'lastGlyph' : (1, 'raw', 0, 0),
+ 'ocrText' : (1, 'text', 0, 0),
+ 'title' : (1, 'text', 0, 0),
+ 'href' : (1, 'text', 0, 0),
+ '_parent_type' : (1, 'text', 0, 0),
+ 'attr' : (1, 'scalar_text', 0, 0),
+ 'justify' : (1, 'scalar_text', 0, 0),
+ 'align' : (1, 'scalar_text', 0, 0),
+ 'layout' : (1, 'scalar_text', 0, 0),
+ 'pageid' : (1, 'scalar_text', 0, 0),
+ 'pagelabel' : (1, 'scalar_text', 0, 0),
+ 'type' : (1, 'text', 0, 0),
+ 'class' : (1, 'scalar_text', 0, 0),
+ 'container' : (1, 'scalar_text', 0, 0),
+ '_after_class' : (1, 'scalar_text', 0, 0),
+ '_tag' : (1, 'scalar_text', 0, 0),
+ 'pos' : (1, 'scalar_text', 0, 0),
+ 'page_num' : (1, 'scalar_text', 0, 0),
+ 'page_type' : (1, 'scalar_text', 0, 0),
+ 'findlists' : (1, 'scalar_text', 0, 0),
+ 'FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'Schema_id' : (1, 'scalar_text', 0, 0),
+ 'Schema_version' : (1, 'scalar_text', 0, 0),
+ 'Topaz_version' : (1, 'scalar_text', 0, 0),
+ 'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'chapterheaders' : (1, 'scalar_text', 0, 0),
+ 'creation_date' : (1, 'scalar_text', 0, 0),
+ 'header_footer' : (1, 'scalar_text', 0, 0),
+ 'init_from_ocr' : (1, 'scalar_text', 0, 0),
+ 'letter_insertion' : (1, 'scalar_text', 0, 0),
+ 'xmlinj_convert' : (1, 'scalar_text', 0, 0),
+ 'xmlinj_reflow' : (1, 'scalar_text', 0, 0),
+ 'xmlinj_transform' : (1, 'scalar_text', 0, 0),
+ }
+
+
+ # full tag path record keeping routines
+ def tag_push(self, token):
+ self.tagpath.append(token)
+ def tag_pop(self):
+ if len(self.tagpath) > 0 :
+ self.tagpath.pop()
+ def tagpath_len(self):
+ return len(self.tagpath)
+ def get_tagpath(self, i):
+ cnt = len(self.tagpath)
+ if i < cnt : result = self.tagpath[i]
+ for j in xrange(i+1, cnt) :
+ result += '.' + self.tagpath[j]
+ return result
+
+
+    # list of absolute command byte values that indicate
+    # various types of loop mechanisms typically used to generate vectors
+
+ cmd_list = (0x76, 0x76)
+
+ # peek at and return 1 byte that is ahead by i bytes
+ def peek(self, aheadi):
+ c = self.fo.read(aheadi)
+ if (len(c) == 0):
+ return None
+ self.fo.seek(-aheadi,1)
+ c = c[-1:]
+ return ord(c)
+
+
+ # get the next value from the file being processed
+ def getNext(self):
+ nbyte = self.peek(1);
+ if (nbyte == None):
+ return None
+ val = readEncodedNumber(self.fo)
+ return val
+
+
+ # format an arg by argtype
+ def formatArg(self, arg, argtype):
+ if (argtype == 'text') or (argtype == 'scalar_text') :
+ result = self.dict.lookup(arg)
+ elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') :
+ result = arg
+ elif (argtype == 'snippets') :
+ result = arg
+ else :
+ print "Error Unknown argtype %s" % argtype
+ sys.exit(-2)
+ return result
+
+
+ # process the next tag token, recursively handling subtags,
+ # arguments, and commands
+ def procToken(self, token):
+
+ known_token = False
+ self.tag_push(token)
+
+ if self.debug : print 'Processing: ', self.get_tagpath(0)
+ cnt = self.tagpath_len()
+ for j in xrange(cnt):
+ tkn = self.get_tagpath(j)
+ if tkn in self.token_tags :
+ num_args = self.token_tags[tkn][0]
+ argtype = self.token_tags[tkn][1]
+ subtags = self.token_tags[tkn][2]
+ splcase = self.token_tags[tkn][3]
+ ntags = -1
+ known_token = True
+ break
+
+ if known_token :
+
+ # handle subtags if present
+ subtagres = []
+ if (splcase == 1):
+            # this type of tag uses an escape marker (0x74) to indicate the subtag count
+ if self.peek(1) == 0x74:
+ skip = readEncodedNumber(self.fo)
+ subtags = 1
+ num_args = 0
+
+ if (subtags == 1):
+ ntags = readEncodedNumber(self.fo)
+ if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
+ for j in xrange(ntags):
+ val = readEncodedNumber(self.fo)
+ subtagres.append(self.procToken(self.dict.lookup(val)))
+
+ # arguments can be scalars or vectors of text or numbers
+ argres = []
+ if num_args > 0 :
+ firstarg = self.peek(1)
+ if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
+ # single argument is a variable length vector of data
+ arg = readEncodedNumber(self.fo)
+ argres = self.decodeCMD(arg,argtype)
+ else :
+ # num_arg scalar arguments
+ for i in xrange(num_args):
+ argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))
+
+ # build the return tag
+ result = []
+ tkn = self.get_tagpath(0)
+ result.append(tkn)
+ result.append(subtagres)
+ result.append(argtype)
+ result.append(argres)
+ self.tag_pop()
+ return result
+
+ # all tokens that need to be processed should be in the hash
+ # table if it may indicate a problem, either new token
+ # or an out of sync condition
+ else:
+ result = []
+ if (self.debug):
+ print 'Unknown Token:', token
+ self.tag_pop()
+ return result
+
+
+    # special loop used to process code snippets
+    # it is NEVER used to format arguments.
+    # builds the snippetList as [index, parsed-tag] pairs
+    def doLoop72(self, argtype):
+        cnt = readEncodedNumber(self.fo)
+        if self.debug :
+            result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
+            result += 'of the document is indicated by snippet number sets at the\n'
+            result += 'end of each snippet. \n'
+            print result
+        for i in xrange(cnt):
+            if self.debug: print 'Snippet:',str(i)
+            snippet = []
+            snippet.append(i)
+            val = readEncodedNumber(self.fo)
+            snippet.append(self.procToken(self.dict.lookup(val)))
+            self.snippetList.append(snippet)
+        return
+
+
+    # loop mode 0x00: pass the values through unchanged
+    # DO NOT CHANGE - this has proven to be correct
+    def doLoop76Mode0(self, argtype, cnt):
+        result = []
+        for i in xrange(cnt):
+            result.append(self.formatArg(readEncodedNumber(self.fo), argtype))
+        return result
+
+
+    # loop mode 0x01: values relative to the *negative*
+    # of the offset - don't ask why - it just is
+    # DO NOT CHANGE - this has proven to be correct
+    def doLoop76Mode1(self, argtype, cnt):
+        result = []
+        offset = -readEncodedNumber(self.fo)
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo) + offset
+            result.append(self.formatArg(val, argtype))
+        return result
+
+
+    # loop mode 0x02: starting value followed by deltas (accumulation)
+    # DO NOT CHANGE - this has proven to be the correct
+    def doLoop76Mode2(self, argtype, cnt):
+        result = []
+        ptr = readEncodedNumber(self.fo)
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            ptr = ptr + readEncodedNumber(self.fo)
+            result.append(self.formatArg(ptr, argtype))
+        return result
+
+
+    # loop mode 0x03: starting value and accumulation
+    # **after** subtracting an adjustment value from each delta
+    # DO NOT CHANGE - this has been proven to be correct
+    def doLoop76Mode3(self, argtype, cnt):
+        result = []
+        adj = readEncodedNumber(self.fo)
+        ptr = readEncodedNumber(self.fo)
+        ptr = ptr - adj
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            ptr = ptr + readEncodedNumber(self.fo) - adj
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # loop mode 0x04: running sum of the data values, itself accumulated
+    # onto the pointer to get each new value
+    # Again, don't ask - it took me forever to figure this out
+    # DO NOT CHANGE - this has been proven to be correct
+    def doLoop76Mode4(self, argtype, cnt):
+        result = []
+        val = readEncodedNumber(self.fo)
+        runsum = val
+        ptr = val
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            runsum += readEncodedNumber(self.fo)
+            ptr = ptr + runsum
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # loop mode 0x05: an extra value is used as an adjustment; a running
+    # sum of (value - adjustment) is accumulated onto the pointer to get
+    # each new value
+    def doLoop76Mode5(self, argtype, cnt):
+        result = []
+        adj = readEncodedNumber(self.fo)
+        ptr = 0
+        runsum = 0
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo)
+            runsum += (val - adj)
+            ptr = ptr +runsum
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # loop mode 0x06
+    # FIXME: I have only 4 points to work this out with inside my book
+    # so the formula (3*prev + val + 1) may be wrong, but it is correct
+    # for those 4 points
+    def doLoop76Mode6(self, argtype, cnt):
+        result = []
+        oldval = 0
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo)
+            ptr= (3 * oldval) + val + 1
+            result.append(self.formatArg(ptr,argtype))
+            oldval = val
+        return result
+
+
+
+ # dispatches loop commands bytes with various modes
+ # The 0x76 style loops are used to build vectors
+
+ # This was all derived by trial and error and
+ # new loop types may exist that are not handled here
+ # since they did not appear in the test cases
+
+ def decodeCMD(self, cmd, argtype):
+
+ # if (cmd == 0x72):
+ # self.doLoop72(argtype)
+ # result =[]
+ # return result
+
+ if (cmd == 0x76):
+ # loop with cnt, and mode to control loop styles
+ cnt = readEncodedNumber(self.fo)
+ mode = readEncodedNumber(self.fo)
+
+ if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
+
+ if (mode == 0x00):
+ return self.doLoop76Mode0(argtype, cnt)
+
+ elif (mode == 0x01):
+ return self.doLoop76Mode1(argtype, cnt)
+
+ elif (mode == 0x02):
+ return self.doLoop76Mode2(argtype, cnt)
+
+ elif (mode == 0x03):
+ return self.doLoop76Mode3(argtype, cnt)
+
+ elif (mode == 0x04):
+ return self.doLoop76Mode4(argtype, cnt)
+
+ elif (mode == 0x05):
+ return self.doLoop76Mode5(argtype, cnt)
+
+ elif (mode == 0x06):
+ return self.doLoop76Mode6(argtype, cnt)
+
+ else:
+
+ if self.debug :
+ # try to mark any unknown loop comands
+ # if they exist, unless they are used to process
+ # text or some other known list, we won't be able to prove them correct
+ print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode)
+ for i in xrange(cnt):
+ val = readEncodedNumber(self.fo)
+ print ' 0x%x' % val,
+ print ' '
+ result = []
+ return result
+
+ if self.dbug: print "Unknown command", cmd
+ result = []
+ return result
+
+    # add full tag path to injected snippets: returns a copy of tag with
+    # 'prefix.' prepended to its name, applied recursively to all subtags
+    def updateName(self, tag, prefix):
+        name = tag[0]
+        subtagList = tag[1]
+        argtype = tag[2]
+        argList = tag[3]
+        nname = prefix + '.' + name
+        nsubtaglist = []
+        for j in subtagList:
+            nsubtaglist.append(self.updateName(j,prefix))
+        ntag = []
+        ntag.append(nname)
+        ntag.append(nsubtaglist)
+        ntag.append(argtype)
+        ntag.append(argList)
+        return ntag
+
+
+
+    # perform depth first injection of specified snippets into this one:
+    # the arg list of a snippet-bearing tag holds indices into snippetList;
+    # each referenced snippet is recursively injected and becomes a subtag
+    # NOTE(review): argList is treated as snippet indices regardless of
+    # argtype here - presumably only snippet tags reach this path; verify
+    def injectSnippets(self, snippet):
+        snipno, tag = snippet
+        name = tag[0]
+        subtagList = tag[1]
+        argtype = tag[2]
+        argList = tag[3]
+        nsubtagList = []
+        if len(argList) > 0 :
+            for j in argList:
+                asnip = self.snippetList[j]
+                aso, atag = self.injectSnippets(asnip)
+                atag = self.updateName(atag, name)
+                nsubtagList.append(atag)
+            argtype='number'
+            argList=[]
+        if len(nsubtagList) > 0 :
+            subtagList.extend(nsubtagList)
+        tag = []
+        tag.append(name)
+        tag.append(subtagList)
+        tag.append(argtype)
+        tag.append(argList)
+        snippet = []
+        snippet.append(snipno)
+        snippet.append(tag)
+        return snippet
+
+
+
+ # format the tag for output
+ def formatTag(self, node):
+ name = node[0]
+ subtagList = node[1]
+ argtype = node[2]
+ argList = node[3]
+ fullpathname = name.split('.')
+ nodename = fullpathname.pop()
+ ilvl = len(fullpathname)
+ indent = ' ' * (3 * ilvl)
+ result = indent + '<' + nodename + '>'
+ if len(argList) > 0:
+ argres = ''
+ for j in argList:
+ if (argtype == 'text') or (argtype == 'scalar_text') :
+ argres += j + '|'
+ else :
+ argres += str(j) + ','
+ argres = argres[0:-1]
+ if argtype == 'snippets' :
+ result += 'snippets:' + argres
+ else :
+ result += argres
+ if len(subtagList) > 0 :
+ result += '\n'
+ for j in subtagList:
+ if len(j) > 0 :
+ result += self.formatTag(j)
+ result += indent + '' + nodename + '>\n'
+ else:
+ result += '' + nodename + '>\n'
+ return result
+
+
+    # flatten the tag tree into 'dotted.path=value|value|...' lines,
+    # one line per tag, concatenating recursively over subtags
+    def flattenTag(self, node):
+        name = node[0]
+        subtagList = node[1]
+        argtype = node[2]
+        argList = node[3]
+        result = name
+        if (len(argList) > 0):
+            argres = ''
+            for j in argList:
+                if (argtype == 'text') or (argtype == 'scalar_text') :
+                    argres += j + '|'
+                else :
+                    argres += str(j) + '|'
+            argres = argres[0:-1]
+            if argtype == 'snippets' :
+                result += '.snippets=' + argres
+            else :
+                result += '=' + argres
+        result += '\n'
+        for j in subtagList:
+            if len(j) > 0 :
+                result += self.flattenTag(j)
+        return result
+
+
+    # create the xml output: flat 'path=value' lines when flat_xml is
+    # true, otherwise indented xml via formatTag
+    def formatDoc(self, flat_xml):
+        result = ''
+        for j in self.doc :
+            if len(j) > 0:
+                if flat_xml:
+                    result += self.flattenTag(j)
+                else:
+                    result += self.formatTag(j)
+        if self.debug : print result
+        return result
+
+
+
+    # main loop - parse the page.dat files
+    # to create structured document and snippets
+
+    # FIXME: value at end of magic appears to be a subtags count
+    # but for what? For now, inject an 'info" tag as it is in
+    # every dictionary and seems close to what is meant
+    # The alternative is to special case the last _ "0x5f" to mean something
+
+    def process(self):
+
+        # peek at the first bytes to see what type of file it is
+        magic = self.fo.read(11)
+        if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
+            first_token = 'info'
+        elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
+            skip = self.fo.read(1)
+            first_token = 'info'
+        else :
+            # other0.dat file: no magic header, rewind and parse from the top
+            first_token = None
+            self.fo.seek(-11,1)
+
+
+        # main loop to read and build the document tree
+        while True:
+
+            if first_token != None :
+                # use "inserted" first token 'info' for page and glyph files
+                tag = self.procToken(first_token)
+                if len(tag) > 0 :
+                    self.doc.append(tag)
+                first_token = None
+
+            v = self.getNext()
+            if (v == None):
+                break
+
+            if (v == 0x72):
+                self.doLoop72('number')
+            elif (v > 0) and (v < self.dict.getSize()) :
+                tag = self.procToken(self.dict.lookup(v))
+                if len(tag) > 0 :
+                    self.doc.append(tag)
+            else:
+                if self.debug:
+                    # NOTE(review): "Mina" is presumably a typo for "Main";
+                    # left untouched because it is a runtime string
+                    print "Mina Loop:  Unknown value: %x" % v
+
+
+        # now do snippet injection (snippet 0 is the document root)
+        if len(self.snippetList) > 0 :
+            if self.debug : print 'Injecting Snippets:'
+            snippet = self.injectSnippets(self.snippetList[0])
+            snipno = snippet[0]
+            tag_add = snippet[1]
+            if self.debug : print self.formatTag(tag_add)
+            if len(tag_add) > 0:
+                self.doc.append(tag_add)
+
+        # handle generation of xml output
+        xmlpage = self.formatDoc(self.flat_xml)
+
+        return xmlpage
+
+
+
+def usage():
+    """Print command-line help for convert2xml."""
+    print 'Usage: '
+    print '    convert2xml.py dict0000.dat infile.dat '
+    print ' '
+    print ' Options:'
+    print '   -h            print this usage help message '
+    print '   -d            turn on debug output to check for potential errors '
+    print '   --flat-xml    output the flattened xml page description only '
+    print ' '
+    print '     This program will attempt to convert a page*.dat file or '
+    print 'glyphs*.dat file, using the dict0000.dat file, to its xml description. '
+    print ' '
+    print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump '
+    print ' the *.dat files from a Topaz format e-book.'
+
+#
+# Main
+#
+
+def main(argv):
+    # argv == '' means "run as a script": fall back to sys.argv and print
+    # the resulting xml; a non-empty string is split into argument words so
+    # sibling modules can call e.g. main('convert2xml.py --flat-xml d.dat p.dat')
+    dictFile = ""
+    pageFile = ""
+    debug = False
+    flat_xml = False
+    printOutput = False
+    if len(argv) == 0:
+        printOutput = True
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
+
+    except getopt.GetoptError, err:
+
+        # print help information and exit:
+        print str(err) # will print something like "option -a not recognized"
+        usage()
+        sys.exit(2)
+
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o =="-d":
+            debug=True
+        if o =="-h":
+            usage()
+            sys.exit(0)
+        if o =="--flat-xml":
+            flat_xml = True
+
+    # NOTE(review): raises IndexError if fewer than two positional args are given
+    dictFile, pageFile = args[0], args[1]
+
+    # read in the string table dictionary
+    dict = Dictionary(dictFile)
+
+    # create a page parser
+    pp = PageParser(pageFile, dict, debug, flat_xml)
+
+    xmlpage = pp.process()
+
+    if printOutput:
+        print xmlpage
+        return 0
+
+    return xmlpage
+
+# script entry point: '' makes main() use sys.argv and print the result
+if __name__ == '__main__':
+    sys.exit(main(''))
diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py
new file mode 100644
index 0000000..f038310
--- /dev/null
+++ b/Topaz_Tools/lib/decode_meta.py
@@ -0,0 +1,109 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+#
+# Get a 7 bit encoded number from string
+#
+
def readEncodedNumber(file):
    """Read one 7-bit variable-length encoded integer from *file*.

    An 0xFF prefix byte marks a negative number; in every other byte the
    high bit set means "continuation bytes follow" and the low 7 bits are
    payload (big-endian order).  Returns None if the stream runs dry.
    """
    negative = False
    byte = file.read(1)
    if not byte:
        return None
    value = ord(byte)

    if value == 0xFF:
        # negative-number marker: the magnitude follows
        negative = True
        byte = file.read(1)
        if not byte:
            return None
        value = ord(byte)

    if value >= 0x80:
        # accumulate 7 bits per byte while the continuation bit is set
        accum = value & 0x7F
        while value >= 0x80:
            byte = file.read(1)
            if not byte:
                return None
            value = ord(byte)
            accum = (accum << 7) | (value & 0x7F)
        value = accum

    return -value if negative else value
+
+#
+# Encode a number in 7 bit format
+#
+
def encodeNumber(number):
    """Encode an integer into the 7-bit variable-length format.

    Emits 7 payload bits per byte, most significant group first; every
    byte except the last has its high bit set, and negative numbers are
    prefixed with 0xFF.  NOTE: negatives are stored offset by one
    (-n encodes n+1), mirroring the historical on-disk format.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    chunks = []
    continuation = 0
    while True:
        chunks.append(chr((number & 0x7F) + continuation))
        number >>= 7
        continuation = 0x80       # every byte after the lowest is marked
        if number == 0:
            break

    if negative:
        chunks.append(chr(0xFF))  # becomes the leading marker after reversal

    return ''.join(reversed(chunks))
+
+#
+# Get a length prefixed string from the file
+#
+# return data prefixed with its 7-bit-encoded length
+def lengthPrefixString(data):
+    return encodeNumber(len(data))+data
+
def readString(file):
    """Read a length-prefixed string from *file*.

    Returns None when no length could be read (end of stream) and an
    empty string when the stream is shorter than the declared length.
    """
    length = readEncodedNumber(file)
    if length is None:
        return None
    raw = file.read(length)
    if len(raw) != length:
        return ""
    return unpack(str(length) + "s", raw)[0]
+
+
+
def getMetaArray(metaFile):
    """Parse the Topaz metadata file into a dict of {key: value} strings.

    The file is a 7-bit-encoded count followed by that many key/value
    string pairs (see readString).
    """
    result = {}
    # FIX: use open() + 'with' instead of the deprecated file() builtin;
    # the handle previously leaked if a read raised mid-parse
    with open(metaFile, 'rb') as fo:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            temp = readString(fo)
            result[temp] = readString(fo)
    return result
+
+
+
def getMetaData(metaFile):
    """Parse the Topaz metadata file into 'key|value' lines, one per pair."""
    result = ''
    # FIX: use open() + 'with' instead of the deprecated file() builtin;
    # the handle previously leaked if a read raised mid-parse
    with open(metaFile, 'rb') as fo:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            result += readString(fo) + '|'
            result += readString(fo) + '\n'
    return result
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py
new file mode 100644
index 0000000..1a800e8
--- /dev/null
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -0,0 +1,299 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+    """Walk one page's flattened-xml description and emit an html fragment.
+
+    NOTE(review): several string literals in this class (html tags such as
+    paragraph/anchor/image markup) were garbled when this patch was
+    extracted; the affected literals are preserved as found and flagged.
+    """
+    def __init__(self, flatxml, fileid):
+        # page id taken from the source file name, e.g. 'page0001'
+        self.id = os.path.basename(fileid).replace('.dat','')
+        self.flatdoc = flatxml.split('\n')
+        self.ocrtext = []
+        self.link_id = []
+        self.link_title = []
+        self.link_page = []
+        self.dehyphen_rootid = []
+        self.paracont_stemid = []
+        self.parastems_stemid = []
+
+
+
+    # find tag if within pos to end inclusive; returns (index, value) or
+    # (-1, None) when not found
+    # NOTE(review): split('=') assumes at most one '=' per line
+    def findinDoc(self, tagpath, pos, end) :
+        result = None
+        docList = self.flatdoc
+        cnt = len(docList)
+        if end == -1 :
+            end = cnt
+        else:
+            end = min(cnt,end)
+        foundat = -1
+        for j in xrange(pos, end):
+            item = docList[j]
+            if item.find('=') >= 0:
+                (name, argres) = item.split('=')
+            else :
+                name = item
+                argres = ''
+            if name.endswith(tagpath) :
+                result = argres
+                foundat = j
+                break
+        return foundat, result
+
+
+    # return list of start positions for the tagpath
+    def posinDoc(self, tagpath):
+        startpos = []
+        pos = 0
+        res = ""
+        while res != None :
+            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+            if res != None :
+                startpos.append(foundpos)
+            pos = foundpos + 1
+        return startpos
+
+
+    # get a description of the paragraph: (css class, first word, last word)
+    def getParaDescription(self, start, end):
+        # normal paragraph
+        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
+
+        # class names are an issue given topaz starts them with numerals (not allowed)
+        # use a mix of cases, (which cause some browsers problems), and actually
+        # attach numbers after "reclustered*" to the end to deal with reflow issues
+        # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered
+        # NOTE(review): pclass may be None here if no class tag was found -
+        # .lower() would raise; verify against real page data
+        pclass = pclass.lower()
+        pclass = 'cl_' + pclass
+        p = pclass.find('reclustered')
+        if p > 0 : pclass = pclass[0:p+11]
+
+        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
+        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+        if (sfirst != None) and (slast != None) :
+            return pclass, int(sfirst), int(slast)
+
+        # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well
+        # so walk through this region keeping track of the first firstword, and the last lastWord
+        # on any items that have it
+        (pos, sfirst) = self.findinDoc('firstWord',start, end)
+        first = int(sfirst)
+        last = -1
+        for i in xrange(pos+1,end):
+            (pos, slast) = self.findinDoc('lastWord',i,i+1)
+            if slast != None:
+                last = int(slast)
+        return pclass, first, last
+
+
+    # assemble the words [first, last) of ocrtext into one html paragraph,
+    # resolving links, dehyphenation and line breaks
+    # NOTE(review): the paragraph/anchor tag literals below were garbled in
+    # this patch (e.g. the '<p>' markup is missing from the broken strings)
+    def buildParagraph(self, cname, first, last, type, regtype) :
+        parares = ''
+        sep =''
+        br_lb = False
+        if (regtype == 'fixed') or (regtype == 'chapterheading') :
+            br_lb = True
+        handle_links = False
+        if len(self.link_id) > 0:
+            handle_links = True
+        if (type == 'full') or (type == 'begin') :
+            parares += '
'
+        if (type == 'end'):
+            parares += ' '
+        for j in xrange(first, last) :
+            word = self.ocrtext[j]
+            sep = ' '
+
+            if handle_links:
+                link = self.link_id[j]
+                if (link > 0):
+                    title = self.link_title[link-1]
+                    if title == "": title='_link_'
+                    ptarget = self.link_page[link-1] - 1
+                    linkhtml = '' % ptarget
+                    linkhtml += title + ''
+                    pos = parares.rfind(title)
+                    if pos >= 0:
+                        parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
+                    else :
+                        parares += linkhtml
+                    if word == '_link_' : word = ''
+                elif (link < 0) :
+                    if word == '_link_' : word = ''
+
+            if word == '_lb_':
+                if (j-1) in self.dehyphen_rootid :
+                    word = ''
+                    sep = ''
+                elif handle_links :
+                    word = ''
+                    sep = ''
+                elif br_lb :
+                    word = ' \n'
+                    sep = ''
+                else :
+                    word = '\n'
+                    sep = ''
+
+            if j in self.dehyphen_rootid :
+                word = word[0:-1]
+                sep = ''
+
+            parares += word + sep
+
+        if len(sep) > 0 : parares = parares[0:-1]
+        if (type == 'full') or (type == 'end') :
+            parares += '
'
+        return parares
+
+
+
+    # walk the document tree collecting the information needed
+    # to build an html page using the ocrText
+
+    def process(self):
+
+        htmlpage = ''
+
+        # first collect information from the xml doc that describes this page
+        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
+        if argres :  self.ocrtext = argres.split('|')
+
+        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
+        if argres:
+            argList = argres.split('|')
+            self.dehyphen_rootid = [ int(strval) for strval in argList]
+
+        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+        if self.parastems_stemid == None : self.parastems_stemid = []
+
+        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+        if self.paracont_stemid == None : self.paracont_stemid = []
+
+
+        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
+        if argres:
+            argList = argres.split('|')
+            self.link_id = [ int(strval) for strval in argList]
+
+        (pos, argres) = self.findinDoc('info.links.page',0,-1)
+        if argres :
+            argList = argres.split('|')
+            self.link_page = [ int(strval) for strval in argList]
+
+        (pos, argres) = self.findinDoc('info.links.title',0,-1)
+        if argres :
+            self.link_title = argres.split('|')
+        else:
+            self.link_title.append('')
+
+        (pos, pagetype) = self.findinDoc('page.type',0,-1)
+
+
+        # generate a list of each region starting point
+        # each region has one paragraph, or one image, or one chapterheading
+        regionList= self.posinDoc('region')
+        regcnt = len(regionList)
+        regionList.append(-1)
+
+        anchorSet = False
+        breakSet = False
+
+        # process each region tag and convert what you can to html
+
+        for j in xrange(regcnt):
+            start = regionList[j]
+            end = regionList[j+1]
+
+            (pos, regtype) = self.findinDoc('region.type',start,end)
+
+            # NOTE(review): this branch is garbled in the patch - image
+            # output markup was lost and 'pclass', 'first', 'last' are used
+            # before assignment below
+            if regtype == 'graphic' :
+                if not anchorSet:
+                    htmlpage += '
\n'
+                breakSet = True
+                if not anchorSet:
+                    htmlpage += '
 
\n'
+                    anchorSet = True
+                tag = 'h1'
+                if pclass[3:7] == 'ch1-' : tag = 'h1'
+                if pclass[3:7] == 'ch2-' : tag = 'h2'
+                if pclass[3:7] == 'ch3-' : tag = 'h3'
+                htmlpage += '<' + tag + ' class="' + pclass + '">'
+                htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype)
+                htmlpage += '' + tag + '>'
+
+            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
+                ptype = 'full'
+                # check to see if this is a continution from the previous page
+                if (len(self.parastems_stemid) > 0):
+                    ptype = 'end'
+                    self.parastems_stemid=[]
+                else:
+                    if not anchorSet:
+                        htmlpage += '
 
\n'
+                        anchorSet = True
+                (pclass, first, last) = self.getParaDescription(start,end)
+                if ptype == 'full' :
+                    tag = 'p'
+                    if pclass[3:6] == 'h1-' : tag = 'h4'
+                    if pclass[3:6] == 'h2-' : tag = 'h5'
+                    if pclass[3:6] == 'h3-' : tag = 'h6'
+                    htmlpage += '<' + tag + ' class="' + pclass + '">'
+                    htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype)
+                    htmlpage += '' + tag + '>'
+                else :
+                    htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+
+
+            elif (regtype == 'tocentry') :
+                ptype = 'full'
+                # check to see if this is a continution from the previous page
+                if (len(self.parastems_stemid) > 0) and (j == 0):
+                    # process the first paragraph as a continuation from the last page
+                    ptype = 'end'
+                    self.parastems_stemid = []
+                else:
+                    if not anchorSet:
+                        htmlpage += '
 
\n'
+                        anchorSet = True
+                (pclass, first, last) = self.getParaDescription(start,end)
+                htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+
+            else :
+                print 'Unknown region type', regtype
+                print 'Warning: skipping this region'
+
+        # if the page ends mid-paragraph, trim the trailing close tag so the
+        # next page can continue it
+        if len(self.paracont_stemid) > 0 :
+            if htmlpage[-4:] == '
':
+                htmlpage = htmlpage[0:-4]
+
+        return htmlpage
+
+
+        # NOTE(review): unreachable - left over from an earlier structure
+        return self.convert2HTML()
+
+
+
def convert2HTML(flatxml, fileid):
    """Glue function: parse one flattened-xml page and return its html."""
    parser = DocParser(flatxml, fileid)
    return parser.process()
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py
new file mode 100644
index 0000000..be50aae
--- /dev/null
+++ b/Topaz_Tools/lib/genhtml.py
@@ -0,0 +1,125 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+import stylexml2css
+
+
+def usage():
+    """Print command-line help for genhtml."""
+    print 'Usage: '
+    print ' '
+    print '   genhtml.py unencryptedBookDir'
+    print '  '
+
+
+
+def main(argv):
+    # builds book.html from the unencrypted book directory; argv=='' means
+    # run as a script (use sys.argv), otherwise split the string into words
+    # NOTE(review): the html/head/body/style string literals below were
+    # garbled when this patch was extracted; preserved as found
+    bookDir = ''
+
+    if len(argv) == 0:
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "h:")
+
+    except getopt.GetoptError, err:
+        print str(err)
+        usage()
+        sys.exit(2)
+
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o =="-h":
+            usage()
+            sys.exit(0)
+
+    bookDir = args[0]
+
+    if not os.path.exists(bookDir) :
+        print "Can not find directory with unencrypted book"
+        sys.exit(-1)
+
+    dictFile = os.path.join(bookDir,'dict0000.dat')
+
+    if not os.path.exists(dictFile) :
+        print "Can not find dict0000.dat file"
+        sys.exit(-1)
+
+    pageDir = os.path.join(bookDir,'page')
+    if not os.path.exists(pageDir) :
+        print "Can not find page directory in unencrypted book"
+        sys.exit(-1)
+
+    imgDir = os.path.join(bookDir,'img')
+    if not os.path.exists(imgDir) :
+        print "Can not find image directory in unencrypted book"
+        sys.exit(-1)
+
+    otherFile = os.path.join(bookDir,'other0000.dat')
+    if not os.path.exists(otherFile) :
+        print "Can not find other0000.dat in unencrypted book"
+        sys.exit(-1)
+
+    metaFile = os.path.join(bookDir,'metadata0000.dat')
+    if not os.path.exists(metaFile) :
+        print "Can not find metadata0000.dat in unencrypted book"
+        sys.exit(-1)
+
+
+    htmlFileName = "book.html"
+    htmlstr = '\n'
+
+    filenames = os.listdir(pageDir)
+    filenames = sorted(filenames)
+
+    print 'Processing ... '
+
+    htmlstr += '\n'
+
+    # decode the metadata both as text (saved next to the book) and as an
+    # array used while building the page
+    print '     ', 'metadata0000.dat'
+    fname = os.path.join(bookDir,'metadata0000.dat')
+    xname = os.path.join(bookDir, 'metadata.txt')
+    metastr = decode_meta.getMetaData(fname)
+    file(xname, 'wb').write(metastr)
+    meta_array = decode_meta.getMetaArray(fname)
+    htmlstr += '\n'
+    htmlstr += '\n'
+
+    # convert the book-wide style description to css
+    print '     ', 'other0000.dat'
+    fname = os.path.join(bookDir,'other0000.dat')
+    xname = os.path.join(bookDir, 'style.css')
+    xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+    cssstr = '\n'
+    file(xname, 'wb').write(cssstr)
+    htmlstr += cssstr
+    htmlstr += '\n\n'
+
+    # convert every page description to html and append it
+    for filename in filenames:
+        print '     ', filename
+        fname = os.path.join(pageDir,filename)
+        flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+        htmlstr += flatxml2html.convert2HTML(flat_xml, fname)
+
+    htmlstr +=  '\n\n'
+
+    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+    print 'Processing Complete'
+
+    return 0
+
+# script entry point: '' makes main() use sys.argv
+if __name__ == '__main__':
+    sys.exit(main(''))
+
+
diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py
new file mode 100644
index 0000000..7df8043
--- /dev/null
+++ b/Topaz_Tools/lib/gensvg.py
@@ -0,0 +1,295 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+
+
+class GParser(object):
+    """Extracts glyph outline data from a glyphs*.dat flattened description."""
+    def __init__(self, flatxml):
+        self.flatdoc = flatxml.split('\n')
+        # output resolution used to rescale glyph coordinates
+        self.dpi = 1440
+        self.gh = self.getData('info.glyph.h')
+        self.gw = self.getData('info.glyph.w')
+        self.guse = self.getData('info.glyph.use')
+        self.count = len(self.guse)
+        self.gvtx = self.getData('info.glyph.vtx')
+        self.glen = self.getData('info.glyph.len')
+        self.gdpi = self.getData('info.glyph.dpi')
+        self.vx = self.getData('info.vtx.x')
+        self.vy = self.getData('info.vtx.y')
+        self.vlen = self.getData('info.len.n')
+        # sentinel entries so the slicing in getPath can use gly+1
+        self.glen.append(len(self.vlen))
+        self.gvtx.append(len(self.vx))
+
+    # look up an exact tag path in the flattened doc; returns its values as
+    # a list of ints, or None if the path is absent
+    def getData(self, path):
+        result = None
+        cnt = len(self.flatdoc)
+        for j in xrange(cnt):
+            item = self.flatdoc[j]
+            if item.find('=') >= 0:
+                (name, argt) = item.split('=')
+                argres = argt.split('|')
+            else:
+                name = item
+                argres = []
+            if (name == path):
+                result = argres
+                break
+        if (len(argres) > 0) :
+            for j in xrange(0,len(argres)):
+                argres[j] = int(argres[j])
+        return result
+
+    # build an svg path string for glyph number gly, scaling the stored
+    # vertices from the glyph dpi up to self.dpi
+    def getPath(self, gly):
+        path = ''
+        if (gly < 0) or (gly >= self.count):
+            return path
+        tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
+        ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
+        p = 0
+        for k in xrange(self.glen[gly], self.glen[gly+1]):
+            if (p == 0):
+                zx = tx[0:self.vlen[k]+1]
+                zy = ty[0:self.vlen[k]+1]
+            else:
+                zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
+                zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
+            p += 1
+            for j in xrange(0, len(zx)):
+                if (j == 0):
+                    path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
+                else:
+                    path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
+            path += 'z'
+        return path
+
+class PParser(object):
+    """Extracts page geometry, glyph placement and images from a page*.dat
+    flattened description (used to build the svg rendition)."""
+    def __init__(self, flatxml):
+        self.flatdoc = flatxml.split('\n')
+        self.temp = []
+        self.ph = self.getData('page.h')[0]
+        self.pw = self.getData('page.w')[0]
+        self.gx = self.getData('info.glyph.x')
+        self.gy = self.getData('info.glyph.y')
+        self.gid = self.getData('info.glyph.glyphID')
+
+    # look up a tag path (suffix match) in the flattened doc; returns its
+    # values as a list of ints, or None if absent
+    def getData(self, path):
+        result = None
+        cnt = len(self.flatdoc)
+        for j in xrange(cnt):
+            item = self.flatdoc[j]
+            if item.find('=') >= 0:
+                (name, argt) = item.split('=')
+                argres = argt.split('|')
+            else:
+                name = item
+                argres = []
+            if (name.endswith(path)):
+                result = argres
+                break
+        if (len(argres) > 0) :
+            for j in xrange(0,len(argres)):
+                argres[j] = int(argres[j])
+        return result
+
+    # like getData but works on (and consumes from) the scratch copy in
+    # self.temp, so repeated calls walk successive matches
+    def getDataTemp(self, path):
+        result = None
+        cnt = len(self.temp)
+        for j in xrange(cnt):
+            item = self.temp[j]
+            if item.find('=') >= 0:
+                (name, argt) = item.split('=')
+                argres = argt.split('|')
+            else:
+                name = item
+                argres = []
+            if (name.endswith(path)):
+                result = argres
+                self.temp.pop(j)
+                break
+        if (len(argres) > 0) :
+            for j in xrange(0,len(argres)):
+                argres[j] = int(argres[j])
+        return result
+
+    # collect one svg string per image region on the page
+    # NOTE(review): the '<image .../>' literal was garbled in this patch -
+    # as preserved, '' % (...) would raise at runtime
+    def getImages(self):
+        result = []
+        self.temp = self.flatdoc
+        while (self.getDataTemp('region.img') != None):
+            h = self.getDataTemp('region.img.h')[0]
+            w = self.getDataTemp('region.img.w')[0]
+            x = self.getDataTemp('region.img.x')[0]
+            y = self.getDataTemp('region.img.y')[0]
+            src = self.getDataTemp('region.img.src')[0]
+            result.append('\n' % (src, x, y, w, h))
+        return result
+
+    # pull, from the shared glyphs.svg file, the definitions for just the
+    # glyph ids used on this page (relies on ids appearing in sorted order)
+    def getGlyphs(self,glyfname):
+        result = []
+        if (self.gid != None) and (len(self.gid) > 0):
+            glyphs = []
+            for j in set(self.gid):
+                glyphs.append(j)
+            glyphs.sort()
+            gfile = open(glyfname, 'r')
+            j = 0
+            while True :
+                inp = gfile.readline()
+                if (inp == ''):
+                    break
+                id='id="gl%d"' % glyphs[j]
+                if (inp.find(id) > 0):
+                    result.append(inp)
+                    j += 1
+                    if (j == len(glyphs)):
+                        break
+            gfile.close()
+        return result
+
+
+
+
+def usage():
+    """Print command-line help for gensvg."""
+    print 'Usage: '
+    print ' '
+    print '   gensvg.py unencryptedBookDir'
+    print '  '
+
+
+def main(argv):
+    # renders the unencrypted book's glyphs and pages as svg files;
+    # argv=='' means run as a script (use sys.argv)
+    # NOTE(review): the svg markup string literals below were garbled when
+    # this patch was extracted (the glyph-writing loop was lost with them);
+    # preserved as found
+    bookDir = ''
+
+    if len(argv) == 0:
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "h:")
+
+    except getopt.GetoptError, err:
+        print str(err)
+        usage()
+        sys.exit(2)
+
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o =="-h":
+            usage()
+            sys.exit(0)
+
+    bookDir = args[0]
+
+    if not os.path.exists(bookDir) :
+        print "Can not find directory with unencrypted book"
+        sys.exit(-1)
+
+    dictFile = os.path.join(bookDir,'dict0000.dat')
+
+    if not os.path.exists(dictFile) :
+        print "Can not find dict0000.dat file"
+        sys.exit(-1)
+
+    pageDir = os.path.join(bookDir,'page')
+    if not os.path.exists(pageDir) :
+        print "Can not find page directory in unencrypted book"
+        sys.exit(-1)
+
+    imgDir = os.path.join(bookDir,'img')
+    if not os.path.exists(imgDir) :
+        print "Can not find image directory in unencrypted book"
+        sys.exit(-1)
+
+    glyphsDir = os.path.join(bookDir,'glyphs')
+    if not os.path.exists(glyphsDir) :
+        print "Can not find glyphs directory in unencrypted book"
+        sys.exit(-1)
+
+    metaFile = os.path.join(bookDir,'metadata0000.dat')
+    if not os.path.exists(metaFile) :
+        print "Can not find metadata0000.dat in unencrypted book"
+        sys.exit(-1)
+
+    svgDir = os.path.join(bookDir,'svg')
+    if not os.path.exists(svgDir) :
+        os.makedirs(svgDir)
+
+
+    print 'Processing Meta Data ... '
+
+    print '     ', 'metadata0000.dat'
+    fname = os.path.join(bookDir,'metadata0000.dat')
+    metadata = decode_meta.getMetaArray(fname)
+
+    print 'Processing Glyphs ... '
+
+    filenames = os.listdir(glyphsDir)
+    filenames = sorted(filenames)
+
+    # build the shared glyphs.svg that pages reference
+    glyfname = os.path.join(svgDir,'glyphs.svg')
+    glyfile = open(glyfname, 'w')
+    glyfile.write('\n')
+    glyfile.write('\n')
+    glyfile.write('\n')
+    glyfile.close()
+
+    print 'Processing Pages ... '
+
+    # output resolution for the per-page svg files
+    scaledpi = 720
+    filenames = os.listdir(pageDir)
+    filenames = sorted(filenames)
+    counter = 0
+    for filename in filenames:
+        print '     ', filename
+        fname = os.path.join(pageDir,filename)
+        flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+        pp = PParser(flat_xml)
+        pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
+        pfile.write('\n')
+        pfile.write('\n')
+        pfile.write('')
+        pfile.close()
+        counter += 1
+
+    print 'Processing Complete'
+
+    return 0
+
+# script entry point: '' makes main() use sys.argv
+if __name__ == '__main__':
+    sys.exit(main(''))
diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py
new file mode 100644
index 0000000..c335e88
--- /dev/null
+++ b/Topaz_Tools/lib/genxml.py
@@ -0,0 +1,121 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+
+
+def usage():
+ print 'Usage: '
+ print ' '
+ print ' genxml.py dict0000.dat unencryptedBookDir'
+ print ' '
+
+
+
+def main(argv):
+ bookDir = ''
+
+ if len(argv) == 0:
+ argv = sys.argv
+ else :
+ argv = argv.split()
+
+ try:
+ opts, args = getopt.getopt(argv[1:], "h:")
+
+ except getopt.GetoptError, err:
+ print str(err)
+ usage()
+ sys.exit(2)
+
+ if len(opts) == 0 and len(args) == 0 :
+ usage()
+ sys.exit(2)
+
+ for o, a in opts:
+ if o =="-h":
+ usage()
+ sys.exit(0)
+
+ bookDir = args[0]
+
+ if not os.path.exists(bookDir) :
+ print "Can not find directory with unencrypted book"
+ sys.exit(-1)
+
+ dictFile = os.path.join(bookDir,'dict0000.dat')
+ if not os.path.exists(dictFile) :
+ print "Can not find dict0000.dat file"
+ sys.exit(-1)
+
+ pageDir = os.path.join(bookDir,'page')
+ if not os.path.exists(pageDir) :
+ print "Can not find page directory in unencrypted book"
+ sys.exit(-1)
+
+ glyphsDir = os.path.join(bookDir,'glyphs')
+ if not os.path.exists(glyphsDir) :
+ print "Can not find glyphs directory in unencrypted book"
+ sys.exit(-1)
+
+ otherFile = os.path.join(bookDir,'other0000.dat')
+ if not os.path.exists(otherFile) :
+ print "Can not find other0000.dat in unencrypted book"
+ sys.exit(-1)
+
+ metaFile = os.path.join(bookDir,'metadata0000.dat')
+ if not os.path.exists(metaFile) :
+ print "Can not find metadata0000.dat in unencrypted book"
+ sys.exit(-1)
+
+ xmlDir = os.path.join(bookDir,'xml')
+ if not os.path.exists(xmlDir):
+ os.makedirs(xmlDir)
+
+
+ print 'Processing ... '
+
+ print ' ', 'metadata0000.dat'
+ fname = os.path.join(bookDir,'metadata0000.dat')
+ xname = os.path.join(xmlDir, 'metadata.txt')
+ metastr = decode_meta.getMetaData(fname)
+ file(xname, 'wb').write(metastr)
+
+ print ' ', 'other0000.dat'
+ fname = os.path.join(bookDir,'other0000.dat')
+ xname = os.path.join(xmlDir, 'stylesheet.xml')
+ xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+ file(xname, 'wb').write(xmlstr)
+
+ filenames = os.listdir(pageDir)
+ filenames = sorted(filenames)
+
+ for filename in filenames:
+ print ' ', filename
+ fname = os.path.join(pageDir,filename)
+ xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
+ xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+ file(xname, 'wb').write(xmlstr)
+
+ filenames = os.listdir(glyphsDir)
+ filenames = sorted(filenames)
+
+ for filename in filenames:
+ print ' ', filename
+ fname = os.path.join(glyphsDir,filename)
+ xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
+ xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+ file(xname, 'wb').write(xmlstr)
+
+
+ print 'Processing Complete'
+
+ return 0
+
if __name__ == '__main__':
    # Script entry point: main('') means "fall back to sys.argv" inside main.
    sys.exit(main(''))
diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt
new file mode 100644
index 0000000..4a79d20
--- /dev/null
+++ b/Topaz_Tools/lib/readme.txt
@@ -0,0 +1,75 @@
+This is experimental and it will probably not work for you but...
+
+ALSO: Please do not use any of this to steal. Theft is wrong.
+ This is meant to allow conversion of Topaz books for other book readers you own
+
+Here are the steps:
+
+1. Unzip the topazscripts.zip file to get the full set of python scripts.
+The files you should have after unzipping are:
+
+cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
+decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
+convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
+flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
+stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
+genxml.py - main program to convert everything to xml
+genhtml.py - main program to generate "book.html"
+gensvg.py       - (author: clarknova) main program to create an svg graphic of each page
+
+Please note, gensvg.py, genhtml.py, and genxml.py import and use
+decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py
+so please keep all of these python scripts together in the same place.
+
+
+
+2. Remove the DRM from the Topaz book and build a directory
+of its contents as files
+
+All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else
+would be possible
+
+ cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE
+
+This should create a directory called "TARGETDIR" in your current directory.
+It should have the following files in it:
+
+metadata0000.dat - metadata info
+other0000.dat - information used to create a style sheet
+dict0000.dat - dictionary of words used to build page descriptions
+page - directory filled with page*.dat files
+glyphs - directory filled with glyphs*.dat files
+
+
+
+3. Convert the files in "TARGETDIR" to their xml descriptions
+which can be found in TARGETDIR/xml/ upon completion.
+
+ genxml.py TARGETDIR
+
+
+
+4. Create book.html which can be found in "TARGETDIR" after
+completion. This html conversion cannot fully capture
+all of the layouts actually used in the book and needs to
+be edited to include special font handling, such as bold
+or italics, that cannot be determined from the ocrText
+information or the style information. If you want to
+see things exactly as they were, see step 5 below.
+
+ genhtml.py TARGETDIR
+
+
+
+5. Create an svg description of each page which can
+be found in TARGETDIR/svg/ upon completion.
+
+All thanks go to CLARKNOVA for this program. This program is
+needed to actually see the true image of each page so that hand
+editing of the html created by step 4 can be done.
+
+Or use the resulting svg files to read each page of the book
+exactly as it has been laid out originally.
+
+ gensvg.py TARGETDIR
+
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py
new file mode 100644
index 0000000..cf02984
--- /dev/null
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -0,0 +1,221 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+ def __init__(self, flatxml):
+ self.flatdoc = flatxml.split('\n')
+
    # xml source tag -> html tag / css selector it is emitted as
    stags = {
        'paragraph' : 'p',
        'graphic' : '.graphic'
    }

    # style attribute -> (css property prefix, numeric divisor)
    # NOTE(review): the divisors (135, 190) presumably scale raw Topaz layout
    # units into css-friendly values - TODO confirm against process(), which
    # consumes this table (not visible in this chunk).
    attr_val_map = {
        'hang' : ('text-indent: ', 135),
        'indent' : ('text-indent: ', 135),
        'line-space' : ('line-height: ', 190),
        'margin-bottom' : ('margin-bottom: ', 135),
        'margin-left' : ('margin-left: ', 135),
        'margin-right' : ('margin-right: ', 135),
        'margin-top' : ('margin-top: ', 135),
        'space-after' : ('padding-bottom: ', 135),
    }

    # boolean/flag style attribute -> fixed css declaration string
    attr_str_map = {
        'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
        'align-left' : 'text-align: left;',
        'align-right' : 'text-align: right;',
        'align-justify' : 'text-align: justify;',
        'display-inline' : 'display: inline;',
        'pos-left' : 'text-align: left;',
        'pos-right' : 'text-align: right;',
        'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
    }
+
+
+ # find tag if within pos to end inclusive
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ docList = self.flatdoc
+ cnt = len(docList)
+ if end == -1 :
+ end = cnt
+ else:
+ end = min(cnt,end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = docList[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=')
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+
+ # return list of start positions for the tagpath
+ def posinDoc(self, tagpath):
+ startpos = []
+ pos = 0
+ res = ""
+ while res != None :
+ (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+ if res != None :
+ startpos.append(foundpos)
+ pos = foundpos + 1
+ return startpos
+
+
+ def process(self):
+
+ csspage = ''
+
+ # generate a list of each