diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py new file mode 100644 index 0000000..9cd32de --- /dev/null +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -0,0 +1,865 @@ +#! /usr/bin/python + +""" + +Comprehensive Mazama Book DRM with Topaz Cryptography V2.0 + +-----BEGIN PUBLIC KEY----- +MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6 +M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC +B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx +y2/pHuYme7U1TsgSjwIDAQAB +-----END PUBLIC KEY----- + +""" + +from __future__ import with_statement + +import csv +import sys +import os +import getopt +import zlib +from struct import pack +from struct import unpack +from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \ + create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \ + string_at, Structure, c_void_p, cast +import _winreg as winreg +import Tkinter +import Tkconstants +import tkMessageBox +import traceback +import hashlib + +MAX_PATH = 255 + +kernel32 = windll.kernel32 +advapi32 = windll.advapi32 +crypt32 = windll.crypt32 + +global kindleDatabase +global bookFile +global bookPayloadOffset +global bookHeaderRecords +global bookMetadata +global bookKey +global command + +# +# Various character maps used to decrypt books. 
# Probably supposed to act as obfuscation
#

charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"

#
# Exceptions for all the problems that might happen during the script
#

class CMBDTCError(Exception):
    """Problem with a single candidate; decryptDkeyRecords catches it and moves on."""
    pass

class CMBDTCFatal(Exception):
    """Problem that aborts the current operation (bad file, bad header, ...)."""
    pass

#
# Stolen stuff: ctypes wrappers around the Win32 calls the script needs.
# Each wrapper is a factory that configures the foreign function once
# (argtypes/restype), returns the real callable, and is then immediately
# replaced by that callable under the same name.
#

class DataBlob(Structure):
    # cbData: number of bytes in the buffer, pbData: pointer to the buffer
    _fields_ = [('cbData', c_uint),
                ('pbData', c_void_p)]
DataBlob_p = POINTER(DataBlob)

def GetSystemDirectory():
    GetSystemDirectoryW = kernel32.GetSystemDirectoryW
    GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
    GetSystemDirectoryW.restype = c_uint
    def GetSystemDirectory():
        # Returns the contents of the buffer filled by GetSystemDirectoryW.
        # The return value of the call itself is not checked.
        buffer = create_unicode_buffer(MAX_PATH + 1)
        GetSystemDirectoryW(buffer, len(buffer))
        return buffer.value
    return GetSystemDirectory
GetSystemDirectory = GetSystemDirectory()


def GetVolumeSerialNumber():
    GetVolumeInformationW = kernel32.GetVolumeInformationW
    GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
                                      POINTER(c_uint), POINTER(c_uint),
                                      POINTER(c_uint), c_wchar_p, c_uint]
    GetVolumeInformationW.restype = c_uint
    def GetVolumeSerialNumber(path):
        # Only the serial number output is requested; every other output
        # argument is passed as None/0. A failed call leaves vsn at 0.
        vsn = c_uint(0)
        GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
        return vsn.value
    return GetVolumeSerialNumber
GetVolumeSerialNumber = GetVolumeSerialNumber()


def GetUserName():
    GetUserNameW = advapi32.GetUserNameW
    GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
    GetUserNameW.restype = c_uint
    def GetUserName():
        buffer = create_unicode_buffer(32)
        size = c_uint(len(buffer))
        # Double the buffer until the call reports success (non-zero).
        while not GetUserNameW(buffer, byref(size)):
            buffer = create_unicode_buffer(len(buffer) * 2)
            size.value = len(buffer)
        # Keep every other byte of the UTF-16-LE encoding, i.e. the low
        # byte of each 16-bit code unit.
        return buffer.value.encode('utf-16-le')[::2]
    return GetUserName
GetUserName = GetUserName()


def CryptUnprotectData():
    _CryptUnprotectData = crypt32.CryptUnprotectData
    _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
                                   c_void_p, c_void_p, c_uint, DataBlob_p]
    _CryptUnprotectData.restype = c_uint
    def CryptUnprotectData(indata, entropy):
        # Wrap both byte strings in DataBlob structures. The string
        # buffers must stay referenced while the call runs.
        indatab = create_string_buffer(indata)
        indata = DataBlob(len(indata), cast(indatab, c_void_p))
        entropyb = create_string_buffer(entropy)
        entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
        outdata = DataBlob()
        if not _CryptUnprotectData(byref(indata), None, byref(entropy),
                                   None, None, 0, byref(outdata)):
            raise CMBDTCFatal("Failed to Unprotect Data")
        # NOTE(review): outdata.pbData is never released here; the Win32
        # documentation asks the caller to LocalFree it - confirm whether
        # that matters for this short-lived script.
        return string_at(outdata.pbData, outdata.cbData)
    return CryptUnprotectData
CryptUnprotectData = CryptUnprotectData()

#
# Returns the MD5 digest of "message"
#

def MD5(message):
    ctx = hashlib.md5()
    ctx.update(message)
    return ctx.digest()

#
# Returns the SHA1 digest of "message"
#

def SHA1(message):
    ctx = hashlib.sha1()
    ctx.update(message)
    return ctx.digest()

#
# Open the book file at path
#

def openBook(path):
    """Return the book at "path" opened for binary reading.

    Raises CMBDTCFatal when the file cannot be opened.
    """
    try:
        return open(path,'rb')
    except (IOError, OSError):
        # Only I/O failures are translated; a bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        raise CMBDTCFatal("Could not open book file: " + path)

#
# Encode the bytes in data with the characters in map
#

def encode(data, map):
    """Return two characters of "map" for every byte of "data".

    The first character is indexed by the byte with its top bit flipped,
    integer-divided by len(map); the second by the byte modulo len(map).
    """
    size = len(map)
    pieces = []
    for char in data:
        value = ord(char)
        pieces.append(map[(value ^ 0x80) // size])
        pieces.append(map[value % size])
    return "".join(pieces)

#
# Hash the bytes in data and then encode the digest with the characters in map
#

def encodeHash(data,map):
    return encode(MD5(data),map)

#
# Decode the string in data with the characters in map.
# Returns the decoded bytes
#

def decode(data,map):
    """Turn pairs of "map" characters in "data" back into bytes.

    Each pair (high, low) is looked up in "map"; the byte is
    ((high * 0x40) ^ 0x80) & 0xFF plus low. The 0x40 factor is fixed, so
    this only inverts encode() for a 64-character map such as charMap2.
    """
    pieces = []
    for i in range(0, len(data), 2):
        high = map.find(data[i])
        low = map.find(data[i+1])
        value = (((high * 0x40) ^ 0x80) & 0xFF) + low
        pieces.append(pack("B", value))
    return "".join(pieces)

#
# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
#

def openKindleInfo():
    """Return kindle.info opened for reading.

    The directory comes from the 'Local AppData' value of the current
    user's "Shell Folders" registry key.
    """
    regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
    try:
        path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
    finally:
        # release the registry handle as soon as the value has been read
        winreg.CloseKey(regkey)
    return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')

#
# Parse the Kindle.info file and return the records as a dictionary of key-values
#

def parseKindleInfo():
    """Return {hashedKey: encodedValue} for every record of kindle.info.

    Records are separated by '{' and split on ':'. An item without a ':'
    raises IndexError, as before.
    """
    DB = {}
    infoReader = openKindleInfo()
    try:
        # skip the first character of the file before splitting on '{'
        infoReader.read(1)
        data = infoReader.read()
    finally:
        infoReader.close()

    for item in data.split('{'):
        splito = item.split(':')
        DB[splito[0]] = splito[1]
    return DB

#
# Find if the original string for a hashed/encoded string is known. If so return the original string otherwise return an empty string. (Totally not optimal)
#

def findNameForHash(hash):
    """Return the known record name whose encoded hash equals "hash", or ""."""
    names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
    for name in names:
        if hash == encodeHash(name, charMap2):
            return name
    # No match. The previous code returned the loop variable here, which
    # reported every unknown hash as the last name in the list.
    return ""

#
# Print all the records from the kindle.info file (option -i)
#

def printKindleInfo():
    for record in kindleDatabase:
        name = findNameForHash(record)
        if name != "" :
            print (name)
            print ("--------------------------\n")
        else :
            print ("Unknown Record")
        print(getKindleInfoValueForHash(record))
        print("\n")

#
# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded).
# Return the decoded and decrypted record
#

def getKindleInfoValueForHash(hashedKey):
    # stored values are charMap2-encoded; decode first, then unprotect
    # with an empty entropy string
    protected = decode(kindleDatabase[hashedKey], charMap2)
    return CryptUnprotectData(protected, "")

#
# Same lookup, but starting from the plaintext record name in "key":
# the name is hashed and encoded with charMap2 to obtain the stored key
#

def getKindleInfoValueForKey(key):
    hashedKey = encodeHash(key, charMap2)
    return getKindleInfoValueForHash(hashedKey)

#
# Read one 7 bit encoded number from the current position of the book file.
# A leading 0xFF byte marks a negative number. Bytes with the high bit set
# are followed by more bytes; each byte contributes its low 7 bits, most
# significant group first.
#

def bookReadEncodedNumber():
    current = ord(bookFile.read(1))

    isNegative = (current == 0xFF)
    if isNegative:
        current = ord(bookFile.read(1))

    value = current
    if current >= 0x80:
        value = current & 0x7F
        while current >= 0x80:
            current = ord(bookFile.read(1))
            value = (value << 7) + (current & 0x7F)

    if isNegative:
        return -value
    return value

#
# Build the 7 bit encoding of a number: most significant group first, every
# byte but the last has its high bit set. A negative number is stored as
# (-number + 1) behind a leading 0xFF marker.
#

def encodeNumber(number):
    isNegative = number < 0
    if isNegative:
        number = -number + 1

    # collect the groups least significant first; only the very first one
    # (the final byte of the output) goes without the continuation bit
    groups = [number & 0x7F]
    number = number >> 7
    while number != 0:
        groups.append((number & 0x7F) + 0x80)
        number = number >> 7

    if isNegative:
        groups.append(0xFF)

    groups.reverse()
    return "".join([chr(group) for group in groups])

#
# Read a string from the book file: an encoded length followed by that many bytes
#

def bookReadString():
    stringLength = bookReadEncodedNumber()
    raw = bookFile.read(stringLength)
    return unpack("%ds" % stringLength, raw)[0]

#
# Prefix "data" with its encoded length
#

def lengthPrefixString(data):
    prefix = encodeNumber(len(data))
    return prefix + data


#
# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
#

def bookReadHeaderRecordData():
    # a count, then that many triples of encoded numbers, read in order
    count = bookReadEncodedNumber()
    return [[bookReadEncodedNumber() for _field in range(3)]
            for _entry in range(count)]

#
# Read one header record at the current book file position: a 0x63 marker
# byte, the record tag, then the record data. Returns [tag, data] where data
# is [[offset,decompressedLength,compressedLength],...]
#

def parseTopazHeaderRecord():
    marker = ord(bookFile.read(1))
    if marker != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")

    tag = bookReadString()
    return [tag, bookReadHeaderRecordData()]

#
# Parse the header of a Topaz file: check the 'TPZ0' magic, read every header
# record into the global bookHeaderRecords (keyed by tag), check the 0x64 end
# marker and remember where the payload starts in bookPayloadOffset
#

def parseTopazHeader():
    global bookHeaderRecords
    global bookPayloadOffset

    magic = unpack("4s", bookFile.read(4))[0]
    if magic != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")

    recordCount = bookReadEncodedNumber()
    bookHeaderRecords = {}

    for _unused in range(recordCount):
        tag, data = parseTopazHeaderRecord()
        # every header record is echoed to stdout as it is read
        print("%s %s" % (tag, data))
        bookHeaderRecords[tag] = data

    endMarker = ord(bookFile.read(1))
    if endMarker != 0x64:
        raise CMBDTCFatal("Parse Error : Invalid Header")

    bookPayloadOffset = bookFile.tell()

#
# Get a record in the book payload, given its name and index. If necessary the record is decrypted.
# The record is also decompressed when its header entry has a non-zero
# compressed length.
#

def getBookPayloadRecord(name, index):
    """Return the payload record "name"/"index", decrypted and decompressed.

    Raises CMBDTCFatal when the header has no such record, or when the tag
    or index stored in the payload does not match the request.
    """
    try:
        # header entry layout: [offset, decompressedLength, compressedLength]
        entry = bookHeaderRecords[name][index]
        recordOffset = entry[0]
    except (KeyError, IndexError, TypeError):
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")

    bookFile.seek(bookPayloadOffset + recordOffset)

    tag = bookReadString()
    if tag != name :
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")

    recordIndex = bookReadEncodedNumber()

    # a negative stored index flags an encrypted record; undo the offset
    encrypted = recordIndex < 0
    if encrypted:
        recordIndex = -recordIndex -1

    if recordIndex != index :
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")

    # a positive compressed length means that many bytes are stored and
    # must be inflated; otherwise the decompressed length is stored as is
    compressed = entry[2] > 0
    if compressed:
        record = bookFile.read(entry[2])
    else:
        record = bookFile.read(entry[1])

    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record,ctx)

    if compressed:
        record = zlib.decompress(record)

    return record

#
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
#

def extractBookPayloadRecord(name, index, filename):
    """Print the record "name"/"index", or write it to "filename" if not "".

    A record that cannot be located is reported on stdout and nothing else
    happens. Raises CMBDTCFatal when the destination file cannot be written.
    """
    try:
        record = getBookPayloadRecord(name,index)
    except CMBDTCFatal:
        print("Could not find record")
        # Stop here: there is no record to output. Falling through used
        # to end in a NameError on the unbound "record".
        return

    if filename == "":
        print(record)
        return

    try:
        output = open(filename,"wb")
        try:
            output.write(record)
        finally:
            # close the handle even when the write fails
            output.close()
    except (IOError, OSError):
        raise CMBDTCFatal("Could not write to destination file")

#
# return next record [key,value] from the book metadata from the current book position
#

def readMetadataRecord():
    return [bookReadString(),bookReadString()]

#
# Parse the metadata record from the book payload and return a list of
[key,values] +# + +def parseMetadata(): + global bookHeaderRecords + global bookPayloadAddress + global bookMetadata + bookMetadata = {} + bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0]) + tag = bookReadString() + if tag != "metadata" : + raise CMBDTCFatal("Parse Error : Record Names Don't Match") + + flags = ord(bookFile.read(1)) + nbRecords = ord(bookFile.read(1)) + + for i in range (0,nbRecords) : + record =readMetadataRecord() + bookMetadata[record[0]] = record[1] + +# +# Returns two bit at offset from a bit field +# + +def getTwoBitsFromBitField(bitField,offset): + byteNumber = offset // 4 + bitPosition = 6 - 2*(offset % 4) + + return ord(bitField[byteNumber]) >> bitPosition & 3 + +# +# Returns the six bits at offset from a bit field +# + +def getSixBitsFromBitField(bitField,offset): + offset *= 3 + value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2) + return value + +# +# 8 bits to six bits encoding from hash to generate PID string +# + +def encodePID(hash): + global charMap3 + PID = "" + for position in range (0,8): + PID += charMap3[getSixBitsFromBitField(hash,position)] + return PID + +# +# Context initialisation for the Topaz Crypto +# + +def topazCryptoInit(key): + ctx1 = 0x0CAFFE19E + + for keyChar in key: + keyByte = ord(keyChar) + ctx2 = ctx1 + ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF ) + return [ctx1,ctx2] + +# +# decrypt data with the context prepared by topazCryptoInit() +# + +def topazCryptoDecrypt(data, ctx): + ctx1 = ctx[0] + ctx2 = ctx[1] + + plainText = "" + + for dataChar in data: + dataByte = ord(dataChar) + m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF + ctx2 = ctx1 + ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF) + plainText += chr(m) + + return plainText + +# +# Decrypt a payload record with the PID +# + +def 
decryptRecord(data,PID): + ctx = topazCryptoInit(PID) + return topazCryptoDecrypt(data, ctx) + +# +# Try to decrypt a dkey record (contains the book PID) +# + +def decryptDkeyRecord(data,PID): + record = decryptRecord(data,PID) + fields = unpack("3sB8sB8s3s",record) + + if fields[0] != "PID" or fields[5] != "pid" : + raise CMBDTCError("Didn't find PID magic numbers in record") + elif fields[1] != 8 or fields[3] != 8 : + raise CMBDTCError("Record didn't contain correct length fields") + elif fields[2] != PID : + raise CMBDTCError("Record didn't contain PID") + + return fields[4] + +# +# Decrypt all the book's dkey records (contain the book PID) +# + +def decryptDkeyRecords(data,PID): + nbKeyRecords = ord(data[0]) + records = [] + data = data[1:] + for i in range (0,nbKeyRecords): + length = ord(data[0]) + try: + key = decryptDkeyRecord(data[1:length+1],PID) + records.append(key) + except CMBDTCError: + pass + data = data[1+length:] + + return records + +# +# Encryption table used to generate the device PID +# + +def generatePidEncryptionTable() : + table = [] + for counter1 in range (0,0x100): + value = counter1 + for counter2 in range (0,8): + if (value & 1 == 0) : + value = value >> 1 + else : + value = value >> 1 + value = value ^ 0xEDB88320 + table.append(value) + return table + +# +# Seed value used to generate the device PID +# + +def generatePidSeed(table,dsn) : + value = 0 + for counter in range (0,4) : + index = (ord(dsn[counter]) ^ value) &0xFF + value = (value >> 8) ^ table[index] + return value + +# +# Generate the device PID +# + +def generateDevicePID(table,dsn,nbRoll): + seed = generatePidSeed(table,dsn) + pidAscii = "" + pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] + index = 0 + + for counter in range (0,nbRoll): + pid[index] = pid[index] ^ ord(dsn[counter]) + index = (index+1) %8 + + for counter in range (0,8): + index = ((((pid[counter] >>5) & 3) ^ 
pid[counter]) & 0x1f) + (pid[counter] >> 7) + pidAscii += charMap4[index] + return pidAscii + +# +# Create decrypted book payload +# + +def createDecryptedPayload(payload): + for headerRecord in bookHeaderRecords: + name = headerRecord + if name != "dkey" : + ext = '.dat' + if name == 'img' : ext = '.jpg' + for index in range (0,len(bookHeaderRecords[name])) : + fnum = "%04d" % index + fname = name + fnum + ext + destdir = payload + if name == 'img': + destdir = os.path.join(payload,'img') + if name == 'page': + destdir = os.path.join(payload,'page') + if name == 'glyphs': + destdir = os.path.join(payload,'glyphs') + outputFile = os.path.join(destdir,fname) + file(outputFile, 'wb').write(getBookPayloadRecord(name, index)) + + +# Create decrypted book +# + +def createDecryptedBook(outdir): + if not os.path.exists(outdir): + os.makedirs(outdir) + + destdir = os.path.join(outdir,'img') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'page') + if not os.path.exists(destdir): + os.makedirs(destdir) + + destdir = os.path.join(outdir,'glyphs') + if not os.path.exists(destdir): + os.makedirs(destdir) + + createDecryptedPayload(outdir) + + +# +# Set the command to execute by the programm according to cmdLine parameters +# + +def setCommand(name) : + global command + if command != "" : + raise CMBDTCFatal("Invalid command line parameters") + else : + command = name + +# +# Program usage +# + +def usage(): + print("\nUsage:") + print("\ncmbtc_dump.py [options] bookFileName\n") + print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)") + print("-d Dumps the unencrypted book as files to outdir") + print("-o Output directory to save book files to") + print("-v Verbose (can be used several times)") + print("-i Prints kindle.info database") + +# +# Main +# + +def main(argv=sys.argv): + global kindleDatabase + global bookMetadata + global bookKey + global bookFile + global command + + 
progname = os.path.basename(argv[0]) + + verbose = 0 + recordName = "" + recordIndex = 0 + outdir = "" + PIDs = [] + kindleDatabase = None + command = "" + + + try: + opts, args = getopt.getopt(sys.argv[1:], "vi:o:p:d") + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o == "-v": + verbose+=1 + if o == "-i": + setCommand("printInfo") + if o =="-o": + if a == None : + raise CMBDTCFatal("Invalid parameter for -o") + outdir = a + if o =="-p": + PIDs.append(a) + if o =="-d": + setCommand("doit") + + if command == "" : + raise CMBDTCFatal("No action supplied on command line") + + # + # Read the encrypted database + # + + try: + kindleDatabase = parseKindleInfo() + except Exception as message: + if verbose>0: + print(message) + + if kindleDatabase != None : + if command == "printInfo" : + printKindleInfo() + + # + # Compute the DSN + # + + # Get the Mazama Random number + MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber") + + # Get the HDD serial + encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1) + + # Get the current user name + encodedUsername = encodeHash(GetUserName(),charMap1) + + # concat, hash and encode + DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1) + + if verbose >1: + print("DSN: " + DSN) + + # + # Compute the device PID + # + + table = generatePidEncryptionTable() + devicePID = generateDevicePID(table,DSN,4) + PIDs.append(devicePID) + + if verbose > 0: + print("Device PID: " + devicePID) + + # + # Open book and parse metadata + # + + if len(args) == 1: + + bookFile = openBook(args[0]) + parseTopazHeader() + parseMetadata() + + # + # Compute book PID + # + + # Get the account token + + if kindleDatabase != None: 
+ kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens") + + if verbose >1: + print("Account Token: " + kindleAccountToken) + + keysRecord = bookMetadata["keys"] + keysRecordRecord = bookMetadata[keysRecord] + + pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord) + + bookPID = encodePID(pidHash) + PIDs.append(bookPID) + + if verbose > 0: + print ("Book PID: " + bookPID ) + + # + # Decrypt book key + # + + dkey = getBookPayloadRecord('dkey', 0) + + bookKeys = [] + for PID in PIDs : + bookKeys+=decryptDkeyRecords(dkey,PID) + + if len(bookKeys) == 0 : + if verbose > 0 : + print ("Book key could not be found. Maybe this book is not registered with this device.") + else : + bookKey = bookKeys[0] + if verbose > 0: + print("Book key: " + bookKey.encode('hex')) + + + + if command == "printRecord" : + extractBookPayloadRecord(recordName,int(recordIndex),outputFile) + if outputFile != "" and verbose>0 : + print("Wrote record to file: "+outputFile) + elif command == "doit" : + if outdir != "" : + createDecryptedBook(outdir) + if verbose >0 : + print ("Decrypted book saved. Don't pirate!") + elif verbose > 0: + print("Output directory name was not supplied.") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py new file mode 100644 index 0000000..86d08d4 --- /dev/null +++ b/Topaz_Tools/lib/convert2xml.py @@ -0,0 +1,821 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +# Get a 7 bit encoded number from string. 
The most +# significant byte comes first and has the high bit (8th) set + +def readEncodedNumber(file): + flag = False + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data == 0xFF: + flag = True + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + + if data >= 0x80: + datax = (data & 0x7F) + while data >= 0x80 : + c = file.read(1) + if (len(c) == 0): + return None + data = ord(c) + datax = (datax <<7) + (data & 0x7F) + data = datax + + if flag: + data = -data + return data + + +# returns a binary string that encodes a number into 7 bits +# most significant byte first which has the high bit set + +def encodeNumber(number): + result = "" + negative = False + flag = 0 + + if number < 0 : + number = -number + 1 + negative = True + + while True: + byte = number & 0x7F + number = number >> 7 + byte += flag + result += chr(byte) + flag = 0x80 + if number == 0 : break + + if negative: + result += chr(0xFF) + + return result[::-1] + + + +# create / read a length prefixed string from the file + +def lengthPrefixString(data): + return encodeNumber(len(data))+data + +def readString(file): + stringLength = readEncodedNumber(file) + if (stringLength == None): + return "" + sv = file.read(stringLength) + if (len(sv) != stringLength): + return "" + return unpack(str(stringLength)+"s",sv)[0] + + +# convert a binary string generated by encodeNumber (7 bit encoded number) +# to the value you would find inside the page*.dat files to be processed + +def convert(i): + result = '' + val = encodeNumber(i) + for j in xrange(len(val)): + c = ord(val[j:j+1]) + result += '%02x' % c + return result + + + +# the complete string table used to store all book text content +# as well as the xml tokens and values that make sense out of it + +class Dictionary(object): + def __init__(self, dictFile): + self.filename = dictFile + self.size = 0 + self.fo = file(dictFile,'rb') + self.stable = [] + self.size = readEncodedNumber(self.fo) + for i in 
xrange(self.size): + self.stable.append(self.escapestr(readString(self.fo))) + self.pos = 0 + + def escapestr(self, str): + str = str.replace('&','&') + str = str.replace('<','<') + str = str.replace('>','>') + str = str.replace('=','=') + return str + + def lookup(self,val): + if ((val >= 0) and (val < self.size)) : + self.pos = val + return self.stable[self.pos] + else: + print "Error - %d outside of string table limits" % val + sys.exit(-1) + + def getSize(self): + return self.size + + def getPos(self): + return self.pos + + def dumpDict(self): + for i in xrange(self.size): + print "%d %s %s" % (i, convert(i), self.stable[i]) + return + +# parses the xml snippets that are represented by each page*.dat file. +# also parses the other0.dat file - the main stylesheet +# and information used to inject the xml snippets into page*.dat files + +class PageParser(object): + def __init__(self, filename, dict, debug, flat_xml): + self.fo = file(filename,'rb') + self.id = os.path.basename(filename).replace('.dat','') + self.dict = dict + self.debug = debug + self.flat_xml = flat_xml + self.tagpath = [] + self.doc = [] + self.snippetList = [] + + + # hash table used to enable the decoding process + # This has all been developed by trial and error so it may still have omissions or + # contain errors + # Format: + # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) + + token_tags = { + 'book' : (1, 'snippets', 1, 0), + 'version' : (1, 'snippets', 1, 0), + 'stylesheet' : (1, 'snippets', 1, 0), + 'links' : (0, 'number', 0, 1), + 'pages' : (0, 'number', 0, 1), + 'page' : (1, 'snippets', 1, 0), + 'group' : (1, 'snippets', 1, 0), + 'region' : (1, 'snippets', 1, 0), + 'reflow' : (1, 'number', 1, 0), + 'img' : (1, 'snippets', 1, 0), + 'paragraph' : (1, 'snippets', 1, 0), + 'extratokens' : (1, 'snippets', 1, 0), + 'style' : (1, 'snippets', 1, 0), + 'rule' : (1, 'snippets', 1, 0), + '_span' : (1, 'snippets', 1, 0), + 
'word_semantic': (1, 'snippets', 1, 1), + 'value' : (1, 'scalar_text', 0, 0), + 'h' : (1, 'scalar_number', 0, 0), + 'w' : (1, 'scalar_number', 0, 0), + 'firstWord' : (1, 'scalar_number', 0, 0), + 'lastWord' : (1, 'scalar_number', 0, 0), + 'x' : (1, 'number', 0, 0), + 'y' : (1, 'number', 0, 0), + 'links.page' : (1, 'number', 0, 0), + 'link_id' : (1, 'number', 0, 0), + 'glyph' : (0, 'number', 1, 1), + 'glyph.h' : (1, 'number', 0, 0), + 'glyph.w' : (1, 'number', 0, 0), + 'sh' : (1, 'number', 0, 0), + 'word' : (0, 'number', 1, 1), + 'src' : (1, 'scalar_number', 0, 0), + 'rel' : (1, 'number', 0, 0), + 'row' : (1, 'number', 0, 0), + 'startID' : (1, 'number', 0, 1), + 'startID.page' : (1, 'number', 0, 0), + 'glyphID' : (1, 'number', 0, 0), + 'rootID' : (1, 'number', 0, 0), + 'stemID' : (1, 'number', 0, 0), + 'margin-top' : (1, 'number', 0, 0), + 'stemPage' : (1, 'number', 0, 0), + 'dehyphen' : (1, 'number', 1, 1), + 'rootID' : (1, 'number', 0, 0), + 'paraCont' : (1, 'number', 1, 1), + 'paraStems' : (1, 'number', 1, 1), + 'wordStems' : (1, 'number', 1, 1), + 'original' : (0, 'number', 0, 1), + 'use' : (1, 'number', 0, 0), + 'vtx' : (1, 'number', 0, 1), + 'len' : (1, 'number', 0, 1), + 'dpi' : (1, 'number', 0, 0), + 'n' : (1, 'number', 0, 0), + 'id' : (1, 'number', 0, 0), + 'ref' : (1, 'number', 0, 0), + 'pnum' : (1, 'number', 0, 0), + 'pid' : (1, 'text', 0, 0), + 'info' : (0, 'number', 1, 0), + 'bl' : (1, 'raw', 0, 0), + 'firstGlyph' : (1, 'raw', 0, 0), + 'lastGlyph' : (1, 'raw', 0, 0), + 'ocrText' : (1, 'text', 0, 0), + 'title' : (1, 'text', 0, 0), + 'href' : (1, 'text', 0, 0), + '_parent_type' : (1, 'text', 0, 0), + 'attr' : (1, 'scalar_text', 0, 0), + 'justify' : (1, 'scalar_text', 0, 0), + 'align' : (1, 'scalar_text', 0, 0), + 'layout' : (1, 'scalar_text', 0, 0), + 'pageid' : (1, 'scalar_text', 0, 0), + 'pagelabel' : (1, 'scalar_text', 0, 0), + 'type' : (1, 'text', 0, 0), + 'class' : (1, 'scalar_text', 0, 0), + 'container' : (1, 'scalar_text', 0, 0), + '_after_class' : 
(1, 'scalar_text', 0, 0), + '_tag' : (1, 'scalar_text', 0, 0), + 'pos' : (1, 'scalar_text', 0, 0), + 'page_num' : (1, 'scalar_text', 0, 0), + 'page_type' : (1, 'scalar_text', 0, 0), + 'findlists' : (1, 'scalar_text', 0, 0), + 'FlowEdit_1_id' : (1, 'scalar_text', 0, 0), + 'FlowEdit_1_version' : (1, 'scalar_text', 0, 0), + 'Schema_id' : (1, 'scalar_text', 0, 0), + 'Schema_version' : (1, 'scalar_text', 0, 0), + 'Topaz_version' : (1, 'scalar_text', 0, 0), + 'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), + 'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), + 'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), + 'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), + 'chapterheaders' : (1, 'scalar_text', 0, 0), + 'creation_date' : (1, 'scalar_text', 0, 0), + 'header_footer' : (1, 'scalar_text', 0, 0), + 'init_from_ocr' : (1, 'scalar_text', 0, 0), + 'letter_insertion' : (1, 'scalar_text', 0, 0), + 'xmlinj_convert' : (1, 'scalar_text', 0, 0), + 'xmlinj_reflow' : (1, 'scalar_text', 0, 0), + 'xmlinj_transform' : (1, 'scalar_text', 0, 0), + } + + + # full tag path record keeping routines + def tag_push(self, token): + self.tagpath.append(token) + def tag_pop(self): + if len(self.tagpath) > 0 : + self.tagpath.pop() + def tagpath_len(self): + return len(self.tagpath) + def get_tagpath(self, i): + cnt = len(self.tagpath) + if i < cnt : result = self.tagpath[i] + for j in xrange(i+1, cnt) : + result += '.' 
+ self.tagpath[j] + return result + + + # list of absolute command byte values values that indicate + # various types of loop meachanisms typically used to generate vectors + + cmd_list = (0x76, 0x76) + + # peek at and return 1 byte that is ahead by i bytes + def peek(self, aheadi): + c = self.fo.read(aheadi) + if (len(c) == 0): + return None + self.fo.seek(-aheadi,1) + c = c[-1:] + return ord(c) + + + # get the next value from the file being processed + def getNext(self): + nbyte = self.peek(1); + if (nbyte == None): + return None + val = readEncodedNumber(self.fo) + return val + + + # format an arg by argtype + def formatArg(self, arg, argtype): + if (argtype == 'text') or (argtype == 'scalar_text') : + result = self.dict.lookup(arg) + elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') : + result = arg + elif (argtype == 'snippets') : + result = arg + else : + print "Error Unknown argtype %s" % argtype + sys.exit(-2) + return result + + + # process the next tag token, recursively handling subtags, + # arguments, and commands + def procToken(self, token): + + known_token = False + self.tag_push(token) + + if self.debug : print 'Processing: ', self.get_tagpath(0) + cnt = self.tagpath_len() + for j in xrange(cnt): + tkn = self.get_tagpath(j) + if tkn in self.token_tags : + num_args = self.token_tags[tkn][0] + argtype = self.token_tags[tkn][1] + subtags = self.token_tags[tkn][2] + splcase = self.token_tags[tkn][3] + ntags = -1 + known_token = True + break + + if known_token : + + # handle subtags if present + subtagres = [] + if (splcase == 1): + # this type of tag uses of escape marker 0x74 indicate subtag count + if self.peek(1) == 0x74: + skip = readEncodedNumber(self.fo) + subtags = 1 + num_args = 0 + + if (subtags == 1): + ntags = readEncodedNumber(self.fo) + if self.debug : print 'subtags: ' + token + ' has ' + str(ntags) + for j in xrange(ntags): + val = readEncodedNumber(self.fo) + 
subtagres.append(self.procToken(self.dict.lookup(val))) + + # arguments can be scalars or vectors of text or numbers + argres = [] + if num_args > 0 : + firstarg = self.peek(1) + if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'): + # single argument is a variable length vector of data + arg = readEncodedNumber(self.fo) + argres = self.decodeCMD(arg,argtype) + else : + # num_arg scalar arguments + for i in xrange(num_args): + argres.append(self.formatArg(readEncodedNumber(self.fo), argtype)) + + # build the return tag + result = [] + tkn = self.get_tagpath(0) + result.append(tkn) + result.append(subtagres) + result.append(argtype) + result.append(argres) + self.tag_pop() + return result + + # all tokens that need to be processed should be in the hash + # table if it may indicate a problem, either new token + # or an out of sync condition + else: + result = [] + if (self.debug): + print 'Unknown Token:', token + self.tag_pop() + return result + + + # special loop used to process code snippets + # it is NEVER used to format arguments. + # builds the snippetList + def doLoop72(self, argtype): + cnt = readEncodedNumber(self.fo) + if self.debug : + result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n' + result += 'of the document is indicated by snippet number sets at the\n' + result += 'end of each snippet. 
\n' + print result + for i in xrange(cnt): + if self.debug: print 'Snippet:',str(i) + snippet = [] + snippet.append(i) + val = readEncodedNumber(self.fo) + snippet.append(self.procToken(self.dict.lookup(val))) + self.snippetList.append(snippet) + return + + + # loop: pass though values unchanged + # DO NOT CHANGE - this has proven to be correct + def doLoop76Mode0(self, argtype, cnt): + result = [] + for i in xrange(cnt): + result.append(self.formatArg(readEncodedNumber(self.fo), argtype)) + return result + + + # loop generating values relative to the *negative* + # of the offset - don't ask why - it just is + # DO NOT CHANGE - this has proven to be correct + def doLoop76Mode1(self, argtype, cnt): + result = [] + offset = -readEncodedNumber(self.fo) + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + offset + result.append(self.formatArg(val, argtype)) + return result + + + # loop generating values with starting value and accumulation + # DO NOT CHANGE - this has proven to be the correct + def doLoop76Mode2(self, argtype, cnt): + result = [] + ptr = readEncodedNumber(self.fo) + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + ptr = ptr + readEncodedNumber(self.fo) + result.append(self.formatArg(ptr, argtype)) + return result + + + # loop generating values with starting value and accumulation + # **after** subtracting adjustment value from each + # DO NOT CHANGE - this has been proven to be correct + def doLoop76Mode3(self, argtype, cnt): + result = [] + adj = readEncodedNumber(self.fo) + ptr = readEncodedNumber(self.fo) + ptr = ptr - adj + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + ptr = ptr + readEncodedNumber(self.fo) - adj + result.append(self.formatArg(ptr,argtype)) + return result + + + # loop using runing sum of data values and starting value + # with accumulation to get new value + # Again, don't ask it took me forever to figure this out + # DO NOT CHANGE - this has been proven to be correct + def 
doLoop76Mode4(self, argtype, cnt): + result = [] + val = readEncodedNumber(self.fo) + runsum = val + ptr = val + result.append(self.formatArg(ptr, argtype)) + for i in xrange(cnt-1): + runsum += readEncodedNumber(self.fo) + ptr = ptr + runsum + result.append(self.formatArg(ptr,argtype)) + return result + + + # loop using and extra value as an adjustment + # and a running sum of the values after subtracting + # the adjustment, added to a ptr to get a new pointer + def doLoop76Mode5(self, argtype, cnt): + result = [] + adj = readEncodedNumber(self.fo) + ptr = 0 + runsum = 0 + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + runsum += (val - adj) + ptr = ptr +runsum + result.append(self.formatArg(ptr,argtype)) + return result + + + # FIXME: I have only 4 points to work this out with inside my book + # So may be wrong but it is correct for my 4 points + def doLoop76Mode6(self, argtype, cnt): + result = [] + oldval = 0 + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + ptr= (3 * oldval) + val + 1 + result.append(self.formatArg(ptr,argtype)) + oldval = val + return result + + + + # dispatches loop commands bytes with various modes + # The 0x76 style loops are used to build vectors + + # This was all derived by trial and error and + # new loop types may exist that are not handled here + # since they did not appear in the test cases + + def decodeCMD(self, cmd, argtype): + + # if (cmd == 0x72): + # self.doLoop72(argtype) + # result =[] + # return result + + if (cmd == 0x76): + # loop with cnt, and mode to control loop styles + cnt = readEncodedNumber(self.fo) + mode = readEncodedNumber(self.fo) + + if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' + + if (mode == 0x00): + return self.doLoop76Mode0(argtype, cnt) + + elif (mode == 0x01): + return self.doLoop76Mode1(argtype, cnt) + + elif (mode == 0x02): + return self.doLoop76Mode2(argtype, cnt) + + elif (mode == 0x03): + return self.doLoop76Mode3(argtype, cnt) + + elif (mode == 0x04): + 
return self.doLoop76Mode4(argtype, cnt) + + elif (mode == 0x05): + return self.doLoop76Mode5(argtype, cnt) + + elif (mode == 0x06): + return self.doLoop76Mode6(argtype, cnt) + + else: + + if self.debug : + # try to mark any unknown loop comands + # if they exist, unless they are used to process + # text or some other known list, we won't be able to prove them correct + print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode) + for i in xrange(cnt): + val = readEncodedNumber(self.fo) + print ' 0x%x' % val, + print ' ' + result = [] + return result + + if self.dbug: print "Unknown command", cmd + result = [] + return result + + # add full tag path to injected snippets + def updateName(self, tag, prefix): + name = tag[0] + subtagList = tag[1] + argtype = tag[2] + argList = tag[3] + nname = prefix + '.' + name + nsubtaglist = [] + for j in subtagList: + nsubtaglist.append(self.updateName(j,prefix)) + ntag = [] + ntag.append(nname) + ntag.append(nsubtaglist) + ntag.append(argtype) + ntag.append(argList) + return ntag + + + + # perform depth first injection of specified snippets into this one + def injectSnippets(self, snippet): + snipno, tag = snippet + name = tag[0] + subtagList = tag[1] + argtype = tag[2] + argList = tag[3] + nsubtagList = [] + if len(argList) > 0 : + for j in argList: + asnip = self.snippetList[j] + aso, atag = self.injectSnippets(asnip) + atag = self.updateName(atag, name) + nsubtagList.append(atag) + argtype='number' + argList=[] + if len(nsubtagList) > 0 : + subtagList.extend(nsubtagList) + tag = [] + tag.append(name) + tag.append(subtagList) + tag.append(argtype) + tag.append(argList) + snippet = [] + snippet.append(snipno) + snippet.append(tag) + return snippet + + + + # format the tag for output + def formatTag(self, node): + name = node[0] + subtagList = node[1] + argtype = node[2] + argList = node[3] + fullpathname = name.split('.') + nodename = fullpathname.pop() + ilvl = len(fullpathname) + indent = ' ' * (3 * ilvl) + result = indent + '<' 
+ nodename + '>' + if len(argList) > 0: + argres = '' + for j in argList: + if (argtype == 'text') or (argtype == 'scalar_text') : + argres += j + '|' + else : + argres += str(j) + ',' + argres = argres[0:-1] + if argtype == 'snippets' : + result += 'snippets:' + argres + else : + result += argres + if len(subtagList) > 0 : + result += '\n' + for j in subtagList: + if len(j) > 0 : + result += self.formatTag(j) + result += indent + '' + nodename + '>\n' + else: + result += '' + nodename + '>\n' + return result + + + # flatten tag + def flattenTag(self, node): + name = node[0] + subtagList = node[1] + argtype = node[2] + argList = node[3] + result = name + if (len(argList) > 0): + argres = '' + for j in argList: + if (argtype == 'text') or (argtype == 'scalar_text') : + argres += j + '|' + else : + argres += str(j) + '|' + argres = argres[0:-1] + if argtype == 'snippets' : + result += '.snippets=' + argres + else : + result += '=' + argres + result += '\n' + for j in subtagList: + if len(j) > 0 : + result += self.flattenTag(j) + return result + + + # reduce create xml output + def formatDoc(self, flat_xml): + result = '' + for j in self.doc : + if len(j) > 0: + if flat_xml: + result += self.flattenTag(j) + else: + result += self.formatTag(j) + if self.debug : print result + return result + + + + # main loop - parse the page.dat files + # to create structured document and snippets + + # FIXME: value at end of magic appears to be a subtags count + # but for what? 
For now, inject an 'info" tag as it is in + # every dictionary and seems close to what is meant + # The alternative is to special case the last _ "0x5f" to mean something + + def process(self): + + # peek at the first bytes to see what type of file it is + magic = self.fo.read(11) + if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'): + first_token = 'info' + elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'): + skip = self.fo.read(1) + first_token = 'info' + else : + # other0.dat file + first_token = None + self.fo.seek(-11,1) + + + # main loop to read and build the document tree + while True: + + if first_token != None : + # use "inserted" first token 'info' for page and glyph files + tag = self.procToken(first_token) + if len(tag) > 0 : + self.doc.append(tag) + first_token = None + + v = self.getNext() + if (v == None): + break + + if (v == 0x72): + self.doLoop72('number') + elif (v > 0) and (v < self.dict.getSize()) : + tag = self.procToken(self.dict.lookup(v)) + if len(tag) > 0 : + self.doc.append(tag) + else: + if self.debug: + print "Mina Loop: Unknown value: %x" % v + + + # now do snippet injection + if len(self.snippetList) > 0 : + if self.debug : print 'Injecting Snippets:' + snippet = self.injectSnippets(self.snippetList[0]) + snipno = snippet[0] + tag_add = snippet[1] + if self.debug : print self.formatTag(tag_add) + if len(tag_add) > 0: + self.doc.append(tag_add) + + # handle generation of xml output + xmlpage = self.formatDoc(self.flat_xml) + + return xmlpage + + + +def usage(): + print 'Usage: ' + print ' convert2xml.py dict0000.dat infile.dat ' + print ' ' + print ' Options:' + print ' -h print this usage help message ' + print ' -d turn on debug output to check for potential errors ' + print ' --flat-xml output the flattened xml page description only ' + print ' ' + print ' This program will attempt to convert a page*.dat file or ' + print ' glyphs*.dat file, using the dict0000.dat file, to its xml description. 
' + print ' ' + print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ' + print ' the *.dat files from a Topaz format e-book.' + +# +# Main +# + +def main(argv): + dictFile = "" + pageFile = "" + debug = False + flat_xml = False + printOutput = False + if len(argv) == 0: + printOutput = True + argv = sys.argv + else : + argv = argv.split() + + try: + opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"]) + + except getopt.GetoptError, err: + + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) + + for o, a in opts: + if o =="-d": + debug=True + if o =="-h": + usage() + sys.exit(0) + if o =="--flat-xml": + flat_xml = True + + dictFile, pageFile = args[0], args[1] + + # read in the string table dictionary + dict = Dictionary(dictFile) + + # create a page parser + pp = PageParser(pageFile, dict, debug, flat_xml) + + xmlpage = pp.process() + + if printOutput: + print xmlpage + return 0 + + return xmlpage + +if __name__ == '__main__': + sys.exit(main('')) diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py new file mode 100644 index 0000000..f038310 --- /dev/null +++ b/Topaz_Tools/lib/decode_meta.py @@ -0,0 +1,109 @@ +#! 
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack

#
# Get a 7 bit encoded number from string
#

def readEncodedNumber(file):
    """Read one 7-bit encoded number from a binary file-like object.

    Every byte carries 7 payload bits, most significant group first; a set
    high bit (0x80) means another byte follows.  A leading 0xFF byte marks
    the decoded value as negative.

    Returns the decoded integer, or None if the data ends before a complete
    number has been read.
    """
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)

    # 0xFF is a sign prefix, not payload: the real value starts at the
    # next byte.
    negative = (data == 0xFF)
    if negative:
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)

    if data >= 0x80:
        value = data & 0x7F
        # keep folding in 7-bit groups while the continuation bit is set
        while data >= 0x80:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            value = (value << 7) + (data & 0x7F)
        data = value

    if negative:
        data = -data
    return data

#
# Encode a number in 7 bit format
#

def encodeNumber(number):
    """Return *number* as a string in the 7-bit format described above.

    Groups are produced least significant first and reversed at the end, so
    the result is most-significant-first with the continuation bit (0x80)
    set on every byte except the last.  Negative numbers get a leading 0xFF.

    NOTE(review): the negative branch encodes ``-number + 1``, while
    readEncodedNumber simply negates the decoded magnitude, so e.g.
    encodeNumber(-5) decodes as -6.  Kept as-is because it is unclear which
    side matches the file format -- confirm before relying on negative
    round-trips.
    """
    negative = False
    if number < 0:
        number = -number + 1
        negative = True

    encoded = []
    flag = 0
    while True:
        encoded.append(chr((number & 0x7F) + flag))
        number = number >> 7
        # every byte emitted after the first (lowest) group is a
        # continuation byte once the sequence is reversed
        flag = 0x80
        if number == 0:
            break

    if negative:
        encoded.append(chr(0xFF))

    return ''.join(reversed(encoded))

#
# Length prefixed strings
#

def lengthPrefixString(data):
    """Return *data* prefixed with its length as a 7-bit encoded number."""
    return encodeNumber(len(data)) + data

def readString(file):
    """Read one length prefixed string from a binary file-like object.

    Returns None when no length can be read (end of data), the empty string
    when fewer bytes than announced are available, and otherwise the string
    payload.
    """
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return None
    sv = file.read(stringLength)
    if len(sv) != stringLength:
        return ""
    return unpack(str(stringLength) + "s", sv)[0]


def _readEntryCount(fo):
    """Read the leading entry count of a meta file; 0 if the file is empty."""
    size = readEncodedNumber(fo)
    if size is None:
        return 0
    return size


def getMetaArray(metaFile):
    """Parse the meta file into a Python dictionary (associative array).

    The file holds an encoded entry count followed by that many pairs of
    length prefixed strings (key, then value).  An empty file yields an
    empty dictionary.  The file is closed even if parsing fails.
    """
    result = {}
    with open(metaFile, 'rb') as fo:
        for _ in range(_readEntryCount(fo)):
            key = readString(fo)
            result[key] = readString(fo)
    return result



def getMetaData(metaFile):
    """Parse the meta file into text, one ``key|value`` line per entry.

    Uses the same file layout as getMetaArray.  An empty file yields an
    empty string.  The file is closed even if parsing fails.
    """
    lines = []
    with open(metaFile, 'rb') as fo:
        for _ in range(_readEntryCount(fo)):
            key = readString(fo)
            value = readString(fo)
            lines.append(key + '|' + value + '\n')
    return ''.join(lines)
a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py new file mode 100644 index 0000000..1a800e8 --- /dev/null +++ b/Topaz_Tools/lib/flatxml2html.py @@ -0,0 +1,299 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +class DocParser(object): + def __init__(self, flatxml, fileid): + self.id = os.path.basename(fileid).replace('.dat','') + self.flatdoc = flatxml.split('\n') + self.ocrtext = [] + self.link_id = [] + self.link_title = [] + self.link_page = [] + self.dehyphen_rootid = [] + self.paracont_stemid = [] + self.parastems_stemid = [] + + + + # find tag if within pos to end inclusive + def findinDoc(self, tagpath, pos, end) : + result = None + docList = self.flatdoc + cnt = len(docList) + if end == -1 : + end = cnt + else: + end = min(cnt,end) + foundat = -1 + for j in xrange(pos, end): + item = docList[j] + if item.find('=') >= 0: + (name, argres) = item.split('=') + else : + name = item + argres = '' + if name.endswith(tagpath) : + result = argres + foundat = j + break + return foundat, result + + + # return list of start positions for the tagpath + def posinDoc(self, tagpath): + startpos = [] + pos = 0 + res = "" + while res != None : + (foundpos, res) = self.findinDoc(tagpath, pos, -1) + if res != None : + startpos.append(foundpos) + pos = foundpos + 1 + return startpos + + + # get a description of the paragraph + def getParaDescription(self, start, end): + # normal paragraph + (pos, pclass) = self.findinDoc('paragraph.class',start,end) + + # class names are an issue given topaz starts them with numerals (not allowed) + # use a mix of cases, (which cause some browsers problems), and actually + # attach numbers after "reclustered*" to the end to deal with reflow issues + # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered + pclass = 
pclass.lower() + pclass = 'cl_' + pclass + p = pclass.find('reclustered') + if p > 0 : pclass = pclass[0:p+11] + + (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) + (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) + if (sfirst != None) and (slast != None) : + return pclass, int(sfirst), int(slast) + + # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well + # so walk through this region keeping track of the first firstword, and the last lastWord + # on any items that have it + (pos, sfirst) = self.findinDoc('firstWord',start, end) + first = int(sfirst) + last = -1 + for i in xrange(pos+1,end): + (pos, slast) = self.findinDoc('lastWord',i,i+1) + if slast != None: + last = int(slast) + return pclass, first, last + + + def buildParagraph(self, cname, first, last, type, regtype) : + parares = '' + sep ='' + br_lb = False + if (regtype == 'fixed') or (regtype == 'chapterheading') : + br_lb = True + handle_links = False + if len(self.link_id) > 0: + handle_links = True + if (type == 'full') or (type == 'begin') : + parares += '
'
+ if (type == 'end'):
+ parares += ' '
+ for j in xrange(first, last) :
+ word = self.ocrtext[j]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[j]
+ if (link > 0):
+ title = self.link_title[link-1]
+ if title == "": title='_link_'
+ ptarget = self.link_page[link-1] - 1
+ linkhtml = '' % ptarget
+ linkhtml += title + ''
+ pos = parares.rfind(title)
+ if pos >= 0:
+ parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
+ else :
+ parares += linkhtml
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ if (j-1) in self.dehyphen_rootid :
+ word = ''
+ sep = ''
+ elif handle_links :
+ word = ''
+ sep = ''
+ elif br_lb :
+ word = '
\n'
+ sep = ''
+ else :
+ word = '\n'
+ sep = ''
+
+ if j in self.dehyphen_rootid :
+ word = word[0:-1]
+ sep = ''
+
+ parares += word + sep
+
+ if len(sep) > 0 : parares = parares[0:-1]
+ if (type == 'full') or (type == 'end') :
+ parares += '