From 93f02c625a480e7d96a051820d71e565b7c729aa Mon Sep 17 00:00:00 2001 From: Apprentice Alf Date: Fri, 28 Oct 2011 07:24:15 +0100 Subject: [PATCH] tools v4.8 --- .../K4MobiDeDRM_plugin/__init__.py | 2 +- .../K4MobiDeDRM_plugin/convert2xml.py | 18 +- .../K4MobiDeDRM_plugin/flatxml2html.py | 82 ++++++++- .../K4MobiDeDRM_plugin/flatxml2svg.py | 137 ++++++++++++-- Calibre_Plugins/K4MobiDeDRM_plugin/genbook.py | 171 ++++++++++++++---- .../K4MobiDeDRM_plugin/k4mobidedrm_orig.py | 15 +- .../K4MobiDeDRM_plugin/stylexml2css.py | 16 +- .../K4MobiDeDRM_plugin/topazextract.py | 10 +- Calibre_Plugins/eReaderPDB2PML_plugin.zip | Bin 13530 -> 13882 bytes .../eReaderPDB2PML_plugin/__init__.py | 4 +- .../eReaderPDB2PML_plugin/erdr2pml.py | 57 ++++-- Calibre_Plugins/k4mobidedrm_plugin.zip | Bin 49841 -> 51886 bytes DeDRM_Macintosh_Application/DeDRM.app.txt | Bin 107032 -> 107246 bytes .../DeDRM.app/Contents/Info.plist | 16 +- .../Contents/Resources/Scripts/main.scpt | Bin 230764 -> 236044 bytes .../Contents/Resources/convert2xml.py | 18 +- .../Resources/description.rtfd/TXT.rtf | 2 +- .../DeDRM.app/Contents/Resources/erdr2pml.py | 57 ++++-- .../Contents/Resources/flatxml2html.py | 82 ++++++++- .../Contents/Resources/flatxml2svg.py | 137 ++++++++++++-- .../DeDRM.app/Contents/Resources/genbook.py | 171 ++++++++++++++---- .../Contents/Resources/k4mobidedrm.py | 15 +- .../Contents/Resources/stylexml2css.py | 16 +- .../Contents/Resources/topazextract.py | 10 +- .../DeDRM_WinApp/DeDRM_lib/lib/convert2xml.py | 18 +- .../DeDRM_WinApp/DeDRM_lib/lib/erdr2pml.py | 57 ++++-- .../DeDRM_lib/lib/flatxml2html.py | 82 ++++++++- .../DeDRM_WinApp/DeDRM_lib/lib/flatxml2svg.py | 137 ++++++++++++-- .../DeDRM_WinApp/DeDRM_lib/lib/genbook.py | 171 ++++++++++++++---- .../DeDRM_WinApp/DeDRM_lib/lib/k4mobidedrm.py | 15 +- .../DeDRM_lib/lib/stylexml2css.py | 16 +- .../DeDRM_lib/lib/topazextract.py | 10 +- KindleBooks/lib/convert2xml.py | 18 +- KindleBooks/lib/flatxml2html.py | 82 ++++++++- KindleBooks/lib/flatxml2svg.py | 137 ++++++++++++-- KindleBooks/lib/genbook.py | 171 ++++++++++++++---- KindleBooks/lib/k4mobidedrm.py | 15 +- KindleBooks/lib/stylexml2css.py | 16 +- KindleBooks/lib/topazextract.py | 10 +- ReadMe_First.txt | 40 +--- ReadMe_Linux_Users.txt | 113 ++++++++++++ .../lib/eReaderPDB2PML_plugin.py | 2 +- eReader_PDB_Tools/lib/erdr2pml.py | 57 ++++-- 43 files changed, 1785 insertions(+), 418 deletions(-) create mode 100644 ReadMe_Linux_Users.txt diff --git a/Calibre_Plugins/K4MobiDeDRM_plugin/__init__.py b/Calibre_Plugins/K4MobiDeDRM_plugin/__init__.py index 7081e78..30c1e13 100644 --- a/Calibre_Plugins/K4MobiDeDRM_plugin/__init__.py +++ b/Calibre_Plugins/K4MobiDeDRM_plugin/__init__.py @@ -19,7 +19,7 @@ class K4DeDRM(FileTypePlugin): description = 'Removes DRM from Mobipocket, Kindle/Mobi, Kindle/Topaz and Kindle/Print Replica files. Provided by the work of many including DiapDealer, SomeUpdates, IHeartCabbages, CMBDTC, Skindle, DarkReverser, ApprenticeAlf, etc.' supported_platforms = ['osx', 'windows', 'linux'] # Platforms this plugin will run on author = 'DiapDealer, SomeUpdates' # The author of this plugin - version = (0, 3, 7) # The version number of this plugin + version = (0, 3, 8) # The version number of this plugin file_types = set(['prc','mobi','azw','azw1','azw4','tpz']) # The file types that this plugin will be applied to on_import = True # Run this plugin during the import priority = 210 # run this plugin before mobidedrm, k4pcdedrm, k4dedrm diff --git a/Calibre_Plugins/K4MobiDeDRM_plugin/convert2xml.py b/Calibre_Plugins/K4MobiDeDRM_plugin/convert2xml.py index 3c27ed0..0328206 100644 --- a/Calibre_Plugins/K4MobiDeDRM_plugin/convert2xml.py +++ b/Calibre_Plugins/K4MobiDeDRM_plugin/convert2xml.py @@ -20,6 +20,8 @@ import getopt from struct import pack from struct import unpack +class TpzDRMError(Exception): + pass # Get a 7 bit encoded number from string. The most # significant byte comes first and has the high bit (8th) set @@ -138,7 +140,8 @@ class Dictionary(object): return self.stable[self.pos] else: print "Error - %d outside of string table limits" % val - sys.exit(-1) + raise TpzDRMError('outside of string table limits') + # sys.exit(-1) def getSize(self): return self.size @@ -258,6 +261,11 @@ class PageParser(object): 'paragraph.class' : (1, 'scalar_text', 0, 0), 'paragraph.firstWord' : (1, 'scalar_number', 0, 0), 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), + 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), + 'paragraph.gridSize' : (1, 'scalar_number', 0, 0), + 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0), + 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0), + 'word_semantic' : (1, 'snippets', 1, 1), 'word_semantic.type' : (1, 'scalar_text', 0, 0), @@ -272,11 +280,17 @@ class PageParser(object): '_span' : (1, 'snippets', 1, 0), '_span.firstWord' : (1, 'scalar_number', 0, 0), - '-span.lastWord' : (1, 'scalar_number', 0, 0), + '_span.lastWord' : (1, 'scalar_number', 0, 0), + '_span.gridSize' : (1, 'scalar_number', 0, 0), + '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0), + '_span.gridTopCenter' : (1, 'scalar_number', 0, 0), 'span' : (1, 'snippets', 1, 0), 'span.firstWord' : (1, 'scalar_number', 0, 0), 'span.lastWord' : (1, 'scalar_number', 0, 0), + 'span.gridSize' : (1, 'scalar_number', 0, 0), + 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0), + 'span.gridTopCenter' : (1, 'scalar_number', 0, 0), 'extratokens' : (1, 'snippets', 1, 0), 'extratokens.type' : (1, 'scalar_text', 0, 0), diff --git a/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2html.py b/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2html.py index ae2c8dd..3b32fc0 100644 --- a/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2html.py +++ b/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2html.py @@ -271,6 +271,9 @@ class DocParser(object): pclass = self.getClass(pclass) + # if paragraph uses extratokens (extra glyphs) then make it fixed + (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end) + # build up a description of the paragraph in result and return it # first check for the basic - all words paragraph (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) @@ -280,6 +283,7 @@ class DocParser(object): last = int(slast) makeImage = (regtype == 'vertical') or (regtype == 'table') + makeImage = makeImage or (extraglyphs != None) if self.fixedimage: makeImage = makeImage or (regtype == 'fixed') @@ -353,6 +357,8 @@ class DocParser(object): word_class = '' + word_semantic_type = '' + while (line < end) : (name, argres) = self.lineinDoc(line) @@ -512,6 +518,72 @@ class DocParser(object): return parares + def buildTOCEntry(self, pdesc) : + parares = '' + sep ='' + tocentry = '' + handle_links = len(self.link_id) > 0 + + lstart = 0 + + cnt = len(pdesc) + for j in xrange( 0, cnt) : + + (wtype, num) = pdesc[j] + + if wtype == 'ocr' : + word = self.ocrtext[num] + sep = ' ' + + if handle_links: + link = self.link_id[num] + if (link > 0): + linktype = self.link_type[link-1] + title = self.link_title[link-1] + title = title.rstrip('. ') + alt_title = parares[lstart:] + alt_title = alt_title.strip() + # now strip off the actual printed page number + alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.') + alt_title = alt_title.rstrip('. ') + # skip over any external links - can't have them in a books toc + if linktype == 'external' : + title = '' + alt_title = '' + linkpage = '' + else : + if len(self.link_page) >= link : + ptarget = self.link_page[link-1] - 1 + linkpage = '%04d' % ptarget + else : + # just link to the current page + linkpage = self.id[4:] + if len(alt_title) >= len(title): + title = alt_title + if title != '' and linkpage != '': + tocentry += title + '|' + linkpage + '\n' + lstart = len(parares) + if word == '_link_' : word = '' + elif (link < 0) : + if word == '_link_' : word = '' + + if word == '_lb_': + word = '' + sep = '' + + if num in self.dehyphen_rootid : + word = word[0:-1] + sep = '' + + parares += word + sep + + else : + continue + + return tocentry + + + # walk the document tree collecting the information needed # to build an html page using the ocrText @@ -519,6 +591,7 @@ class DocParser(object): def process(self): htmlpage = '' + tocinfo = '' # get the ocr text (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) @@ -644,9 +717,9 @@ class DocParser(object): ptype = 'end' first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end, regtype) + tocinfo += self.buildTOCEntry(pdesc) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - elif (regtype == 'vertical') or (regtype == 'table') : ptype = 'full' if inGroup: @@ -704,12 +777,11 @@ class DocParser(object): htmlpage = htmlpage[0:-4] last_para_continued = False - return htmlpage - + return htmlpage, tocinfo def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage): # create a document parser dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage) - htmlpage = dp.process() - return htmlpage + htmlpage, tocinfo = dp.process() + return htmlpage, tocinfo diff --git a/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2svg.py b/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2svg.py index 6f6795d..49cf6f5 100644 --- a/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2svg.py +++ b/Calibre_Plugins/K4MobiDeDRM_plugin/flatxml2svg.py @@ -10,17 +10,94 @@ from struct import unpack class PParser(object): - def __init__(self, gd, flatxml): + def __init__(self, gd, flatxml, meta_array): self.gd = gd self.flatdoc = flatxml.split('\n') + self.docSize = len(self.flatdoc) self.temp = [] - foo = self.getData('page.h') or self.getData('book.h') - self.ph = foo[0] - foo = self.getData('page.w') or self.getData('book.w') - self.pw = foo[0] - self.gx = self.getData('info.glyph.x') - self.gy = self.getData('info.glyph.y') - self.gid = self.getData('info.glyph.glyphID') + + self.ph = -1 + self.pw = -1 + startpos = self.posinDoc('page.h') or self.posinDoc('book.h') + for p in startpos: + (name, argres) = self.lineinDoc(p) + self.ph = max(self.ph, int(argres)) + startpos = self.posinDoc('page.w') or self.posinDoc('book.w') + for p in startpos: + (name, argres) = self.lineinDoc(p) + self.pw = max(self.pw, int(argres)) + + if self.ph <= 0: + self.ph = int(meta_array.get('pageHeight', '11000')) + if self.pw <= 0: + self.pw = int(meta_array.get('pageWidth', '8500')) + + res = [] + startpos = self.posinDoc('info.glyph.x') + for p in startpos: + argres = self.getDataatPos('info.glyph.x', p) + res.extend(argres) + self.gx = res + + res = [] + startpos = self.posinDoc('info.glyph.y') + for p in startpos: + argres = self.getDataatPos('info.glyph.y', p) + res.extend(argres) + self.gy = res + + res = [] + startpos = self.posinDoc('info.glyph.glyphID') + for p in startpos: + argres = self.getDataatPos('info.glyph.glyphID', p) + res.extend(argres) + self.gid = res + + + # return tag at line pos in document + def lineinDoc(self, pos) : + if (pos >= 0) and (pos < self.docSize) : + item = self.flatdoc[pos] + if item.find('=') >= 0: + (name, argres) = item.split('=',1) + else : + name = item + argres = '' + return name, argres + + # find tag in doc if within pos to end inclusive + def findinDoc(self, tagpath, pos, end) : + result = None + if end == -1 : + end = self.docSize + else: + end = min(self.docSize, end) + foundat = -1 + for j in xrange(pos, end): + item = self.flatdoc[j] + if item.find('=') >= 0: + (name, argres) = item.split('=',1) + else : + name = item + argres = '' + if name.endswith(tagpath) : + result = argres + foundat = j + break + return foundat, result + + # return list of start positions for the tagpath + def posinDoc(self, tagpath): + startpos = [] + pos = 0 + res = "" + while res != None : + (foundpos, res) = self.findinDoc(tagpath, pos, -1) + if res != None : + startpos.append(foundpos) + pos = foundpos + 1 + return startpos + def getData(self, path): result = None cnt = len(self.flatdoc) @@ -39,6 +116,23 @@ class PParser(object): for j in xrange(0,len(argres)): argres[j] = int(argres[j]) return result + + def getDataatPos(self, path, pos): + result = None + item = self.flatdoc[pos] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + if (name.endswith(path)): + result = argres + return result + def getDataTemp(self, path): result = None cnt = len(self.temp) @@ -58,6 +152,7 @@ class PParser(object): for j in xrange(0,len(argres)): argres[j] = int(argres[j]) return result + def getImages(self): result = [] self.temp = self.flatdoc @@ -69,6 +164,7 @@ class PParser(object): src = self.getDataTemp('img.src')[0] result.append('\n' % (src, x, y, w, h)) return result + def getGlyphs(self): result = [] if (self.gid != None) and (len(self.gid) > 0): @@ -84,25 +180,25 @@ class PParser(object): return result -def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi): +def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi): ml = '' - pp = PParser(gdict, flat_xml) + pp = PParser(gdict, flat_xml, meta_array) ml += '\n' if (raw): ml += '\n' ml += '\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1) - ml += 'Page %d - %s by %s\n' % (counter, meta_array['Title'],meta_array['Authors']) + ml += 'Page %d - %s by %s\n' % (pageid, meta_array['Title'],meta_array['Authors']) else: ml += '\n' ml += '\n' - ml += 'Page %d - %s by %s\n' % (counter, meta_array['Title'],meta_array['Authors']) + ml += 'Page %d - %s by %s\n' % (pageid, meta_array['Title'],meta_array['Authors']) ml += '