From 939cdbb0c9cc48f81c269d9459556b77bb44b9fb Mon Sep 17 00:00:00 2001 From: Apprentice Harper Date: Fri, 16 Oct 2020 13:58:59 +0100 Subject: [PATCH] More fixes for Amazon books, fixing identity checks, started on Topaz. --- DeDRM_plugin/convert2xml.py | 474 +++++++++++++++++------------------ DeDRM_plugin/flatxml2html.py | 230 ++++++++--------- DeDRM_plugin/flatxml2svg.py | 38 +-- DeDRM_plugin/genbook.py | 82 +++--- DeDRM_plugin/kgenpids.py | 33 +-- DeDRM_plugin/mobidedrm.py | 6 +- DeDRM_plugin/stylexml2css.py | 127 +++++----- DeDRM_plugin/topazextract.py | 52 ++-- 8 files changed, 530 insertions(+), 512 deletions(-) diff --git a/DeDRM_plugin/convert2xml.py b/DeDRM_plugin/convert2xml.py index 3249db5..abdaeb3 100644 --- a/DeDRM_plugin/convert2xml.py +++ b/DeDRM_plugin/convert2xml.py @@ -56,7 +56,7 @@ def readEncodedNumber(file): c = file.read(1) if (len(c) == 0): return None - data = ord(c) + data = c[0] datax = (datax <<7) + (data & 0x7F) data = datax @@ -188,232 +188,232 @@ class PageParser(object): # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) token_tags = { - 'x' : (1, 'scalar_number', 0, 0), - 'y' : (1, 'scalar_number', 0, 0), - 'h' : (1, 'scalar_number', 0, 0), - 'w' : (1, 'scalar_number', 0, 0), - 'firstWord' : (1, 'scalar_number', 0, 0), - 'lastWord' : (1, 'scalar_number', 0, 0), - 'rootID' : (1, 'scalar_number', 0, 0), - 'stemID' : (1, 'scalar_number', 0, 0), - 'type' : (1, 'scalar_text', 0, 0), + b'x' : (1, 'scalar_number', 0, 0), + b'y' : (1, 'scalar_number', 0, 0), + b'h' : (1, 'scalar_number', 0, 0), + b'w' : (1, 'scalar_number', 0, 0), + b'firstWord' : (1, 'scalar_number', 0, 0), + b'lastWord' : (1, 'scalar_number', 0, 0), + b'rootID' : (1, 'scalar_number', 0, 0), + b'stemID' : (1, 'scalar_number', 0, 0), + b'type' : (1, 'scalar_text', 0, 0), - 'info' : (0, 'number', 1, 0), + b'info' : (0, 'number', 1, 0), - 'info.word' : (0, 'number', 1, 1), - 'info.word.ocrText' : (1, 'text', 0, 0), - 'info.word.firstGlyph' : (1, 'raw', 0, 0), - 'info.word.lastGlyph' : (1, 'raw', 0, 0), - 'info.word.bl' : (1, 'raw', 0, 0), - 'info.word.link_id' : (1, 'number', 0, 0), + b'info.word' : (0, 'number', 1, 1), + b'info.word.ocrText' : (1, 'text', 0, 0), + b'info.word.firstGlyph' : (1, 'raw', 0, 0), + b'info.word.lastGlyph' : (1, 'raw', 0, 0), + b'info.word.bl' : (1, 'raw', 0, 0), + b'info.word.link_id' : (1, 'number', 0, 0), - 'glyph' : (0, 'number', 1, 1), - 'glyph.x' : (1, 'number', 0, 0), - 'glyph.y' : (1, 'number', 0, 0), - 'glyph.glyphID' : (1, 'number', 0, 0), + b'glyph' : (0, 'number', 1, 1), + b'glyph.x' : (1, 'number', 0, 0), + b'glyph.y' : (1, 'number', 0, 0), + b'glyph.glyphID' : (1, 'number', 0, 0), - 'dehyphen' : (0, 'number', 1, 1), - 'dehyphen.rootID' : (1, 'number', 0, 0), - 'dehyphen.stemID' : (1, 'number', 0, 0), - 'dehyphen.stemPage' : (1, 'number', 0, 0), - 'dehyphen.sh' : (1, 'number', 0, 0), + b'dehyphen' : (0, 'number', 1, 1), + b'dehyphen.rootID' : (1, 'number', 0, 0), + b'dehyphen.stemID' : (1, 'number', 0, 0), + b'dehyphen.stemPage' : (1, 'number', 0, 0), + b'dehyphen.sh' : (1, 'number', 0, 0), - 'links' : (0, 'number', 1, 1), - 'links.page' : (1, 'number', 0, 0), - 'links.rel' : (1, 'number', 0, 0), - 'links.row' : (1, 'number', 0, 0), - 'links.title' : (1, 'text', 0, 0), - 'links.href' : (1, 'text', 0, 0), - 'links.type' : (1, 'text', 0, 0), - 'links.id' : (1, 'number', 0, 0), + b'links' : (0, 'number', 1, 1), + b'links.page' : (1, 'number', 0, 0), + b'links.rel' : (1, 'number', 0, 0), + b'links.row' : (1, 
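
Review note (illustrative, not part of the patch): the `data = c[0]` change in readEncodedNumber works because indexing a Python 3 bytes object yields an int directly, where Python 2 returned a length-1 str that needed ord(). A minimal sketch of the same continuation-bit decoding over an in-memory stream — read_encoded_number and the sample input are stand-ins, not the plugin's exact routine:

    import io

    def read_encoded_number(fo):
        # 7 payload bits per byte; a set high bit means another byte follows.
        value = 0
        c = fo.read(1)
        while c and (c[0] & 0x80):      # c[0] is an int in Python 3, no ord() needed
            value = (value << 7) + (c[0] & 0x7F)
            c = fo.read(1)
        if len(c) == 0:
            return None                 # ran off the end of the stream
        return (value << 7) + (c[0] & 0x7F)

    assert read_encoded_number(io.BytesIO(b'\x81\x01')) == 129
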
'number', 0, 0), + b'links.title' : (1, 'text', 0, 0), + b'links.href' : (1, 'text', 0, 0), + b'links.type' : (1, 'text', 0, 0), + b'links.id' : (1, 'number', 0, 0), - 'paraCont' : (0, 'number', 1, 1), - 'paraCont.rootID' : (1, 'number', 0, 0), - 'paraCont.stemID' : (1, 'number', 0, 0), - 'paraCont.stemPage' : (1, 'number', 0, 0), + b'paraCont' : (0, 'number', 1, 1), + b'paraCont.rootID' : (1, 'number', 0, 0), + b'paraCont.stemID' : (1, 'number', 0, 0), + b'paraCont.stemPage' : (1, 'number', 0, 0), - 'paraStems' : (0, 'number', 1, 1), - 'paraStems.stemID' : (1, 'number', 0, 0), + b'paraStems' : (0, 'number', 1, 1), + b'paraStems.stemID' : (1, 'number', 0, 0), - 'wordStems' : (0, 'number', 1, 1), - 'wordStems.stemID' : (1, 'number', 0, 0), + b'wordStems' : (0, 'number', 1, 1), + b'wordStems.stemID' : (1, 'number', 0, 0), - 'empty' : (1, 'snippets', 1, 0), + b'empty' : (1, 'snippets', 1, 0), - 'page' : (1, 'snippets', 1, 0), - 'page.class' : (1, 'scalar_text', 0, 0), - 'page.pageid' : (1, 'scalar_text', 0, 0), - 'page.pagelabel' : (1, 'scalar_text', 0, 0), - 'page.type' : (1, 'scalar_text', 0, 0), - 'page.h' : (1, 'scalar_number', 0, 0), - 'page.w' : (1, 'scalar_number', 0, 0), - 'page.startID' : (1, 'scalar_number', 0, 0), + b'page' : (1, 'snippets', 1, 0), + b'page.class' : (1, 'scalar_text', 0, 0), + b'page.pageid' : (1, 'scalar_text', 0, 0), + b'page.pagelabel' : (1, 'scalar_text', 0, 0), + b'page.type' : (1, 'scalar_text', 0, 0), + b'page.h' : (1, 'scalar_number', 0, 0), + b'page.w' : (1, 'scalar_number', 0, 0), + b'page.startID' : (1, 'scalar_number', 0, 0), - 'group' : (1, 'snippets', 1, 0), - 'group.class' : (1, 'scalar_text', 0, 0), - 'group.type' : (1, 'scalar_text', 0, 0), - 'group._tag' : (1, 'scalar_text', 0, 0), - 'group.orientation': (1, 'scalar_text', 0, 0), + b'group' : (1, 'snippets', 1, 0), + b'group.class' : (1, 'scalar_text', 0, 0), + b'group.type' : (1, 'scalar_text', 0, 0), + b'group._tag' : (1, 'scalar_text', 0, 0), + b'group.orientation': (1, 'scalar_text', 0, 0), - 'region' : (1, 'snippets', 1, 0), - 'region.class' : (1, 'scalar_text', 0, 0), - 'region.type' : (1, 'scalar_text', 0, 0), - 'region.x' : (1, 'scalar_number', 0, 0), - 'region.y' : (1, 'scalar_number', 0, 0), - 'region.h' : (1, 'scalar_number', 0, 0), - 'region.w' : (1, 'scalar_number', 0, 0), - 'region.orientation' : (1, 'scalar_text', 0, 0), + b'region' : (1, 'snippets', 1, 0), + b'region.class' : (1, 'scalar_text', 0, 0), + b'region.type' : (1, 'scalar_text', 0, 0), + b'region.x' : (1, 'scalar_number', 0, 0), + b'region.y' : (1, 'scalar_number', 0, 0), + b'region.h' : (1, 'scalar_number', 0, 0), + b'region.w' : (1, 'scalar_number', 0, 0), + b'region.orientation' : (1, 'scalar_text', 0, 0), - 'empty_text_region' : (1, 'snippets', 1, 0), + b'empty_text_region' : (1, 'snippets', 1, 0), - 'img' : (1, 'snippets', 1, 0), - 'img.x' : (1, 'scalar_number', 0, 0), - 'img.y' : (1, 'scalar_number', 0, 0), - 'img.h' : (1, 'scalar_number', 0, 0), - 'img.w' : (1, 'scalar_number', 0, 0), - 'img.src' : (1, 'scalar_number', 0, 0), - 'img.color_src' : (1, 'scalar_number', 0, 0), - 'img.gridSize' : (1, 'scalar_number', 0, 0), - 'img.gridBottomCenter' : (1, 'scalar_number', 0, 0), - 'img.gridTopCenter' : (1, 'scalar_number', 0, 0), - 'img.gridBeginCenter' : (1, 'scalar_number', 0, 0), - 'img.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'img.image_type' : (1, 'scalar_number', 0, 0), + b'img' : (1, 'snippets', 1, 0), + b'img.x' : (1, 'scalar_number', 0, 0), + b'img.y' : (1, 'scalar_number', 0, 0), + b'img.h' : (1, 
'scalar_number', 0, 0), + b'img.w' : (1, 'scalar_number', 0, 0), + b'img.src' : (1, 'scalar_number', 0, 0), + b'img.color_src' : (1, 'scalar_number', 0, 0), + b'img.gridSize' : (1, 'scalar_number', 0, 0), + b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'img.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'img.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'img.image_type' : (1, 'scalar_number', 0, 0), - 'paragraph' : (1, 'snippets', 1, 0), - 'paragraph.class' : (1, 'scalar_text', 0, 0), - 'paragraph.firstWord' : (1, 'scalar_number', 0, 0), - 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), - 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), - 'paragraph.gridSize' : (1, 'scalar_number', 0, 0), - 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0), - 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0), - 'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0), - 'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'paragraph' : (1, 'snippets', 1, 0), + b'paragraph.class' : (1, 'scalar_text', 0, 0), + b'paragraph.firstWord' : (1, 'scalar_number', 0, 0), + b'paragraph.lastWord' : (1, 'scalar_number', 0, 0), + b'paragraph.lastWord' : (1, 'scalar_number', 0, 0), + b'paragraph.gridSize' : (1, 'scalar_number', 0, 0), + b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'word_semantic' : (1, 'snippets', 1, 1), - 'word_semantic.type' : (1, 'scalar_text', 0, 0), - 'word_semantic.class' : (1, 'scalar_text', 0, 0), - 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), - 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0), - 'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0), - 'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0), - 'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0), - 'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'word_semantic' : (1, 'snippets', 1, 1), + b'word_semantic.type' : (1, 'scalar_text', 0, 0), + b'word_semantic.class' : (1, 'scalar_text', 0, 0), + b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), + b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0), + b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'word' : (1, 'snippets', 1, 0), - 'word.type' : (1, 'scalar_text', 0, 0), - 'word.class' : (1, 'scalar_text', 0, 0), - 'word.firstGlyph' : (1, 'scalar_number', 0, 0), - 'word.lastGlyph' : (1, 'scalar_number', 0, 0), + b'word' : (1, 'snippets', 1, 0), + b'word.type' : (1, 'scalar_text', 0, 0), + b'word.class' : (1, 'scalar_text', 0, 0), + b'word.firstGlyph' : (1, 'scalar_number', 0, 0), + b'word.lastGlyph' : (1, 'scalar_number', 0, 0), - '_span' : (1, 'snippets', 1, 0), - '_span.class' : (1, 'scalar_text', 0, 0), - '_span.firstWord' : (1, 'scalar_number', 0, 0), - '_span.lastWord' : (1, 'scalar_number', 0, 0), - '_span.gridSize' : (1, 'scalar_number', 0, 0), - '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0), - '_span.gridTopCenter' : (1, 'scalar_number', 0, 0), - '_span.gridBeginCenter' : (1, 'scalar_number', 0, 0), - '_span.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'_span' : (1, 'snippets', 1, 0), + b'_span.class' : (1, 
'scalar_text', 0, 0), + b'_span.firstWord' : (1, 'scalar_number', 0, 0), + b'_span.lastWord' : (1, 'scalar_number', 0, 0), + b'_span.gridSize' : (1, 'scalar_number', 0, 0), + b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'span' : (1, 'snippets', 1, 0), - 'span.firstWord' : (1, 'scalar_number', 0, 0), - 'span.lastWord' : (1, 'scalar_number', 0, 0), - 'span.gridSize' : (1, 'scalar_number', 0, 0), - 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0), - 'span.gridTopCenter' : (1, 'scalar_number', 0, 0), - 'span.gridBeginCenter' : (1, 'scalar_number', 0, 0), - 'span.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'span' : (1, 'snippets', 1, 0), + b'span.firstWord' : (1, 'scalar_number', 0, 0), + b'span.lastWord' : (1, 'scalar_number', 0, 0), + b'span.gridSize' : (1, 'scalar_number', 0, 0), + b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'span.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'span.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'extratokens' : (1, 'snippets', 1, 0), - 'extratokens.class' : (1, 'scalar_text', 0, 0), - 'extratokens.type' : (1, 'scalar_text', 0, 0), - 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0), - 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0), - 'extratokens.gridSize' : (1, 'scalar_number', 0, 0), - 'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0), - 'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0), - 'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0), - 'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0), + b'extratokens' : (1, 'snippets', 1, 0), + b'extratokens.class' : (1, 'scalar_text', 0, 0), + b'extratokens.type' : (1, 'scalar_text', 0, 0), + b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0), + b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0), + b'extratokens.gridSize' : (1, 'scalar_number', 0, 0), + b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0), + b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0), + b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0), + b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0), - 'glyph.h' : (1, 'number', 0, 0), - 'glyph.w' : (1, 'number', 0, 0), - 'glyph.use' : (1, 'number', 0, 0), - 'glyph.vtx' : (1, 'number', 0, 1), - 'glyph.len' : (1, 'number', 0, 1), - 'glyph.dpi' : (1, 'number', 0, 0), - 'vtx' : (0, 'number', 1, 1), - 'vtx.x' : (1, 'number', 0, 0), - 'vtx.y' : (1, 'number', 0, 0), - 'len' : (0, 'number', 1, 1), - 'len.n' : (1, 'number', 0, 0), + b'glyph.h' : (1, 'number', 0, 0), + b'glyph.w' : (1, 'number', 0, 0), + b'glyph.use' : (1, 'number', 0, 0), + b'glyph.vtx' : (1, 'number', 0, 1), + b'glyph.len' : (1, 'number', 0, 1), + b'glyph.dpi' : (1, 'number', 0, 0), + b'vtx' : (0, 'number', 1, 1), + b'vtx.x' : (1, 'number', 0, 0), + b'vtx.y' : (1, 'number', 0, 0), + b'len' : (0, 'number', 1, 1), + b'len.n' : (1, 'number', 0, 0), - 'book' : (1, 'snippets', 1, 0), - 'version' : (1, 'snippets', 1, 0), - 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0), - 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0), - 'version.Schema_id' : (1, 'scalar_text', 0, 0), - 'version.Schema_version' : (1, 'scalar_text', 0, 0), - 'version.Topaz_version' : (1, 'scalar_text', 0, 0), - 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), - 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), - 
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), - 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), - 'version.chapterheaders' : (1, 'scalar_text', 0, 0), - 'version.creation_date' : (1, 'scalar_text', 0, 0), - 'version.header_footer' : (1, 'scalar_text', 0, 0), - 'version.init_from_ocr' : (1, 'scalar_text', 0, 0), - 'version.letter_insertion' : (1, 'scalar_text', 0, 0), - 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0), - 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0), - 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0), - 'version.findlists' : (1, 'scalar_text', 0, 0), - 'version.page_num' : (1, 'scalar_text', 0, 0), - 'version.page_type' : (1, 'scalar_text', 0, 0), - 'version.bad_text' : (1, 'scalar_text', 0, 0), - 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), - 'version.margins' : (1, 'scalar_text', 0, 0), - 'version.staggered_lines' : (1, 'scalar_text', 0, 0), - 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), - 'version.toc' : (1, 'scalar_text', 0, 0), + b'book' : (1, 'snippets', 1, 0), + b'version' : (1, 'snippets', 1, 0), + b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0), + b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0), + b'version.Schema_id' : (1, 'scalar_text', 0, 0), + b'version.Schema_version' : (1, 'scalar_text', 0, 0), + b'version.Topaz_version' : (1, 'scalar_text', 0, 0), + b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), + b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), + b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), + b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), + b'version.chapterheaders' : (1, 'scalar_text', 0, 0), + b'version.creation_date' : (1, 'scalar_text', 0, 0), + b'version.header_footer' : (1, 'scalar_text', 0, 0), + b'version.init_from_ocr' : (1, 'scalar_text', 0, 0), + b'version.letter_insertion' : (1, 'scalar_text', 0, 0), + b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0), + b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0), + b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0), + b'version.findlists' : (1, 'scalar_text', 0, 0), + b'version.page_num' : (1, 'scalar_text', 0, 0), + b'version.page_type' : (1, 'scalar_text', 0, 0), + b'version.bad_text' : (1, 'scalar_text', 0, 0), + b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), + b'version.margins' : (1, 'scalar_text', 0, 0), + b'version.staggered_lines' : (1, 'scalar_text', 0, 0), + b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), + b'version.toc' : (1, 'scalar_text', 0, 0), - 'stylesheet' : (1, 'snippets', 1, 0), - 'style' : (1, 'snippets', 1, 0), - 'style._tag' : (1, 'scalar_text', 0, 0), - 'style.type' : (1, 'scalar_text', 0, 0), - 'style._after_type' : (1, 'scalar_text', 0, 0), - 'style._parent_type' : (1, 'scalar_text', 0, 0), - 'style._after_parent_type' : (1, 'scalar_text', 0, 0), - 'style.class' : (1, 'scalar_text', 0, 0), - 'style._after_class' : (1, 'scalar_text', 0, 0), - 'rule' : (1, 'snippets', 1, 0), - 'rule.attr' : (1, 'scalar_text', 0, 0), - 'rule.value' : (1, 'scalar_text', 0, 0), + b'stylesheet' : (1, 'snippets', 1, 0), + b'style' : (1, 'snippets', 1, 0), + b'style._tag' : (1, 'scalar_text', 0, 0), + b'style.type' : (1, 'scalar_text', 0, 0), + b'style._after_type' : (1, 'scalar_text', 0, 0), + b'style._parent_type' : (1, 'scalar_text', 0, 0), + b'style._after_parent_type' : (1, 'scalar_text', 0, 0), + b'style.class' : (1, 'scalar_text', 0, 0), + b'style._after_class' : (1, 'scalar_text', 0, 0), + b'rule' : (1, 'snippets', 1, 0), + b'rule.attr' : (1, 'scalar_text', 
0, 0), + b'rule.value' : (1, 'scalar_text', 0, 0), - 'original' : (0, 'number', 1, 1), - 'original.pnum' : (1, 'number', 0, 0), - 'original.pid' : (1, 'text', 0, 0), - 'pages' : (0, 'number', 1, 1), - 'pages.ref' : (1, 'number', 0, 0), - 'pages.id' : (1, 'number', 0, 0), - 'startID' : (0, 'number', 1, 1), - 'startID.page' : (1, 'number', 0, 0), - 'startID.id' : (1, 'number', 0, 0), + b'original' : (0, 'number', 1, 1), + b'original.pnum' : (1, 'number', 0, 0), + b'original.pid' : (1, 'text', 0, 0), + b'pages' : (0, 'number', 1, 1), + b'pages.ref' : (1, 'number', 0, 0), + b'pages.id' : (1, 'number', 0, 0), + b'startID' : (0, 'number', 1, 1), + b'startID.page' : (1, 'number', 0, 0), + b'startID.id' : (1, 'number', 0, 0), - 'median_d' : (1, 'number', 0, 0), - 'median_h' : (1, 'number', 0, 0), - 'median_firsty' : (1, 'number', 0, 0), - 'median_lasty' : (1, 'number', 0, 0), + b'median_d' : (1, 'number', 0, 0), + b'median_h' : (1, 'number', 0, 0), + b'median_firsty' : (1, 'number', 0, 0), + b'median_lasty' : (1, 'number', 0, 0), - 'num_footers_maybe' : (1, 'number', 0, 0), - 'num_footers_yes' : (1, 'number', 0, 0), - 'num_headers_maybe' : (1, 'number', 0, 0), - 'num_headers_yes' : (1, 'number', 0, 0), + b'num_footers_maybe' : (1, 'number', 0, 0), + b'num_footers_yes' : (1, 'number', 0, 0), + b'num_headers_maybe' : (1, 'number', 0, 0), + b'num_headers_yes' : (1, 'number', 0, 0), - 'tracking' : (1, 'number', 0, 0), - 'src' : (1, 'text', 0, 0), + b'tracking' : (1, 'number', 0, 0), + b'src' : (1, 'text', 0, 0), } @@ -430,7 +430,7 @@ class PageParser(object): cnt = len(self.tagpath) if i < cnt : result = self.tagpath[i] for j in range(i+1, cnt) : - result += '.' + self.tagpath[j] + result += b'.' + self.tagpath[j] return result @@ -505,7 +505,7 @@ class PageParser(object): if (subtags == 1): ntags = readEncodedNumber(self.fo) - if self.debug : print('subtags: ' + token + ' has ' + str(ntags)) + if self.debug : print('subtags: ', token , ' has ' , str(ntags)) for j in range(ntags): val = readEncodedNumber(self.fo) subtagres.append(self.procToken(self.dict.lookup(val))) @@ -613,7 +613,7 @@ class PageParser(object): subtagList = tag[1] argtype = tag[2] argList = tag[3] - nname = prefix + '.' + name + nname = prefix + b'.' 
+ name nsubtaglist = [] for j in subtagList: nsubtaglist.append(self.updateName(j,prefix)) @@ -662,34 +662,34 @@ class PageParser(object): subtagList = node[1] argtype = node[2] argList = node[3] - fullpathname = name.split('.') + fullpathname = name.split(b'.') nodename = fullpathname.pop() ilvl = len(fullpathname) - indent = ' ' * (3 * ilvl) + indent = b' ' * (3 * ilvl) rlst = [] - rlst.append(indent + '<' + nodename + '>') + rlst.append(indent + b'<' + nodename + b'>') if len(argList) > 0: alst = [] for j in argList: - if (argtype == 'text') or (argtype == 'scalar_text') : - alst.append(j + '|') + if (argtype == b'text') or (argtype == b'scalar_text') : + alst.append(j + b'|') else : - alst.append(str(j) + ',') - argres = "".join(alst) + alst.append(str(j).encode('utf-8') + b',') + argres = b"".join(alst) argres = argres[0:-1] - if argtype == 'snippets' : - rlst.append('snippets:' + argres) + if argtype == b'snippets' : + rlst.append(b'snippets:' + argres) else : rlst.append(argres) if len(subtagList) > 0 : - rlst.append('\n') + rlst.append(b'\n') for j in subtagList: if len(j) > 0 : rlst.append(self.formatTag(j)) - rlst.append(indent + '\n') + rlst.append(indent + b'\n') else: - rlst.append('\n') - return "".join(rlst) + rlst.append(b'\n') + return b"".join(rlst) # flatten tag @@ -704,20 +704,20 @@ class PageParser(object): alst = [] for j in argList: if (argtype == 'text') or (argtype == 'scalar_text') : - alst.append(j + '|') + alst.append(j + b'|') else : - alst.append(str(j) + '|') - argres = "".join(alst) + alst.append(str(j).encode('utf-8') + b'|') + argres = b"".join(alst) argres = argres[0:-1] - if argtype == 'snippets' : - rlst.append('.snippets=' + argres) + if argtype == b'snippets' : + rlst.append(b'.snippets=' + argres) else : - rlst.append('=' + argres) - rlst.append('\n') + rlst.append(b'=' + argres) + rlst.append(b'\n') for j in subtagList: if len(j) > 0 : rlst.append(self.flattenTag(j)) - return "".join(rlst) + return b"".join(rlst) # reduce create xml output @@ -729,7 +729,7 @@ class PageParser(object): rlst.append(self.flattenTag(j)) else: rlst.append(self.formatTag(j)) - result = "".join(rlst) + result = b"".join(rlst) if self.debug : print(result) return result @@ -747,16 +747,16 @@ class PageParser(object): # peek at the first bytes to see what type of file it is magic = self.fo.read(9) - if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): - first_token = 'info' - elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): + if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'): + first_token = b'info' + elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'): skip = self.fo.read(2) - first_token = 'info' - elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'): - first_token = 'info' - elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): + first_token = b'info' + elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'): + first_token = b'info' + elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'): skip = self.fo.read(3) - first_token = 'info' + first_token = b'info' else : # other0.dat file first_token = None @@ -778,7 +778,7 @@ class PageParser(object): break if (v == 0x72): - self.doLoop72('number') + self.doLoop72(b'number') elif (v > 0) and (v < self.dict.getSize()) : tag = self.procToken(self.dict.lookup(v)) if len(tag) > 0 : @@ -789,7 +789,7 @@ class PageParser(object): if (v == 0): if (self.peek(1) == 0x5f): skip = self.fo.read(1) - first_token = 'info' + first_token = b'info' # now do snippet injection if len(self.snippetList) > 0 : @@ -809,14 
+809,14 @@ class PageParser(object): def fromData(dict, fname): flat_xml = True - debug = False + debug = True pp = PageParser(fname, dict, debug, flat_xml) xmlpage = pp.process() return xmlpage def getXML(dict, fname): flat_xml = False - debug = False + debug = True pp = PageParser(fname, dict, debug, flat_xml) xmlpage = pp.process() return xmlpage @@ -845,7 +845,7 @@ def main(argv): sys.stderr=SafeUnbuffered(sys.stderr) dictFile = "" pageFile = "" - debug = False + debug = True flat_xml = False printOutput = False if len(argv) == 0: diff --git a/DeDRM_plugin/flatxml2html.py b/DeDRM_plugin/flatxml2html.py index 6f839ce..f1ca81d 100644 --- a/DeDRM_plugin/flatxml2html.py +++ b/DeDRM_plugin/flatxml2html.py @@ -7,6 +7,7 @@ import csv import os import math import getopt +import functools from struct import pack from struct import unpack @@ -15,14 +16,14 @@ class DocParser(object): def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage): self.id = os.path.basename(fileid).replace('.dat','') self.svgcount = 0 - self.docList = flatxml.split('\n') + self.docList = flatxml.split(b'\n') self.docSize = len(self.docList) self.classList = {} self.bookDir = bookDir self.gdict = gdict tmpList = classlst.split('\n') for pclass in tmpList: - if pclass != '': + if pclass != b'': # remove the leading period from the css name cname = pclass[1:] self.classList[cname] = True @@ -57,9 +58,9 @@ class DocParser(object): imgfile = os.path.join(imgDir,imgname) # get glyph information - gxList = self.getData('info.glyph.x',0,-1) - gyList = self.getData('info.glyph.y',0,-1) - gidList = self.getData('info.glyph.glyphID',0,-1) + gxList = self.getData(b'info.glyph.x',0,-1) + gyList = self.getData(b'info.glyph.y',0,-1) + gidList = self.getData(b'info.glyph.glyphID',0,-1) gids = [] maxws = [] @@ -122,11 +123,11 @@ class DocParser(object): def lineinDoc(self, pos) : if (pos >= 0) and (pos < self.docSize) : item = self.docList[pos] - if item.find('=') >= 0: - (name, argres) = item.split('=',1) + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=',1) else : name = item - argres = '' + argres = b'' return name, argres @@ -140,11 +141,13 @@ class DocParser(object): foundat = -1 for j in range(pos, end): item = self.docList[j] - if item.find('=') >= 0: - (name, argres) = item.split('=',1) + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=',1) else : name = item argres = '' + if (isinstance(tagpath,str)): + tagpath = tagpath.encode('utf-8') if name.endswith(tagpath) : result = argres foundat = j @@ -170,7 +173,7 @@ class DocParser(object): argres=[] (foundat, argt) = self.findinDoc(tagpath, pos, end) if (argt != None) and (len(argt) > 0) : - argList = argt.split('|') + argList = argt.split(b'|') argres = [ int(strval) for strval in argList] return argres @@ -191,21 +194,21 @@ class DocParser(object): # also some class names have spaces in them so need to convert to dashes if nclass != None : - nclass = nclass.replace(' ','-') - classres = '' + nclass = nclass.replace(b' ',b'-') + classres = b'' nclass = nclass.lower() - nclass = 'cl-' + nclass - baseclass = '' + nclass = b'cl-' + nclass + baseclass = b'' # graphic is the base class for captions - if nclass.find('cl-cap-') >=0 : - classres = 'graphic' + ' ' + if nclass.find(b'cl-cap-') >=0 : + classres = b'graphic' + b' ' else : # strip to find baseclass - p = nclass.find('_') + p = nclass.find(b'_') if p > 0 : baseclass = nclass[0:p] if baseclass in self.classList: - classres += baseclass + ' ' + classres += baseclass + b' ' classres += nclass 
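
Review note (illustrative, not part of the patch): the isinstance() guard added to findinDoc() is what lets callers keep passing str tag paths even though the document lines are now bytes — bytes.endswith() raises a TypeError when handed a str. A small sketch of the pattern, with a made-up sample name:

    def matches(name, tagpath):
        # name comes from flatxml.split(b'\n'), so it is bytes;
        # normalize a str tagpath to bytes before comparing.
        if isinstance(tagpath, str):
            tagpath = tagpath.encode('utf-8')
        return name.endswith(tagpath)

    assert matches(b'page.region.img.src', 'img.src')
    # Without the guard, b'page.region.img.src'.endswith('img.src')
    # raises TypeError instead of returning False.
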
nclass = classres return nclass @@ -225,11 +228,11 @@ class DocParser(object): return -1 result = [] - (pos, pagetype) = self.findinDoc('page.type',0,-1) + (pos, pagetype) = self.findinDoc(b'page.type',0,-1) - groupList = self.posinDoc('page.group') - groupregionList = self.posinDoc('page.group.region') - pageregionList = self.posinDoc('page.region') + groupList = self.posinDoc(b'page.group') + groupregionList = self.posinDoc(b'page.group.region') + pageregionList = self.posinDoc(b'page.region') # integrate into one list for j in groupList: result.append(('grpbeg',j)) @@ -237,7 +240,7 @@ class DocParser(object): result.append(('gregion',j)) for j in pageregionList: result.append(('pregion',j)) - result.sort(compare) + result.sort(key=functools.cmp_to_key(compare)) # insert group end and page end indicators inGroup = False @@ -267,33 +270,33 @@ class DocParser(object): result = [] # paragraph - (pos, pclass) = self.findinDoc('paragraph.class',start,end) + (pos, pclass) = self.findinDoc(b'paragraph.class',start,end) pclass = self.getClass(pclass) # if paragraph uses extratokens (extra glyphs) then make it fixed - (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end) + (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end) # build up a description of the paragraph in result and return it # first check for the basic - all words paragraph - (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) - (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) + (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end) + (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end) if (sfirst != None) and (slast != None) : first = int(sfirst) last = int(slast) - makeImage = (regtype == 'vertical') or (regtype == 'table') + makeImage = (regtype == b'vertical') or (regtype == b'table') makeImage = makeImage or (extraglyphs != None) if self.fixedimage: - makeImage = makeImage or (regtype == 'fixed') + makeImage = makeImage or (regtype == b'fixed') if (pclass != None): - makeImage = makeImage or (pclass.find('.inverted') >= 0) + makeImage = makeImage or (pclass.find(b'.inverted') >= 0) if self.fixedimage : - makeImage = makeImage or (pclass.find('cl-f-') >= 0) + makeImage = makeImage or (pclass.find(b'cl-f-') >= 0) # before creating an image make sure glyph info exists - gidList = self.getData('info.glyph.glyphID',0,-1) + gidList = self.getData(b'info.glyph.glyphID',0,-1) makeImage = makeImage & (len(gidList) > 0) @@ -307,8 +310,8 @@ class DocParser(object): # translate first and last word into first and last glyphs # and generate inline image and include it glyphList = [] - firstglyphList = self.getData('word.firstGlyph',0,-1) - gidList = self.getData('info.glyph.glyphID',0,-1) + firstglyphList = self.getData(b'word.firstGlyph',0,-1) + gidList = self.getData(b'info.glyph.glyphID',0,-1) firstGlyph = firstglyphList[first] if last < len(firstglyphList): lastGlyph = firstglyphList[last] @@ -326,8 +329,8 @@ class DocParser(object): for glyphnum in range(firstGlyph, lastGlyph): glyphList.append(glyphnum) # include any extratokens if they exist - (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end) - (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end) + (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end) + (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end) if (sfg != None) and (slg != None): for glyphnum in range(int(sfg), int(slg)): glyphList.append(glyphnum) @@ -368,39 +371,39 @@ class DocParser(object): 
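
Review note (illustrative, not part of the patch): Python 3 removed the cmp argument from list.sort(), so the patch wraps the existing two-argument compare function with functools.cmp_to_key() rather than rewriting it as a key function. Sketch of the idiom with a stand-in comparator ordering (etype, position) tuples by position, mirroring the grpbeg/gregion/pregion list built here:

    import functools

    def compare(x, y):
        # old-style comparator: <0, 0, >0, like Python 2's cmp(x[1], y[1])
        return (x[1] > y[1]) - (x[1] < y[1])

    result = [('pregion', 40), ('grpbeg', 10), ('gregion', 25)]
    result.sort(key=functools.cmp_to_key(compare))
    assert result[0] == ('grpbeg', 10)
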
(name, argres) = self.lineinDoc(line) - if name.endswith('span.firstWord') : + if name.endswith(b'span.firstWord') : sp_first = int(argres) - elif name.endswith('span.lastWord') : + elif name.endswith(b'span.lastWord') : sp_last = int(argres) - elif name.endswith('word.firstGlyph') : + elif name.endswith(b'word.firstGlyph') : gl_first = int(argres) - elif name.endswith('word.lastGlyph') : + elif name.endswith(b'word.lastGlyph') : gl_last = int(argres) - elif name.endswith('word_semantic.firstWord'): + elif name.endswith(b'word_semantic.firstWord'): ws_first = int(argres) - elif name.endswith('word_semantic.lastWord'): + elif name.endswith(b'word_semantic.lastWord'): ws_last = int(argres) - elif name.endswith('word.class'): + elif name.endswith(b'word.class'): # we only handle spaceafter word class try: - (cname, space) = argres.split('-',1) - if space == '' : space = '0' - if (cname == 'spaceafter') and (int(space) > 0) : + (cname, space) = argres.split(b'-',1) + if space == b'' : space = b'0' + if (cname == b'spaceafter') and (int(space) > 0) : word_class = 'sa' except: pass - elif name.endswith('word.img.src'): + elif name.endswith(b'word.img.src'): result.append(('img' + word_class, int(argres))) word_class = '' - elif name.endswith('region.img.src'): + elif name.endswith(b'region.img.src'): result.append(('img' + word_class, int(argres))) if (sp_first != -1) and (sp_last != -1): @@ -437,7 +440,7 @@ class DocParser(object): classres = '' if pclass : - classres = ' class="' + pclass + '"' + classres = ' class="' + pclass.decode('utf-8') + '"' br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical') @@ -470,8 +473,8 @@ class DocParser(object): if (link > 0): linktype = self.link_type[link-1] title = self.link_title[link-1] - if (title == "") or (parares.rfind(title) < 0): - title=parares[lstart:] + if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0): + title=parares[lstart:].encode('utf-8') if linktype == 'external' : linkhref = self.link_href[link-1] linkhtml = '' % linkhref @@ -482,33 +485,34 @@ class DocParser(object): else : # just link to the current page linkhtml = '' - linkhtml += title + '' - pos = parares.rfind(title) + linkhtml += title.decode('utf-8') + linkhtml += '' + pos = parares.rfind(title.decode('utf-8')) if pos >= 0: parares = parares[0:pos] + linkhtml + parares[pos+len(title):] else : parares += linkhtml lstart = len(parares) - if word == '_link_' : word = '' + if word == b'_link_' : word = b'' elif (link < 0) : - if word == '_link_' : word = '' + if word == b'_link_' : word = b'' - if word == '_lb_': + if word == b'_lb_': if ((num-1) in self.dehyphen_rootid ) or handle_links: - word = '' + word = b'' sep = '' elif br_lb : - word = '
<br />\n' + word = b'<br />
\n' sep = '' else : - word = '\n' + word = b'\n' sep = '' if num in self.dehyphen_rootid : word = word[0:-1] sep = '' - parares += word + sep + parares += word.decode('utf-8') + sep elif wtype == 'img' : sep = '' @@ -522,7 +526,9 @@ class DocParser(object): elif wtype == 'svg' : sep = '' - parares += '' % num + parares += '' % num parares += sep if len(sep) > 0 : parares = parares[0:-1] @@ -545,7 +551,7 @@ class DocParser(object): (wtype, num) = pdesc[j] if wtype == 'ocr' : - word = self.ocrtext[num] + word = self.ocrtext[num].decode('utf-8') sep = ' ' if handle_links: @@ -553,7 +559,7 @@ class DocParser(object): if (link > 0): linktype = self.link_type[link-1] title = self.link_title[link-1] - title = title.rstrip('. ') + title = title.rstrip(b'. ') alt_title = parares[lstart:] alt_title = alt_title.strip() # now strip off the actual printed page number @@ -607,38 +613,38 @@ class DocParser(object): hlst = [] # get the ocr text - (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) - if argres : self.ocrtext = argres.split('|') + (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1) + if argres : self.ocrtext = argres.split(b'|') # get information to dehyphenate the text - self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1) + self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1) # determine if first paragraph is continued from previous page - (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) + (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1) first_para_continued = (self.parastems_stemid != None) # determine if last paragraph is continued onto the next page - (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) + (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1) last_para_continued = (self.paracont_stemid != None) # collect link ids - self.link_id = self.getData('info.word.link_id',0,-1) + self.link_id = self.getData(b'info.word.link_id',0,-1) # collect link destination page numbers - self.link_page = self.getData('info.links.page',0,-1) + self.link_page = self.getData(b'info.links.page',0,-1) # collect link types (container versus external) - (pos, argres) = self.findinDoc('info.links.type',0,-1) - if argres : self.link_type = argres.split('|') + (pos, argres) = self.findinDoc(b'info.links.type',0,-1) + if argres : self.link_type = argres.split(b'|') # collect link destinations - (pos, argres) = self.findinDoc('info.links.href',0,-1) - if argres : self.link_href = argres.split('|') + (pos, argres) = self.findinDoc(b'info.links.href',0,-1) + if argres : self.link_href = argres.split(b'|') # collect link titles - (pos, argres) = self.findinDoc('info.links.title',0,-1) + (pos, argres) = self.findinDoc(b'info.links.title',0,-1) if argres : - self.link_title = argres.split('|') + self.link_title = argres.split(b'|') else: self.link_title.append('') @@ -662,51 +668,51 @@ class DocParser(object): # set anchor for link target on this page if not anchorSet and not first_para_continued: hlst.append('\n') + hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '">\n') anchorSet = True # handle groups of graphics with text captions - if (etype == 'grpbeg'): - (pos, grptype) = self.findinDoc('group.type', start, end) + if (etype == b'grpbeg'): + (pos, grptype) = self.findinDoc(b'group.type', start, end) if grptype != None: - if grptype == 'graphic': - gcstr = ' class="' + grptype + '"' + if grptype == b'graphic': + gcstr = ' class="' + 
grptype.decode('utf-8') + '"' hlst.append('') inGroup = True - elif (etype == 'grpend'): + elif (etype == b'grpend'): if inGroup: hlst.append('\n') inGroup = False else: - (pos, regtype) = self.findinDoc('region.type',start,end) + (pos, regtype) = self.findinDoc(b'region.type',start,end) - if regtype == 'graphic' : - (pos, simgsrc) = self.findinDoc('img.src',start,end) + if regtype == b'graphic' : + (pos, simgsrc) = self.findinDoc(b'img.src',start,end) if simgsrc: if inGroup: hlst.append('' % int(simgsrc)) else: hlst.append('
' % int(simgsrc)) - elif regtype == 'chapterheading' : + elif regtype == b'chapterheading' : (pclass, pdesc) = self.getParaDescription(start,end, regtype) if not breakSet: hlst.append('
 
\n') breakSet = True tag = 'h1' if pclass and (len(pclass) >= 7): - if pclass[3:7] == 'ch1-' : tag = 'h1' - if pclass[3:7] == 'ch2-' : tag = 'h2' - if pclass[3:7] == 'ch3-' : tag = 'h3' - hlst.append('<' + tag + ' class="' + pclass + '">') + if pclass[3:7] == b'ch1-' : tag = 'h1' + if pclass[3:7] == b'ch2-' : tag = 'h2' + if pclass[3:7] == b'ch3-' : tag = 'h3' + hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') else: hlst.append('<' + tag + '>') hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append('') - elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'): + elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'): ptype = 'full' # check to see if this is a continution from the previous page if first_para_continued : @@ -715,16 +721,16 @@ class DocParser(object): (pclass, pdesc) = self.getParaDescription(start,end, regtype) if pclass and (len(pclass) >= 6) and (ptype == 'full'): tag = 'p' - if pclass[3:6] == 'h1-' : tag = 'h4' - if pclass[3:6] == 'h2-' : tag = 'h5' - if pclass[3:6] == 'h3-' : tag = 'h6' - hlst.append('<' + tag + ' class="' + pclass + '">') + if pclass[3:6] == b'h1-' : tag = 'h4' + if pclass[3:6] == b'h2-' : tag = 'h5' + if pclass[3:6] == b'h3-' : tag = 'h6' + hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append('') else : hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) - elif (regtype == 'tocentry') : + elif (regtype == b'tocentry') : ptype = 'full' if first_para_continued : ptype = 'end' @@ -733,7 +739,7 @@ class DocParser(object): tocinfo += self.buildTOCEntry(pdesc) hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) - elif (regtype == 'vertical') or (regtype == 'table') : + elif (regtype == b'vertical') or (regtype == b'table') : ptype = 'full' if inGroup: ptype = 'middle' @@ -744,19 +750,19 @@ class DocParser(object): hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) - elif (regtype == 'synth_fcvr.center'): - (pos, simgsrc) = self.findinDoc('img.src',start,end) + elif (regtype == b'synth_fcvr.center'): + (pos, simgsrc) = self.findinDoc(b'img.src',start,end) if simgsrc: hlst.append('
' % int(simgsrc)) else : print(' Making region type', regtype, end=' ') - (pos, temp) = self.findinDoc('paragraph',start,end) - (pos2, temp) = self.findinDoc('span',start,end) + (pos, temp) = self.findinDoc(b'paragraph',start,end) + (pos2, temp) = self.findinDoc(b'span',start,end) if pos != -1 or pos2 != -1: print(' a "text" region') orig_regtype = regtype - regtype = 'fixed' + regtype = b'fixed' ptype = 'full' # check to see if this is a continution from the previous page if first_para_continued : @@ -764,23 +770,23 @@ class DocParser(object): first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end, regtype) if not pclass: - if orig_regtype.endswith('.right') : pclass = 'cl-right' - elif orig_regtype.endswith('.center') : pclass = 'cl-center' - elif orig_regtype.endswith('.left') : pclass = 'cl-left' - elif orig_regtype.endswith('.justify') : pclass = 'cl-justify' + if orig_regtype.endswith(b'.right') : pclass = 'cl-right' + elif orig_regtype.endswith(b'.center') : pclass = 'cl-center' + elif orig_regtype.endswith(b'.left') : pclass = 'cl-left' + elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify' if pclass and (ptype == 'full') and (len(pclass) >= 6): tag = 'p' - if pclass[3:6] == 'h1-' : tag = 'h4' - if pclass[3:6] == 'h2-' : tag = 'h5' - if pclass[3:6] == 'h3-' : tag = 'h6' - hlst.append('<' + tag + ' class="' + pclass + '">') + if pclass[3:6] == b'h1-' : tag = 'h4' + if pclass[3:6] == b'h2-' : tag = 'h5' + if pclass[3:6] == b'h3-' : tag = 'h6' + hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append('') else : hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) else : print(' a "graphic" region') - (pos, simgsrc) = self.findinDoc('img.src',start,end) + (pos, simgsrc) = self.findinDoc(b'img.src',start,end) if simgsrc: hlst.append('
' % int(simgsrc)) diff --git a/DeDRM_plugin/flatxml2svg.py b/DeDRM_plugin/flatxml2svg.py index 72c7e3c..3768358 100644 --- a/DeDRM_plugin/flatxml2svg.py +++ b/DeDRM_plugin/flatxml2svg.py @@ -12,7 +12,7 @@ from struct import unpack class PParser(object): def __init__(self, gd, flatxml, meta_array): self.gd = gd - self.flatdoc = flatxml.split('\n') + self.flatdoc = flatxml.split(b'\n') self.docSize = len(self.flatdoc) self.temp = [] @@ -58,11 +58,11 @@ class PParser(object): def lineinDoc(self, pos) : if (pos >= 0) and (pos < self.docSize) : item = self.flatdoc[pos] - if item.find('=') >= 0: - (name, argres) = item.split('=',1) + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=',1) else : name = item - argres = '' + argres = b'' return name, argres # find tag in doc if within pos to end inclusive @@ -75,11 +75,13 @@ class PParser(object): foundat = -1 for j in range(pos, end): item = self.flatdoc[j] - if item.find('=') >= 0: - (name, argres) = item.split('=',1) + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=',1) else : name = item - argres = '' + argres = b'' + if (isinstance(tagpath,str)): + tagpath = tagpath.encode('utf-8') if name.endswith(tagpath) : result = argres foundat = j @@ -103,9 +105,9 @@ class PParser(object): cnt = len(self.flatdoc) for j in range(cnt): item = self.flatdoc[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') + if item.find(b'=') >= 0: + (name, argt) = item.split(b'=') + argres = argt.split(b'|') else: name = item argres = [] @@ -120,15 +122,17 @@ class PParser(object): def getDataatPos(self, path, pos): result = None item = self.flatdoc[pos] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') + if item.find(b'=') >= 0: + (name, argt) = item.split(b'=') + argres = argt.split(b'|') else: name = item argres = [] if (len(argres) > 0) : for j in range(0,len(argres)): argres[j] = int(argres[j]) + if (isinstance(path,str)): + path = path.encode('utf-8') if (name.endswith(path)): result = argres return result @@ -138,12 +142,14 @@ class PParser(object): cnt = len(self.temp) for j in range(cnt): item = self.temp[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') + if item.find(b'=') >= 0: + (name, argt) = item.split(b'=') + argres = argt.split(b'|') else: name = item argres = [] + if (isinstance(path,str)): + path = path.encode('utf-8') if (name.endswith(path)): result = argres self.temp.pop(j) diff --git a/DeDRM_plugin/genbook.py b/DeDRM_plugin/genbook.py index ea1ca38..915bd30 100644 --- a/DeDRM_plugin/genbook.py +++ b/DeDRM_plugin/genbook.py @@ -44,10 +44,10 @@ if inCalibre : from calibre_plugins.dedrm import flatxml2svg from calibre_plugins.dedrm import stylexml2css else : - from . import convert2xml - from . import flatxml2html - from . import flatxml2svg - from . 
import stylexml2css + import convert2xml + import flatxml2html + import flatxml2svg + import stylexml2css # global switch buildXML = False @@ -117,10 +117,10 @@ class Dictionary(object): self.stable.append(self.escapestr(readString(self.fo))) self.pos = 0 def escapestr(self, str): - str = str.replace('&','&') - str = str.replace('<','<') - str = str.replace('>','>') - str = str.replace('=','=') + str = str.replace(b'&',b'&') + str = str.replace(b'<',b'<') + str = str.replace(b'>',b'>') + str = str.replace(b'=',b'=') return str def lookup(self,val): if ((val >= 0) and (val < self.size)) : @@ -138,7 +138,7 @@ class Dictionary(object): class PageDimParser(object): def __init__(self, flatxml): - self.flatdoc = flatxml.split('\n') + self.flatdoc = flatxml.split(b'\n') # find tag if within pos to end inclusive def findinDoc(self, tagpath, pos, end) : result = None @@ -151,8 +151,8 @@ class PageDimParser(object): foundat = -1 for j in range(pos, end): item = docList[j] - if item.find('=') >= 0: - (name, argres) = item.split('=') + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=') else : name = item argres = '' @@ -162,8 +162,8 @@ class PageDimParser(object): break return foundat, result def process(self): - (pos, sph) = self.findinDoc('page.h',0,-1) - (pos, spw) = self.findinDoc('page.w',0,-1) + (pos, sph) = self.findinDoc(b'page.h',0,-1) + (pos, spw) = self.findinDoc(b'page.w',0,-1) if (sph == None): sph = '-1' if (spw == None): spw = '-1' return sph, spw @@ -176,21 +176,21 @@ def getPageDim(flatxml): class GParser(object): def __init__(self, flatxml): - self.flatdoc = flatxml.split('\n') + self.flatdoc = flatxml.split(b'\n') self.dpi = 1440 - self.gh = self.getData('info.glyph.h') - self.gw = self.getData('info.glyph.w') - self.guse = self.getData('info.glyph.use') + self.gh = self.getData(b'info.glyph.h') + self.gw = self.getData(b'info.glyph.w') + self.guse = self.getData(b'info.glyph.use') if self.guse : self.count = len(self.guse) else : self.count = 0 - self.gvtx = self.getData('info.glyph.vtx') - self.glen = self.getData('info.glyph.len') - self.gdpi = self.getData('info.glyph.dpi') - self.vx = self.getData('info.vtx.x') - self.vy = self.getData('info.vtx.y') - self.vlen = self.getData('info.len.n') + self.gvtx = self.getData(b'info.glyph.vtx') + self.glen = self.getData(b'info.glyph.len') + self.gdpi = self.getData(b'info.glyph.dpi') + self.vx = self.getData(b'info.vtx.x') + self.vy = self.getData(b'info.vtx.y') + self.vlen = self.getData(b'info.len.n') if self.vlen : self.glen.append(len(self.vlen)) elif self.glen: @@ -204,9 +204,9 @@ class GParser(object): cnt = len(self.flatdoc) for j in range(cnt): item = self.flatdoc[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') + if item.find(b'=') >= 0: + (name, argt) = item.split(b'=') + argres = argt.split(b'|') else: name = item argres = [] @@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage): # now get the css info cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw) - open(xname, 'wb').write(cssstr) + open(xname, 'w').write(cssstr) if buildXML: xname = os.path.join(xmlDir, 'other0000.xml') open(xname, 'wb').write(convert2xml.getXML(dict, otherFile)) @@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage): hlst.append('\n\n') htmlstr = "".join(hlst) hlst = None - open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr) + open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr) print(" ") print('Extracting Table of Contents from Amazon OCR') @@ 
-571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage): tlst.append('\n') tlst.append('\n') tochtml = "".join(tlst) - open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml) + open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml) # now create index_svg.xhtml that points to all required files @@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage): flst = [] for page in pagelst: flst.append(xmllst[page]) - flat_svg = "".join(flst) + flat_svg = b"".join(flst) flst=None svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi) if (raw) : @@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage): slst.append('\n\n') svgindex = "".join(slst) slst = None - open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex) + open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex) print(" ") @@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage): olst.append('\n') # adding metadata olst.append(' \n') - if 'GUID' in meta_array: - olst.append(' ' + meta_array['GUID'] + '\n') - if 'ASIN' in meta_array: - olst.append(' ' + meta_array['ASIN'] + '\n') - if 'oASIN' in meta_array: - olst.append(' ' + meta_array['oASIN'] + '\n') - olst.append(' ' + meta_array['Title'] + '\n') - olst.append(' ' + meta_array['Authors'] + '\n') + if b'GUID' in meta_array: + olst.append(' ' + meta_array[b'GUID'].decode('utf-8') + '\n') + if b'ASIN' in meta_array: + olst.append(' ' + meta_array[b'ASIN'].decode('utf-8') + '\n') + if b'oASIN' in meta_array: + olst.append(' ' + meta_array[b'oASIN'].decode('utf-8') + '\n') + olst.append(' ' + meta_array[b'Title'].decode('utf-8') + '\n') + olst.append(' ' + meta_array[b'Authors'].decode('utf-8') + '\n') olst.append(' en\n') - olst.append(' ' + meta_array['UpdateTime'] + '\n') + olst.append(' ' + meta_array[b'UpdateTime'].decode('utf-8') + '\n') if isCover: olst.append(' \n') olst.append(' \n') @@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage): olst.append('\n') opfstr = "".join(olst) olst = None - open(opfname, 'wb').write(opfstr) + open(opfname, 'w').write(opfstr) print('Processing Complete') diff --git a/DeDRM_plugin/kgenpids.py b/DeDRM_plugin/kgenpids.py index 466cf5c..86ffab7 100644 --- a/DeDRM_plugin/kgenpids.py +++ b/DeDRM_plugin/kgenpids.py @@ -49,14 +49,15 @@ def SHA1(message): # Encode the bytes in data with the characters in map +# data and map should be byte arrays def encode(data, map): - result = '' + result = b'' for char in data: - value = ord(char) + value = char Q = (value ^ 0x80) // len(map) R = value % len(map) - result += map[Q] - result += map[R] + result += bytes([map[Q]]) + result += bytes([map[R]]) return result # Hash the bytes in data and then encode the digest with the characters in map @@ -117,7 +118,7 @@ def generatePidEncryptionTable() : def generatePidSeed(table,dsn) : value = 0 for counter in range (0,4) : - index = (ord(dsn[counter]) ^ value) &0xFF + index = (dsn[counter] ^ value) & 0xFF value = (value >> 8) ^ table[index] return value @@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll): pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] index = 0 for counter in range (0,nbRoll): - pid[index] = pid[index] ^ ord(dsn[counter]) + pid[index] = pid[index] ^ dsn[counter] index = (index+1) %8 for counter in range (0,8): index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7) @@ -205,7 +206,7 @@ def getK4Pids(rec209, 
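
Review note (illustrative, not part of the patch): the rewritten encode() in kgenpids.py shows both halves of the bytes/int change — iterating a byte string now yields ints, so ord() goes away, but a single int must be wrapped back up as bytes([value]) before concatenation. Exercising the same logic standalone; the 32-character map below is a placeholder, not the plugin's charMap:

    def encode(data, cmap):
        # data and cmap are byte strings; each input byte maps to two output bytes
        result = b''
        for value in data:                 # value is an int in Python 3
            Q = (value ^ 0x80) // len(cmap)
            R = value % len(cmap)
            result += bytes([cmap[Q]])     # cmap[Q] is an int; bytes([..]) re-wraps it
            result += bytes([cmap[R]])
        return result

    cmap = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'   # placeholder 32-byte map
    assert encode(b'\x00', cmap) == b'EA'        # (0 ^ 0x80) // 32 -> 'E', 0 % 32 -> 'A'
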
token, kindleDatabase): try: # Get the kindle account token, if present - kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode() + kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens']) except KeyError: kindleAccountToken="" @@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase): try: # Get the DSN token, if present - DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode() + DSN = bytearray.fromhex((kindleDatabase[1])['DSN']) print("Got DSN key from database {0}".format(kindleDatabase[0])) except KeyError: # See if we have the info to generate the DSN try: # Get the Mazama Random number - MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode() + MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber']) #print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0]) try: # Get the SerialNumber token, if present - IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode() + IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber']) print("Got SerialNumber from database {0}".format(kindleDatabase[0])) except KeyError: # Get the IDString we added - IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode() + IDString = bytearray.fromhex((kindleDatabase[1])['IDString']) try: # Get the UsernameHash token, if present - encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode() + encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash']) print("Got UsernameHash from database {0}".format(kindleDatabase[0])) except KeyError: # Get the UserName we added - UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode() + UserName = bytearray.fromhex((kindleDatabase[1])['UserName']) # encode it encodedUsername = encodeHash(UserName,charMap1) #print "encodedUsername",encodedUsername.encode('hex') @@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase): # Compute book PIDs # book pid - pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token) + pidHash = SHA1(DSN+kindleAccountToken+rec209+token) bookPID = encodePID(pidHash) bookPID = checksumPid(bookPID) pids.append(bookPID) # variant 1 - pidHash = SHA1(kindleAccountToken.encode()+rec209+token) + pidHash = SHA1(kindleAccountToken+rec209+token) bookPID = encodePID(pidHash) bookPID = checksumPid(bookPID) pids.append(bookPID) # variant 2 - pidHash = SHA1(DSN.encode()+rec209+token) + pidHash = SHA1(DSN+rec209+token) bookPID = encodePID(pidHash) bookPID = checksumPid(bookPID) pids.append(bookPID) diff --git a/DeDRM_plugin/mobidedrm.py b/DeDRM_plugin/mobidedrm.py index e9b0fc1..ce21fbd 100644 --- a/DeDRM_plugin/mobidedrm.py +++ b/DeDRM_plugin/mobidedrm.py @@ -7,7 +7,7 @@ from __future__ import print_function __license__ = 'GPL v3' -__version__ = "1.00" +__version__ = "1.0" # This is a python script. You need a Python interpreter to run it. # For example, ActiveState Python, which exists for windows. @@ -73,7 +73,7 @@ __version__ = "1.00" # 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility # 0.41 - Fixed potential unicode problem in command line calls # 0.42 - Added GPL v3 licence. 
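
Review note (illustrative, not part of the patch): dropping the trailing .decode() after bytearray.fromhex() is deliberate — the DSN and account tokens now stay as byte strings, so the later SHA1(DSN + kindleAccountToken + rec209 + token) concatenates bytes with bytes instead of re-encoding str values. Sketch with hashlib standing in for the plugin's SHA1 helper; the hex value is made up:

    import hashlib

    dsn = bytearray.fromhex('44534e30303031')   # bytearray(b'DSN0001'), no .decode()
    token = b'secret-token'

    # bytearray + bytes concatenates cleanly, and hashlib accepts the result
    digest = hashlib.sha1(dsn + token).digest()
    assert len(digest) == 20
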
updated/removed some print statements -# 1.00 - Python 3 compatibility for calibre 5.0 +# 1.0 - Python 3 compatibility for calibre 5.0 import sys import os @@ -330,7 +330,7 @@ class MobiBook: } title = '' codec = 'windows-1252' - if self.magic == 'BOOKMOBI': + if self.magic == b'BOOKMOBI': if 503 in self.meta_array: title = self.meta_array[503] else: diff --git a/DeDRM_plugin/stylexml2css.py b/DeDRM_plugin/stylexml2css.py index 3e360a4..1d46a9e 100644 --- a/DeDRM_plugin/stylexml2css.py +++ b/DeDRM_plugin/stylexml2css.py @@ -15,36 +15,36 @@ debug = False class DocParser(object): def __init__(self, flatxml, fontsize, ph, pw): - self.flatdoc = flatxml.split('\n') + self.flatdoc = flatxml.split(b'\n') self.fontsize = int(fontsize) self.ph = int(ph) * 1.0 self.pw = int(pw) * 1.0 stags = { - 'paragraph' : 'p', - 'graphic' : '.graphic' + b'paragraph' : 'p', + b'graphic' : '.graphic' } attr_val_map = { - 'hang' : 'text-indent: ', - 'indent' : 'text-indent: ', - 'line-space' : 'line-height: ', - 'margin-bottom' : 'margin-bottom: ', - 'margin-left' : 'margin-left: ', - 'margin-right' : 'margin-right: ', - 'margin-top' : 'margin-top: ', - 'space-after' : 'padding-bottom: ', + b'hang' : 'text-indent: ', + b'indent' : 'text-indent: ', + b'line-space' : 'line-height: ', + b'margin-bottom' : 'margin-bottom: ', + b'margin-left' : 'margin-left: ', + b'margin-right' : 'margin-right: ', + b'margin-top' : 'margin-top: ', + b'space-after' : 'padding-bottom: ', } attr_str_map = { - 'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', - 'align-left' : 'text-align: left;', - 'align-right' : 'text-align: right;', - 'align-justify' : 'text-align: justify;', - 'display-inline' : 'display: inline;', - 'pos-left' : 'text-align: left;', - 'pos-right' : 'text-align: right;', - 'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', + b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', + b'align-left' : 'text-align: left;', + b'align-right' : 'text-align: right;', + b'align-justify' : 'text-align: justify;', + b'display-inline' : 'display: inline;', + b'pos-left' : 'text-align: left;', + b'pos-right' : 'text-align: right;', + b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', } @@ -60,11 +60,13 @@ class DocParser(object): foundat = -1 for j in range(pos, end): item = docList[j] - if item.find('=') >= 0: - (name, argres) = item.split('=',1) + if item.find(b'=') >= 0: + (name, argres) = item.split(b'=',1) else : name = item - argres = '' + argres = b'' + if (isinstance(tagpath,str)): + tagpath = tagpath.encode('utf-8') if name.endswith(tagpath) : result = argres foundat = j @@ -76,7 +78,7 @@ class DocParser(object): def posinDoc(self, tagpath): startpos = [] pos = 0 - res = "" + res = b"" while res != None : (foundpos, res) = self.findinDoc(tagpath, pos, -1) if res != None : @@ -87,11 +89,11 @@ class DocParser(object): # returns a vector of integers for the tagpath def getData(self, tagpath, pos, end, clean=False): if clean: - digits_only = re.compile(r'''([0-9]+)''') + digits_only = re.compile(rb'''([0-9]+)''') argres=[] (foundat, argt) = self.findinDoc(tagpath, pos, end) if (argt != None) and (len(argt) > 0) : - argList = argt.split('|') + argList = argt.split(b'|') for strval in argList: if clean: m = re.search(digits_only, strval) @@ -109,7 +111,7 @@ class DocParser(object): csspage += '.cl-justify { text-align: justify; }\n' # generate a list of each
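
Review note (illustrative, not part of the patch): the mobidedrm change above (self.magic == b'BOOKMOBI') fixes the quietest class of Python 3 bug in this patch — comparing bytes to str is not an error, it just evaluates False, so an un-ported check silently misidentifies every book. The rb'' pattern in stylexml2css.py is the same fix applied to regex matching against bytes input. A quick illustration:

    import re

    magic = b'BOOKMOBI'
    assert (magic == 'BOOKMOBI') is False    # no exception -- just never equal
    assert magic == b'BOOKMOBI'

    digits_only = re.compile(rb'([0-9]+)')   # bytes pattern for bytes input
    m = digits_only.search(b'margin-left|120|')
    assert m is not None and m.group(1) == b'120'
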