More fixes for Amazon books, fixing identity checks, started on Topaz.

pull/156/head
Apprentice Harper 4 years ago
parent dc27c36761
commit 939cdbb0c9

@@ -56,7 +56,7 @@ def readEncodedNumber(file):
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
data = c[0]
datax = (datax <<7) + (data & 0x7F)
data = datax
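
This hunk swaps ord(c) for c[0]: file.read(1) on a binary file returns bytes in Python 3, and indexing bytes yields an int directly. A minimal standalone sketch of the same 7-bit accumulation, assuming a high-bit continuation flag (the real readEncodedNumber handles more cases):

import io

def read_varint(stream):
    # sketch only: accumulate 7 bits per byte, assuming the high bit
    # of each byte signals that another byte follows
    datax = 0
    while True:
        c = stream.read(1)
        if len(c) == 0:
            return None
        data = c[0]                      # int in Python 3 (ord(c) also works on 1 byte)
        datax = (datax << 7) + (data & 0x7F)
        if (data & 0x80) == 0:
            return datax

print(read_varint(io.BytesIO(b'\x81\x01')))   # (1 << 7) + 1 = 129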
@@ -188,232 +188,232 @@ class PageParser(object):
# tag : (number of arguments, argument type, subtags present, special case of subtags present when escaped)
token_tags = {
'x' : (1, 'scalar_number', 0, 0),
'y' : (1, 'scalar_number', 0, 0),
'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0),
'rootID' : (1, 'scalar_number', 0, 0),
'stemID' : (1, 'scalar_number', 0, 0),
'type' : (1, 'scalar_text', 0, 0),
'info' : (0, 'number', 1, 0),
'info.word' : (0, 'number', 1, 1),
'info.word.ocrText' : (1, 'text', 0, 0),
'info.word.firstGlyph' : (1, 'raw', 0, 0),
'info.word.lastGlyph' : (1, 'raw', 0, 0),
'info.word.bl' : (1, 'raw', 0, 0),
'info.word.link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1),
'glyph.x' : (1, 'number', 0, 0),
'glyph.y' : (1, 'number', 0, 0),
'glyph.glyphID' : (1, 'number', 0, 0),
'dehyphen' : (0, 'number', 1, 1),
'dehyphen.rootID' : (1, 'number', 0, 0),
'dehyphen.stemID' : (1, 'number', 0, 0),
'dehyphen.stemPage' : (1, 'number', 0, 0),
'dehyphen.sh' : (1, 'number', 0, 0),
'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0),
'links.rel' : (1, 'number', 0, 0),
'links.row' : (1, 'number', 0, 0),
'links.title' : (1, 'text', 0, 0),
'links.href' : (1, 'text', 0, 0),
'links.type' : (1, 'text', 0, 0),
'links.id' : (1, 'number', 0, 0),
'paraCont' : (0, 'number', 1, 1),
'paraCont.rootID' : (1, 'number', 0, 0),
'paraCont.stemID' : (1, 'number', 0, 0),
'paraCont.stemPage' : (1, 'number', 0, 0),
'paraStems' : (0, 'number', 1, 1),
'paraStems.stemID' : (1, 'number', 0, 0),
'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0),
'empty' : (1, 'snippets', 1, 0),
'page' : (1, 'snippets', 1, 0),
'page.class' : (1, 'scalar_text', 0, 0),
'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0),
'page.type' : (1, 'scalar_text', 0, 0),
'page.h' : (1, 'scalar_number', 0, 0),
'page.w' : (1, 'scalar_number', 0, 0),
'page.startID' : (1, 'scalar_number', 0, 0),
'group' : (1, 'snippets', 1, 0),
'group.class' : (1, 'scalar_text', 0, 0),
'group.type' : (1, 'scalar_text', 0, 0),
'group._tag' : (1, 'scalar_text', 0, 0),
'group.orientation': (1, 'scalar_text', 0, 0),
'region' : (1, 'snippets', 1, 0),
'region.class' : (1, 'scalar_text', 0, 0),
'region.type' : (1, 'scalar_text', 0, 0),
'region.x' : (1, 'scalar_number', 0, 0),
'region.y' : (1, 'scalar_number', 0, 0),
'region.h' : (1, 'scalar_number', 0, 0),
'region.w' : (1, 'scalar_number', 0, 0),
'region.orientation' : (1, 'scalar_text', 0, 0),
'empty_text_region' : (1, 'snippets', 1, 0),
'img' : (1, 'snippets', 1, 0),
'img.x' : (1, 'scalar_number', 0, 0),
'img.y' : (1, 'scalar_number', 0, 0),
'img.h' : (1, 'scalar_number', 0, 0),
'img.w' : (1, 'scalar_number', 0, 0),
'img.src' : (1, 'scalar_number', 0, 0),
'img.color_src' : (1, 'scalar_number', 0, 0),
'img.gridSize' : (1, 'scalar_number', 0, 0),
'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
'img.image_type' : (1, 'scalar_number', 0, 0),
'paragraph' : (1, 'snippets', 1, 0),
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'word_semantic.class' : (1, 'scalar_text', 0, 0),
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word' : (1, 'snippets', 1, 0),
'word.type' : (1, 'scalar_text', 0, 0),
'word.class' : (1, 'scalar_text', 0, 0),
'word.firstGlyph' : (1, 'scalar_number', 0, 0),
'word.lastGlyph' : (1, 'scalar_number', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.class' : (1, 'scalar_text', 0, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
'_span.lastWord' : (1, 'scalar_number', 0, 0),
'_span.gridSize' : (1, 'scalar_number', 0, 0),
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
'span.gridSize' : (1, 'scalar_number', 0, 0),
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.class' : (1, 'scalar_text', 0, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0),
'glyph.use' : (1, 'number', 0, 0),
'glyph.vtx' : (1, 'number', 0, 1),
'glyph.len' : (1, 'number', 0, 1),
'glyph.dpi' : (1, 'number', 0, 0),
'vtx' : (0, 'number', 1, 1),
'vtx.x' : (1, 'number', 0, 0),
'vtx.y' : (1, 'number', 0, 0),
'len' : (0, 'number', 1, 1),
'len.n' : (1, 'number', 0, 0),
'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0),
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.Schema_id' : (1, 'scalar_text', 0, 0),
'version.Schema_version' : (1, 'scalar_text', 0, 0),
'version.Topaz_version' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.chapterheaders' : (1, 'scalar_text', 0, 0),
'version.creation_date' : (1, 'scalar_text', 0, 0),
'version.header_footer' : (1, 'scalar_text', 0, 0),
'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
'version.letter_insertion' : (1, 'scalar_text', 0, 0),
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0),
'version.bad_text' : (1, 'scalar_text', 0, 0),
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
'version.margins' : (1, 'scalar_text', 0, 0),
'version.staggered_lines' : (1, 'scalar_text', 0, 0),
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
'version.toc' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'style._tag' : (1, 'scalar_text', 0, 0),
'style.type' : (1, 'scalar_text', 0, 0),
'style._after_type' : (1, 'scalar_text', 0, 0),
'style._parent_type' : (1, 'scalar_text', 0, 0),
'style._after_parent_type' : (1, 'scalar_text', 0, 0),
'style.class' : (1, 'scalar_text', 0, 0),
'style._after_class' : (1, 'scalar_text', 0, 0),
'rule' : (1, 'snippets', 1, 0),
'rule.attr' : (1, 'scalar_text', 0, 0),
'rule.value' : (1, 'scalar_text', 0, 0),
'original' : (0, 'number', 1, 1),
'original.pnum' : (1, 'number', 0, 0),
'original.pid' : (1, 'text', 0, 0),
'pages' : (0, 'number', 1, 1),
'pages.ref' : (1, 'number', 0, 0),
'pages.id' : (1, 'number', 0, 0),
'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0),
'startID.id' : (1, 'number', 0, 0),
'median_d' : (1, 'number', 0, 0),
'median_h' : (1, 'number', 0, 0),
'median_firsty' : (1, 'number', 0, 0),
'median_lasty' : (1, 'number', 0, 0),
'num_footers_maybe' : (1, 'number', 0, 0),
'num_footers_yes' : (1, 'number', 0, 0),
'num_headers_maybe' : (1, 'number', 0, 0),
'num_headers_yes' : (1, 'number', 0, 0),
'tracking' : (1, 'number', 0, 0),
'src' : (1, 'text', 0, 0),
b'x' : (1, 'scalar_number', 0, 0),
b'y' : (1, 'scalar_number', 0, 0),
b'h' : (1, 'scalar_number', 0, 0),
b'w' : (1, 'scalar_number', 0, 0),
b'firstWord' : (1, 'scalar_number', 0, 0),
b'lastWord' : (1, 'scalar_number', 0, 0),
b'rootID' : (1, 'scalar_number', 0, 0),
b'stemID' : (1, 'scalar_number', 0, 0),
b'type' : (1, 'scalar_text', 0, 0),
b'info' : (0, 'number', 1, 0),
b'info.word' : (0, 'number', 1, 1),
b'info.word.ocrText' : (1, 'text', 0, 0),
b'info.word.firstGlyph' : (1, 'raw', 0, 0),
b'info.word.lastGlyph' : (1, 'raw', 0, 0),
b'info.word.bl' : (1, 'raw', 0, 0),
b'info.word.link_id' : (1, 'number', 0, 0),
b'glyph' : (0, 'number', 1, 1),
b'glyph.x' : (1, 'number', 0, 0),
b'glyph.y' : (1, 'number', 0, 0),
b'glyph.glyphID' : (1, 'number', 0, 0),
b'dehyphen' : (0, 'number', 1, 1),
b'dehyphen.rootID' : (1, 'number', 0, 0),
b'dehyphen.stemID' : (1, 'number', 0, 0),
b'dehyphen.stemPage' : (1, 'number', 0, 0),
b'dehyphen.sh' : (1, 'number', 0, 0),
b'links' : (0, 'number', 1, 1),
b'links.page' : (1, 'number', 0, 0),
b'links.rel' : (1, 'number', 0, 0),
b'links.row' : (1, 'number', 0, 0),
b'links.title' : (1, 'text', 0, 0),
b'links.href' : (1, 'text', 0, 0),
b'links.type' : (1, 'text', 0, 0),
b'links.id' : (1, 'number', 0, 0),
b'paraCont' : (0, 'number', 1, 1),
b'paraCont.rootID' : (1, 'number', 0, 0),
b'paraCont.stemID' : (1, 'number', 0, 0),
b'paraCont.stemPage' : (1, 'number', 0, 0),
b'paraStems' : (0, 'number', 1, 1),
b'paraStems.stemID' : (1, 'number', 0, 0),
b'wordStems' : (0, 'number', 1, 1),
b'wordStems.stemID' : (1, 'number', 0, 0),
b'empty' : (1, 'snippets', 1, 0),
b'page' : (1, 'snippets', 1, 0),
b'page.class' : (1, 'scalar_text', 0, 0),
b'page.pageid' : (1, 'scalar_text', 0, 0),
b'page.pagelabel' : (1, 'scalar_text', 0, 0),
b'page.type' : (1, 'scalar_text', 0, 0),
b'page.h' : (1, 'scalar_number', 0, 0),
b'page.w' : (1, 'scalar_number', 0, 0),
b'page.startID' : (1, 'scalar_number', 0, 0),
b'group' : (1, 'snippets', 1, 0),
b'group.class' : (1, 'scalar_text', 0, 0),
b'group.type' : (1, 'scalar_text', 0, 0),
b'group._tag' : (1, 'scalar_text', 0, 0),
b'group.orientation': (1, 'scalar_text', 0, 0),
b'region' : (1, 'snippets', 1, 0),
b'region.class' : (1, 'scalar_text', 0, 0),
b'region.type' : (1, 'scalar_text', 0, 0),
b'region.x' : (1, 'scalar_number', 0, 0),
b'region.y' : (1, 'scalar_number', 0, 0),
b'region.h' : (1, 'scalar_number', 0, 0),
b'region.w' : (1, 'scalar_number', 0, 0),
b'region.orientation' : (1, 'scalar_text', 0, 0),
b'empty_text_region' : (1, 'snippets', 1, 0),
b'img' : (1, 'snippets', 1, 0),
b'img.x' : (1, 'scalar_number', 0, 0),
b'img.y' : (1, 'scalar_number', 0, 0),
b'img.h' : (1, 'scalar_number', 0, 0),
b'img.w' : (1, 'scalar_number', 0, 0),
b'img.src' : (1, 'scalar_number', 0, 0),
b'img.color_src' : (1, 'scalar_number', 0, 0),
b'img.gridSize' : (1, 'scalar_number', 0, 0),
b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'img.image_type' : (1, 'scalar_number', 0, 0),
b'paragraph' : (1, 'snippets', 1, 0),
b'paragraph.class' : (1, 'scalar_text', 0, 0),
b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic' : (1, 'snippets', 1, 1),
b'word_semantic.type' : (1, 'scalar_text', 0, 0),
b'word_semantic.class' : (1, 'scalar_text', 0, 0),
b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'word' : (1, 'snippets', 1, 0),
b'word.type' : (1, 'scalar_text', 0, 0),
b'word.class' : (1, 'scalar_text', 0, 0),
b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
b'word.lastGlyph' : (1, 'scalar_number', 0, 0),
b'_span' : (1, 'snippets', 1, 0),
b'_span.class' : (1, 'scalar_text', 0, 0),
b'_span.firstWord' : (1, 'scalar_number', 0, 0),
b'_span.lastWord' : (1, 'scalar_number', 0, 0),
b'_span.gridSize' : (1, 'scalar_number', 0, 0),
b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'span' : (1, 'snippets', 1, 0),
b'span.firstWord' : (1, 'scalar_number', 0, 0),
b'span.lastWord' : (1, 'scalar_number', 0, 0),
b'span.gridSize' : (1, 'scalar_number', 0, 0),
b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'extratokens' : (1, 'snippets', 1, 0),
b'extratokens.class' : (1, 'scalar_text', 0, 0),
b'extratokens.type' : (1, 'scalar_text', 0, 0),
b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'glyph.h' : (1, 'number', 0, 0),
b'glyph.w' : (1, 'number', 0, 0),
b'glyph.use' : (1, 'number', 0, 0),
b'glyph.vtx' : (1, 'number', 0, 1),
b'glyph.len' : (1, 'number', 0, 1),
b'glyph.dpi' : (1, 'number', 0, 0),
b'vtx' : (0, 'number', 1, 1),
b'vtx.x' : (1, 'number', 0, 0),
b'vtx.y' : (1, 'number', 0, 0),
b'len' : (0, 'number', 1, 1),
b'len.n' : (1, 'number', 0, 0),
b'book' : (1, 'snippets', 1, 0),
b'version' : (1, 'snippets', 1, 0),
b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.Schema_id' : (1, 'scalar_text', 0, 0),
b'version.Schema_version' : (1, 'scalar_text', 0, 0),
b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
b'version.creation_date' : (1, 'scalar_text', 0, 0),
b'version.header_footer' : (1, 'scalar_text', 0, 0),
b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
b'version.findlists' : (1, 'scalar_text', 0, 0),
b'version.page_num' : (1, 'scalar_text', 0, 0),
b'version.page_type' : (1, 'scalar_text', 0, 0),
b'version.bad_text' : (1, 'scalar_text', 0, 0),
b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
b'version.margins' : (1, 'scalar_text', 0, 0),
b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
b'version.toc' : (1, 'scalar_text', 0, 0),
b'stylesheet' : (1, 'snippets', 1, 0),
b'style' : (1, 'snippets', 1, 0),
b'style._tag' : (1, 'scalar_text', 0, 0),
b'style.type' : (1, 'scalar_text', 0, 0),
b'style._after_type' : (1, 'scalar_text', 0, 0),
b'style._parent_type' : (1, 'scalar_text', 0, 0),
b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
b'style.class' : (1, 'scalar_text', 0, 0),
b'style._after_class' : (1, 'scalar_text', 0, 0),
b'rule' : (1, 'snippets', 1, 0),
b'rule.attr' : (1, 'scalar_text', 0, 0),
b'rule.value' : (1, 'scalar_text', 0, 0),
b'original' : (0, 'number', 1, 1),
b'original.pnum' : (1, 'number', 0, 0),
b'original.pid' : (1, 'text', 0, 0),
b'pages' : (0, 'number', 1, 1),
b'pages.ref' : (1, 'number', 0, 0),
b'pages.id' : (1, 'number', 0, 0),
b'startID' : (0, 'number', 1, 1),
b'startID.page' : (1, 'number', 0, 0),
b'startID.id' : (1, 'number', 0, 0),
b'median_d' : (1, 'number', 0, 0),
b'median_h' : (1, 'number', 0, 0),
b'median_firsty' : (1, 'number', 0, 0),
b'median_lasty' : (1, 'number', 0, 0),
b'num_footers_maybe' : (1, 'number', 0, 0),
b'num_footers_yes' : (1, 'number', 0, 0),
b'num_headers_maybe' : (1, 'number', 0, 0),
b'num_headers_yes' : (1, 'number', 0, 0),
b'tracking' : (1, 'number', 0, 0),
b'src' : (1, 'text', 0, 0),
}
@@ -430,7 +430,7 @@ class PageParser(object):
cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i]
for j in range(i+1, cnt) :
result += '.' + self.tagpath[j]
result += b'.' + self.tagpath[j]
return result
@@ -505,7 +505,7 @@ class PageParser(object):
if (subtags == 1):
ntags = readEncodedNumber(self.fo)
if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
if self.debug : print('subtags: ', token , ' has ' , str(ntags))
for j in range(ntags):
val = readEncodedNumber(self.fo)
subtagres.append(self.procToken(self.dict.lookup(val)))
@@ -613,7 +613,7 @@ class PageParser(object):
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nname = prefix + '.' + name
nname = prefix + b'.' + name
nsubtaglist = []
for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix))
@@ -662,34 +662,34 @@ class PageParser(object):
subtagList = node[1]
argtype = node[2]
argList = node[3]
fullpathname = name.split('.')
fullpathname = name.split(b'.')
nodename = fullpathname.pop()
ilvl = len(fullpathname)
indent = ' ' * (3 * ilvl)
indent = b' ' * (3 * ilvl)
rlst = []
rlst.append(indent + '<' + nodename + '>')
rlst.append(indent + b'<' + nodename + b'>')
if len(argList) > 0:
alst = []
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
alst.append(j + '|')
if (argtype == b'text') or (argtype == b'scalar_text') :
alst.append(j + b'|')
else :
alst.append(str(j) + ',')
argres = "".join(alst)
alst.append(str(j).encode('utf-8') + b',')
argres = b"".join(alst)
argres = argres[0:-1]
if argtype == 'snippets' :
rlst.append('snippets:' + argres)
if argtype == b'snippets' :
rlst.append(b'snippets:' + argres)
else :
rlst.append(argres)
if len(subtagList) > 0 :
rlst.append('\n')
rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.formatTag(j))
rlst.append(indent + '</' + nodename + '>\n')
rlst.append(indent + b'</' + nodename + b'>\n')
else:
rlst.append('</' + nodename + '>\n')
return "".join(rlst)
rlst.append(b'</' + nodename + b'>\n')
return b"".join(rlst)
# flatten tag
@@ -704,20 +704,20 @@ class PageParser(object):
alst = []
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
alst.append(j + '|')
alst.append(j + b'|')
else :
alst.append(str(j) + '|')
argres = "".join(alst)
alst.append(str(j).encode('utf-8') + b'|')
argres = b"".join(alst)
argres = argres[0:-1]
if argtype == 'snippets' :
rlst.append('.snippets=' + argres)
if argtype == b'snippets' :
rlst.append(b'.snippets=' + argres)
else :
rlst.append('=' + argres)
rlst.append('\n')
rlst.append(b'=' + argres)
rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.flattenTag(j))
return "".join(rlst)
return b"".join(rlst)
# reduce create xml output
@@ -729,7 +729,7 @@ class PageParser(object):
rlst.append(self.flattenTag(j))
else:
rlst.append(self.formatTag(j))
result = "".join(rlst)
result = b"".join(rlst)
if self.debug : print(result)
return result
@@ -747,16 +747,16 @@ class PageParser(object):
# peek at the first bytes to see what type of file it is
magic = self.fo.read(9)
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
first_token = b'info'
elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
skip = self.fo.read(2)
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
first_token = b'info'
elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
first_token = b'info'
elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
skip = self.fo.read(3)
first_token = 'info'
first_token = b'info'
else :
# other0.dat file
first_token = None
@@ -778,7 +778,7 @@ class PageParser(object):
break
if (v == 0x72):
self.doLoop72('number')
self.doLoop72(b'number')
elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 :
@@ -789,7 +789,7 @@ class PageParser(object):
if (v == 0):
if (self.peek(1) == 0x5f):
skip = self.fo.read(1)
first_token = 'info'
first_token = b'info'
# now do snippet injection
if len(self.snippetList) > 0 :
@@ -809,14 +809,14 @@ class PageParser(object):
def fromData(dict, fname):
flat_xml = True
debug = False
debug = True
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
def getXML(dict, fname):
flat_xml = False
debug = False
debug = True
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
@@ -845,7 +845,7 @@ def main(argv):
sys.stderr=SafeUnbuffered(sys.stderr)
dictFile = ""
pageFile = ""
debug = False
debug = True
flat_xml = False
printOutput = False
if len(argv) == 0:

@@ -7,6 +7,7 @@ import csv
import os
import math
import getopt
import functools
from struct import pack
from struct import unpack
@@ -15,14 +16,14 @@ class DocParser(object):
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0
self.docList = flatxml.split('\n')
self.docList = flatxml.split(b'\n')
self.docSize = len(self.docList)
self.classList = {}
self.bookDir = bookDir
self.gdict = gdict
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
if pclass != b'':
# remove the leading period from the css name
cname = pclass[1:]
self.classList[cname] = True
@@ -57,9 +58,9 @@ class DocParser(object):
imgfile = os.path.join(imgDir,imgname)
# get glyph information
gxList = self.getData('info.glyph.x',0,-1)
gyList = self.getData('info.glyph.y',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
gxList = self.getData(b'info.glyph.x',0,-1)
gyList = self.getData(b'info.glyph.y',0,-1)
gidList = self.getData(b'info.glyph.glyphID',0,-1)
gids = []
maxws = []
@@ -122,11 +123,11 @@ class DocParser(object):
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
item = self.docList[pos]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
argres = b''
return name, argres
@@ -140,11 +141,13 @@ class DocParser(object):
foundat = -1
for j in range(pos, end):
item = self.docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) :
result = argres
foundat = j
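
findinDoc now compares against a bytes document, so the new isinstance guard transparently encodes str tagpaths from callers that have not yet been migrated. The coercion in isolation:

def normalize_tagpath(tagpath):
    # mirror of the guard added above: the flattened document lines are
    # bytes, so a str tagpath must be encoded before the endswith() check
    if isinstance(tagpath, str):
        tagpath = tagpath.encode('utf-8')
    return tagpath

print(b'page.region.type'.endswith(normalize_tagpath('region.type')))   # True
print(b'page.region.type'.endswith(normalize_tagpath(b'region.type')))  # True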
@@ -170,7 +173,7 @@ class DocParser(object):
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argList = argt.split(b'|')
argres = [ int(strval) for strval in argList]
return argres
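
Splitting on b'|' leaves bytes chunks, but the int() comprehension works unchanged, since int() accepts ASCII-digit bytes in Python 3. For example:

argt = b'12|34|56'                                     # illustrative packed value list
print([int(strval) for strval in argt.split(b'|')])    # [12, 34, 56]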
@@ -191,21 +194,21 @@ class DocParser(object):
# also some class names have spaces in them so need to convert to dashes
if nclass != None :
nclass = nclass.replace(' ','-')
classres = ''
nclass = nclass.replace(b' ',b'-')
classres = b''
nclass = nclass.lower()
nclass = 'cl-' + nclass
baseclass = ''
nclass = b'cl-' + nclass
baseclass = b''
# graphic is the base class for captions
if nclass.find('cl-cap-') >=0 :
classres = 'graphic' + ' '
if nclass.find(b'cl-cap-') >=0 :
classres = b'graphic' + b' '
else :
# strip to find baseclass
p = nclass.find('_')
p = nclass.find(b'_')
if p > 0 :
baseclass = nclass[0:p]
if baseclass in self.classList:
classres += baseclass + ' '
classres += baseclass + b' '
classres += nclass
nclass = classres
return nclass
@@ -225,11 +228,11 @@ class DocParser(object):
return -1
result = []
(pos, pagetype) = self.findinDoc('page.type',0,-1)
(pos, pagetype) = self.findinDoc(b'page.type',0,-1)
groupList = self.posinDoc('page.group')
groupregionList = self.posinDoc('page.group.region')
pageregionList = self.posinDoc('page.region')
groupList = self.posinDoc(b'page.group')
groupregionList = self.posinDoc(b'page.group.region')
pageregionList = self.posinDoc(b'page.region')
# integrate into one list
for j in groupList:
result.append(('grpbeg',j))
@ -237,7 +240,7 @@ class DocParser(object):
result.append(('gregion',j))
for j in pageregionList:
result.append(('pregion',j))
result.sort(compare)
result.sort(key=functools.cmp_to_key(compare))
# insert group end and page end indicators
inGroup = False
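
Python 3 removed the cmp argument to list.sort(), so the old two-argument compare function is wrapped with functools.cmp_to_key (hence the new import at the top of the file). A self-contained sketch with a hypothetical stand-in for this module's compare():

import functools

def compare(a, b):
    # hypothetical stand-in: order (kind, position) entries by position
    return (a[1] > b[1]) - (a[1] < b[1])

result = [('pregion', 40), ('grpbeg', 10), ('gregion', 25)]
result.sort(key=functools.cmp_to_key(compare))
print(result)   # [('grpbeg', 10), ('gregion', 25), ('pregion', 40)]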
@@ -267,33 +270,33 @@ class DocParser(object):
result = []
# paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
(pos, pclass) = self.findinDoc(b'paragraph.class',start,end)
pclass = self.getClass(pclass)
# if paragraph uses extratokens (extra glyphs) then make it fixed
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
(pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
(pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
makeImage = (regtype == b'vertical') or (regtype == b'table')
makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
makeImage = makeImage or (regtype == b'fixed')
if (pclass != None):
makeImage = makeImage or (pclass.find('.inverted') >= 0)
makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
if self.fixedimage :
makeImage = makeImage or (pclass.find('cl-f-') >= 0)
makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)
# before creating an image make sure glyph info exists
gidList = self.getData('info.glyph.glyphID',0,-1)
gidList = self.getData(b'info.glyph.glyphID',0,-1)
makeImage = makeImage & (len(gidList) > 0)
@@ -307,8 +310,8 @@ class DocParser(object):
# translate first and last word into first and last glyphs
# and generate inline image and include it
glyphList = []
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
firstglyphList = self.getData(b'word.firstGlyph',0,-1)
gidList = self.getData(b'info.glyph.glyphID',0,-1)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
@@ -326,8 +329,8 @@ class DocParser(object):
for glyphnum in range(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
# include any extratokens if they exist
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
(pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
(pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
if (sfg != None) and (slg != None):
for glyphnum in range(int(sfg), int(slg)):
glyphList.append(glyphnum)
@@ -368,39 +371,39 @@ class DocParser(object):
(name, argres) = self.lineinDoc(line)
if name.endswith('span.firstWord') :
if name.endswith(b'span.firstWord') :
sp_first = int(argres)
elif name.endswith('span.lastWord') :
elif name.endswith(b'span.lastWord') :
sp_last = int(argres)
elif name.endswith('word.firstGlyph') :
elif name.endswith(b'word.firstGlyph') :
gl_first = int(argres)
elif name.endswith('word.lastGlyph') :
elif name.endswith(b'word.lastGlyph') :
gl_last = int(argres)
elif name.endswith('word_semantic.firstWord'):
elif name.endswith(b'word_semantic.firstWord'):
ws_first = int(argres)
elif name.endswith('word_semantic.lastWord'):
elif name.endswith(b'word_semantic.lastWord'):
ws_last = int(argres)
elif name.endswith('word.class'):
elif name.endswith(b'word.class'):
# we only handle spaceafter word class
try:
(cname, space) = argres.split('-',1)
if space == '' : space = '0'
if (cname == 'spaceafter') and (int(space) > 0) :
(cname, space) = argres.split(b'-',1)
if space == b'' : space = b'0'
if (cname == b'spaceafter') and (int(space) > 0) :
word_class = 'sa'
except:
pass
elif name.endswith('word.img.src'):
elif name.endswith(b'word.img.src'):
result.append(('img' + word_class, int(argres)))
word_class = ''
elif name.endswith('region.img.src'):
elif name.endswith(b'region.img.src'):
result.append(('img' + word_class, int(argres)))
if (sp_first != -1) and (sp_last != -1):
@@ -437,7 +440,7 @@ class DocParser(object):
classres = ''
if pclass :
classres = ' class="' + pclass + '"'
classres = ' class="' + pclass.decode('utf-8') + '"'
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
@@ -470,8 +473,8 @@ class DocParser(object):
if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1]
if (title == "") or (parares.rfind(title) < 0):
title=parares[lstart:]
if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
title=parares[lstart:].encode('utf-8')
if linktype == 'external' :
linkhref = self.link_href[link-1]
linkhtml = '<a href="%s">' % linkhref
@@ -482,33 +485,34 @@ class DocParser(object):
else :
# just link to the current page
linkhtml = '<a href="#' + self.id + '">'
linkhtml += title + '</a>'
pos = parares.rfind(title)
linkhtml += title.decode('utf-8')
linkhtml += '</a>'
pos = parares.rfind(title.decode('utf-8'))
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
lstart = len(parares)
if word == '_link_' : word = ''
if word == b'_link_' : word = b''
elif (link < 0) :
if word == '_link_' : word = ''
if word == b'_link_' : word = b''
if word == '_lb_':
if word == b'_lb_':
if ((num-1) in self.dehyphen_rootid ) or handle_links:
word = ''
word = b''
sep = ''
elif br_lb :
word = '<br />\n'
word = b'<br />\n'
sep = ''
else :
word = '\n'
word = b'\n'
sep = ''
if num in self.dehyphen_rootid :
word = word[0:-1]
sep = ''
parares += word + sep
parares += word.decode('utf-8') + sep
elif wtype == 'img' :
sep = ''
@@ -522,7 +526,9 @@ class DocParser(object):
elif wtype == 'svg' :
sep = ''
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
parares += '<img src="img/'
parares += self.id
parares += '_%04d.svg" alt="" />' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
@@ -545,7 +551,7 @@ class DocParser(object):
(wtype, num) = pdesc[j]
if wtype == 'ocr' :
word = self.ocrtext[num]
word = self.ocrtext[num].decode('utf-8')
sep = ' '
if handle_links:
@@ -553,7 +559,7 @@ class DocParser(object):
if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1]
title = title.rstrip('. ')
title = title.rstrip(b'. ')
alt_title = parares[lstart:]
alt_title = alt_title.strip()
# now strip off the actual printed page number
@@ -607,38 +613,38 @@ class DocParser(object):
hlst = []
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
if argres : self.ocrtext = argres.split('|')
(pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
if argres : self.ocrtext = argres.split(b'|')
# get information to dehyphenate the text
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
(pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
first_para_continued = (self.parastems_stemid != None)
# determine if last paragraph is continued onto the next page
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
(pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
last_para_continued = (self.paracont_stemid != None)
# collect link ids
self.link_id = self.getData('info.word.link_id',0,-1)
self.link_id = self.getData(b'info.word.link_id',0,-1)
# collect link destination page numbers
self.link_page = self.getData('info.links.page',0,-1)
self.link_page = self.getData(b'info.links.page',0,-1)
# collect link types (container versus external)
(pos, argres) = self.findinDoc('info.links.type',0,-1)
if argres : self.link_type = argres.split('|')
(pos, argres) = self.findinDoc(b'info.links.type',0,-1)
if argres : self.link_type = argres.split(b'|')
# collect link destinations
(pos, argres) = self.findinDoc('info.links.href',0,-1)
if argres : self.link_href = argres.split('|')
(pos, argres) = self.findinDoc(b'info.links.href',0,-1)
if argres : self.link_href = argres.split(b'|')
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
(pos, argres) = self.findinDoc(b'info.links.title',0,-1)
if argres :
self.link_title = argres.split('|')
self.link_title = argres.split(b'|')
else:
self.link_title.append('')
@@ -662,51 +668,51 @@ class DocParser(object):
# set anchor for link target on this page
if not anchorSet and not first_para_continued:
hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n')
hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
anchorSet = True
# handle groups of graphics with text captions
if (etype == 'grpbeg'):
(pos, grptype) = self.findinDoc('group.type', start, end)
if (etype == b'grpbeg'):
(pos, grptype) = self.findinDoc(b'group.type', start, end)
if grptype != None:
if grptype == 'graphic':
gcstr = ' class="' + grptype + '"'
if grptype == b'graphic':
gcstr = ' class="' + grptype.decode('utf-8') + '"'
hlst.append('<div' + gcstr + '>')
inGroup = True
elif (etype == 'grpend'):
elif (etype == b'grpend'):
if inGroup:
hlst.append('</div>\n')
inGroup = False
else:
(pos, regtype) = self.findinDoc('region.type',start,end)
(pos, regtype) = self.findinDoc(b'region.type',start,end)
if regtype == 'graphic' :
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if regtype == b'graphic' :
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc:
if inGroup:
hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
else:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
elif regtype == 'chapterheading' :
elif regtype == b'chapterheading' :
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not breakSet:
hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
breakSet = True
tag = 'h1'
if pclass and (len(pclass) >= 7):
if pclass[3:7] == 'ch1-' : tag = 'h1'
if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3'
hlst.append('<' + tag + ' class="' + pclass + '">')
if pclass[3:7] == b'ch1-' : tag = 'h1'
if pclass[3:7] == b'ch2-' : tag = 'h2'
if pclass[3:7] == b'ch3-' : tag = 'h3'
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
else:
hlst.append('<' + tag + '>')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>')
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
ptype = 'full'
# check to see if this is a continuation from the previous page
if first_para_continued :
@@ -715,16 +721,16 @@ class DocParser(object):
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass + '">')
if pclass[3:6] == b'h1-' : tag = 'h4'
if pclass[3:6] == b'h2-' : tag = 'h5'
if pclass[3:6] == b'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>')
else :
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'tocentry') :
elif (regtype == b'tocentry') :
ptype = 'full'
if first_para_continued :
ptype = 'end'
@@ -733,7 +739,7 @@ class DocParser(object):
tocinfo += self.buildTOCEntry(pdesc)
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'vertical') or (regtype == 'table') :
elif (regtype == b'vertical') or (regtype == b'table') :
ptype = 'full'
if inGroup:
ptype = 'middle'
@@ -744,19 +750,19 @@ class DocParser(object):
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'synth_fcvr.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end)
elif (regtype == b'synth_fcvr.center'):
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
else :
print(' Making region type', regtype, end=' ')
(pos, temp) = self.findinDoc('paragraph',start,end)
(pos2, temp) = self.findinDoc('span',start,end)
(pos, temp) = self.findinDoc(b'paragraph',start,end)
(pos2, temp) = self.findinDoc(b'span',start,end)
if pos != -1 or pos2 != -1:
print(' a "text" region')
orig_regtype = regtype
regtype = 'fixed'
regtype = b'fixed'
ptype = 'full'
# check to see if this is a continuation from the previous page
if first_para_continued :
@@ -764,23 +770,23 @@ class DocParser(object):
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not pclass:
if orig_regtype.endswith('.right') : pclass = 'cl-right'
elif orig_regtype.endswith('.center') : pclass = 'cl-center'
elif orig_regtype.endswith('.left') : pclass = 'cl-left'
elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
if orig_regtype.endswith(b'.right') : pclass = 'cl-right'
elif orig_regtype.endswith(b'.center') : pclass = 'cl-center'
elif orig_regtype.endswith(b'.left') : pclass = 'cl-left'
elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass + '">')
if pclass[3:6] == b'h1-' : tag = 'h4'
if pclass[3:6] == b'h2-' : tag = 'h5'
if pclass[3:6] == b'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>')
else :
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
else :
print(' a "graphic" region')
(pos, simgsrc) = self.findinDoc('img.src',start,end)
(pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

@@ -12,7 +12,7 @@ from struct import unpack
class PParser(object):
def __init__(self, gd, flatxml, meta_array):
self.gd = gd
self.flatdoc = flatxml.split('\n')
self.flatdoc = flatxml.split(b'\n')
self.docSize = len(self.flatdoc)
self.temp = []
@@ -58,11 +58,11 @@ class PParser(object):
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
item = self.flatdoc[pos]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
argres = b''
return name, argres
# find tag in doc if within pos to end inclusive
@@ -75,11 +75,13 @@ class PParser(object):
foundat = -1
for j in range(pos, end):
item = self.flatdoc[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
argres = b''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) :
result = argres
foundat = j
@@ -103,9 +105,9 @@ class PParser(object):
cnt = len(self.flatdoc)
for j in range(cnt):
item = self.flatdoc[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
if item.find(b'=') >= 0:
(name, argt) = item.split(b'=')
argres = argt.split(b'|')
else:
name = item
argres = []
@@ -120,15 +122,17 @@ class PParser(object):
def getDataatPos(self, path, pos):
result = None
item = self.flatdoc[pos]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
if item.find(b'=') >= 0:
(name, argt) = item.split(b'=')
argres = argt.split(b'|')
else:
name = item
argres = []
if (len(argres) > 0) :
for j in range(0,len(argres)):
argres[j] = int(argres[j])
if (isinstance(path,str)):
path = path.encode('utf-8')
if (name.endswith(path)):
result = argres
return result
@@ -138,12 +142,14 @@ class PParser(object):
cnt = len(self.temp)
for j in range(cnt):
item = self.temp[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
if item.find(b'=') >= 0:
(name, argt) = item.split(b'=')
argres = argt.split(b'|')
else:
name = item
argres = []
if (isinstance(path,str)):
path = path.encode('utf-8')
if (name.endswith(path)):
result = argres
self.temp.pop(j)

@@ -44,10 +44,10 @@ if inCalibre :
from calibre_plugins.dedrm import flatxml2svg
from calibre_plugins.dedrm import stylexml2css
else :
from . import convert2xml
from . import flatxml2html
from . import flatxml2svg
from . import stylexml2css
import convert2xml
import flatxml2html
import flatxml2svg
import stylexml2css
# global switch
buildXML = False
@@ -117,10 +117,10 @@ class Dictionary(object):
self.stable.append(self.escapestr(readString(self.fo)))
self.pos = 0
def escapestr(self, str):
str = str.replace('&','&amp;')
str = str.replace('<','&lt;')
str = str.replace('>','&gt;')
str = str.replace('=','&#61;')
str = str.replace(b'&',b'&amp;')
str = str.replace(b'<',b'&lt;')
str = str.replace(b'>',b'&gt;')
str = str.replace(b'=',b'&#61;')
return str
def lookup(self,val):
if ((val >= 0) and (val < self.size)) :
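
The dictionary strings are now read as bytes, so the XML escaping switches to bytes literals; '&' is still replaced first so the entities it inserts are not themselves re-escaped. Standalone:

def escapestr(s):
    # bytes-level XML escaping, same replacement order as the patched method
    s = s.replace(b'&', b'&amp;')
    s = s.replace(b'<', b'&lt;')
    s = s.replace(b'>', b'&gt;')
    s = s.replace(b'=', b'&#61;')
    return s

print(escapestr(b'a<b & c=d'))   # b'a&lt;b &amp; c&#61;d'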
@@ -138,7 +138,7 @@ class Dictionary(object):
class PageDimParser(object):
def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n')
self.flatdoc = flatxml.split(b'\n')
# find tag if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) :
result = None
@@ -151,8 +151,8 @@ class PageDimParser(object):
foundat = -1
for j in range(pos, end):
item = docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=')
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=')
else :
name = item
argres = ''
@@ -162,8 +162,8 @@ class PageDimParser(object):
break
return foundat, result
def process(self):
(pos, sph) = self.findinDoc('page.h',0,-1)
(pos, spw) = self.findinDoc('page.w',0,-1)
(pos, sph) = self.findinDoc(b'page.h',0,-1)
(pos, spw) = self.findinDoc(b'page.w',0,-1)
if (sph == None): sph = '-1'
if (spw == None): spw = '-1'
return sph, spw
@@ -176,21 +176,21 @@ def getPageDim(flatxml):
class GParser(object):
def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n')
self.flatdoc = flatxml.split(b'\n')
self.dpi = 1440
self.gh = self.getData('info.glyph.h')
self.gw = self.getData('info.glyph.w')
self.guse = self.getData('info.glyph.use')
self.gh = self.getData(b'info.glyph.h')
self.gw = self.getData(b'info.glyph.w')
self.guse = self.getData(b'info.glyph.use')
if self.guse :
self.count = len(self.guse)
else :
self.count = 0
self.gvtx = self.getData('info.glyph.vtx')
self.glen = self.getData('info.glyph.len')
self.gdpi = self.getData('info.glyph.dpi')
self.vx = self.getData('info.vtx.x')
self.vy = self.getData('info.vtx.y')
self.vlen = self.getData('info.len.n')
self.gvtx = self.getData(b'info.glyph.vtx')
self.glen = self.getData(b'info.glyph.len')
self.gdpi = self.getData(b'info.glyph.dpi')
self.vx = self.getData(b'info.vtx.x')
self.vy = self.getData(b'info.vtx.y')
self.vlen = self.getData(b'info.len.n')
if self.vlen :
self.glen.append(len(self.vlen))
elif self.glen:
@@ -204,9 +204,9 @@ class GParser(object):
cnt = len(self.flatdoc)
for j in range(cnt):
item = self.flatdoc[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
if item.find(b'=') >= 0:
(name, argt) = item.split(b'=')
argres = argt.split(b'|')
else:
name = item
argres = []
@@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage):
# now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
open(xname, 'wb').write(cssstr)
open(xname, 'w').write(cssstr)
if buildXML:
xname = os.path.join(xmlDir, 'other0000.xml')
open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
@@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage):
hlst.append('</body>\n</html>\n')
htmlstr = "".join(hlst)
hlst = None
open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr)
print(" ")
print('Extracting Table of Contents from Amazon OCR')
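
With the HTML now assembled from str pieces, the output file switches from binary to text mode: Python 3's file objects reject str in 'wb' mode and bytes in 'w' mode. The rule in miniature (paths here are illustrative):

htmlstr = "".join(['<html>', '</html>'])          # str pieces -> str result
open('/tmp/example.xhtml', 'w').write(htmlstr)    # text mode for str

svgdata = b"".join([b'<svg>', b'</svg>'])         # bytes pieces -> bytes result
open('/tmp/example.svg', 'wb').write(svgdata)     # binary mode for bytes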
@@ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage):
tlst.append('</body>\n')
tlst.append('</html>\n')
tochtml = "".join(tlst)
open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml)
# now create index_svg.xhtml that points to all required files
@@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage):
flst = []
for page in pagelst:
flst.append(xmllst[page])
flat_svg = "".join(flst)
flat_svg = b"".join(flst)
flst=None
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
if (raw) :
@@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage):
slst.append('</body>\n</html>\n')
svgindex = "".join(slst)
slst = None
open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex)
print(" ")
@@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
# adding metadata
olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
if 'GUID' in meta_array:
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n')
if 'ASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n')
if 'oASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n')
olst.append(' <dc:title>' + meta_array['Title'] + '</dc:title>\n')
olst.append(' <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n')
if b'GUID' in meta_array:
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n')
if b'ASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n')
if b'oASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n')
olst.append(' <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
olst.append(' <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n')
olst.append(' <dc:language>en</dc:language>\n')
olst.append(' <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n')
olst.append(' <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n')
if isCover:
olst.append(' <meta name="cover" content="bookcover"/>\n')
olst.append(' </metadata>\n')
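
meta_array now carries bytes keys and bytes values, so the OPF builder looks up b'...' keys and decodes each value before concatenating with str. The shape of the change, with made-up metadata:

meta_array = {b'Title': b'Example Title', b'Authors': b'A. N. Author'}   # illustrative values
olst = []
if b'Title' in meta_array:
    # decode the bytes value so it can join the surrounding str literals
    olst.append(' <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
print("".join(olst))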
@@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('</package>\n')
opfstr = "".join(olst)
olst = None
open(opfname, 'wb').write(opfstr)
open(opfname, 'w').write(opfstr)
print('Processing Complete')

@@ -49,14 +49,15 @@ def SHA1(message):
# Encode the bytes in data with the characters in map
# data and map should be byte arrays
def encode(data, map):
result = ''
result = b''
for char in data:
value = ord(char)
value = char
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
result += bytes([map[Q]])
result += bytes([map[R]])
return result
# Hash the bytes in data and then encode the digest with the characters in map
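
Two Python 3 changes meet in encode(): iterating over bytes yields ints (so ord() goes away), and a single value is turned back into a one-byte bytes object with bytes([...]). Runnable with a made-up 16-character map:

def encode(data, map):
    # data and map are bytes; each input byte becomes two mapped output bytes
    result = b''
    for char in data:
        value = char                     # already an int in Python 3
        Q = (value ^ 0x80) // len(map)
        R = value % len(map)
        result += bytes([map[Q]])
        result += bytes([map[R]])
    return result

print(encode(b'\x00\x01', b'0123456789ABCDEF'))   # b'8081'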
@@ -117,7 +118,7 @@ def generatePidEncryptionTable() :
def generatePidSeed(table,dsn) :
value = 0
for counter in range (0,4) :
index = (ord(dsn[counter]) ^ value) &0xFF
index = (dsn[counter] ^ value) & 0xFF
value = (value >> 8) ^ table[index]
return value
@@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll):
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
index = 0
for counter in range (0,nbRoll):
pid[index] = pid[index] ^ ord(dsn[counter])
pid[index] = pid[index] ^ dsn[counter]
index = (index+1) %8
for counter in range (0,8):
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
@@ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase):
try:
# Get the kindle account token, if present
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode()
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens'])
except KeyError:
kindleAccountToken=""
@@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase):
try:
# Get the DSN token, if present
DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode()
DSN = bytearray.fromhex((kindleDatabase[1])['DSN'])
print("Got DSN key from database {0}".format(kindleDatabase[0]))
except KeyError:
# See if we have the info to generate the DSN
try:
# Get the Mazama Random number
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode()
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber'])
#print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])
try:
# Get the SerialNumber token, if present
IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode()
IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber'])
print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
except KeyError:
# Get the IDString we added
IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode()
IDString = bytearray.fromhex((kindleDatabase[1])['IDString'])
try:
# Get the UsernameHash token, if present
encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode()
encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash'])
print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
except KeyError:
# Get the UserName we added
UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode()
UserName = bytearray.fromhex((kindleDatabase[1])['UserName'])
# encode it
encodedUsername = encodeHash(UserName,charMap1)
#print "encodedUsername",encodedUsername.encode('hex')
@@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase):
# Compute book PIDs
# book pid
pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token)
pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)
# variant 1
pidHash = SHA1(kindleAccountToken.encode()+rec209+token)
pidHash = SHA1(kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)
# variant 2
pidHash = SHA1(DSN.encode()+rec209+token)
pidHash = SHA1(DSN+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pids.append(bookPID)

@@ -7,7 +7,7 @@
from __future__ import print_function
__license__ = 'GPL v3'
__version__ = "1.00"
__version__ = "1.0"
# This is a python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for windows.
@@ -73,7 +73,7 @@ __version__ = "1.00"
# 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
# 0.41 - Fixed potential unicode problem in command line calls
# 0.42 - Added GPL v3 licence. updated/removed some print statements
# 1.00 - Python 3 compatibility for calibre 5.0
# 1.0 - Python 3 compatibility for calibre 5.0
import sys
import os
@@ -330,7 +330,7 @@ class MobiBook:
}
title = ''
codec = 'windows-1252'
if self.magic == 'BOOKMOBI':
if self.magic == b'BOOKMOBI':
if 503 in self.meta_array:
title = self.meta_array[503]
else:
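
self.magic is read from the PDB header in binary mode, and in Python 3 a bytes value never compares equal to a str, so the old check was silently always False; the literal becomes b'BOOKMOBI'. Illustration:

magic = b'BOOKMOBI'            # as read from a file opened in 'rb' mode
print(magic == 'BOOKMOBI')     # False in Python 3: bytes != str, and no error is raised
print(magic == b'BOOKMOBI')    # True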

@@ -15,36 +15,36 @@ debug = False
class DocParser(object):
def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n')
self.flatdoc = flatxml.split(b'\n')
self.fontsize = int(fontsize)
self.ph = int(ph) * 1.0
self.pw = int(pw) * 1.0
stags = {
'paragraph' : 'p',
'graphic' : '.graphic'
b'paragraph' : 'p',
b'graphic' : '.graphic'
}
attr_val_map = {
'hang' : 'text-indent: ',
'indent' : 'text-indent: ',
'line-space' : 'line-height: ',
'margin-bottom' : 'margin-bottom: ',
'margin-left' : 'margin-left: ',
'margin-right' : 'margin-right: ',
'margin-top' : 'margin-top: ',
'space-after' : 'padding-bottom: ',
b'hang' : 'text-indent: ',
b'indent' : 'text-indent: ',
b'line-space' : 'line-height: ',
b'margin-bottom' : 'margin-bottom: ',
b'margin-left' : 'margin-left: ',
b'margin-right' : 'margin-right: ',
b'margin-top' : 'margin-top: ',
b'space-after' : 'padding-bottom: ',
}
attr_str_map = {
'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
'align-left' : 'text-align: left;',
'align-right' : 'text-align: right;',
'align-justify' : 'text-align: justify;',
'display-inline' : 'display: inline;',
'pos-left' : 'text-align: left;',
'pos-right' : 'text-align: right;',
'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
b'align-left' : 'text-align: left;',
b'align-right' : 'text-align: right;',
b'align-justify' : 'text-align: justify;',
b'display-inline' : 'display: inline;',
b'pos-left' : 'text-align: left;',
b'pos-right' : 'text-align: right;',
b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
}
@@ -60,11 +60,13 @@ class DocParser(object):
foundat = -1
for j in range(pos, end):
item = docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
if item.find(b'=') >= 0:
(name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
argres = b''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) :
result = argres
foundat = j
@@ -76,7 +78,7 @@ class DocParser(object):
def posinDoc(self, tagpath):
startpos = []
pos = 0
res = ""
res = b""
while res != None :
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
if res != None :
@@ -87,11 +89,11 @@ class DocParser(object):
# returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end, clean=False):
if clean:
digits_only = re.compile(r'''([0-9]+)''')
digits_only = re.compile(rb'''([0-9]+)''')
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argList = argt.split(b'|')
for strval in argList:
if clean:
m = re.search(digits_only, strval)
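
re refuses to mix str patterns with bytes subjects, so the digits-only pattern becomes a bytes regex (rb'...') to match the bytes values produced by split(b'|'). For example:

import re

digits_only = re.compile(rb'''([0-9]+)''')
m = re.search(digits_only, b'12.5%')
print(m.group(1))    # b'12'
# re.search(r'([0-9]+)', b'12.5%') would raise TypeError (str pattern on bytes data)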
@@ -109,7 +111,7 @@ class DocParser(object):
csspage += '.cl-justify { text-align: justify; }\n'
# generate a list of each <style> starting point in the stylesheet
styleList= self.posinDoc('book.stylesheet.style')
styleList= self.posinDoc(b'book.stylesheet.style')
stylecnt = len(styleList)
styleList.append(-1)
@@ -121,30 +123,30 @@ class DocParser(object):
start = styleList[j]
end = styleList[j+1]
(pos, tag) = self.findinDoc('style._tag',start,end)
(pos, tag) = self.findinDoc(b'style._tag',start,end)
if tag == None :
(pos, tag) = self.findinDoc('style.type',start,end)
(pos, tag) = self.findinDoc(b'style.type',start,end)
# Is this something we know how to convert to css
if tag in self.stags :
# get the style class
(pos, sclass) = self.findinDoc('style.class',start,end)
(pos, sclass) = self.findinDoc(b'style.class',start,end)
if sclass != None:
sclass = sclass.replace(' ','-')
sclass = '.cl-' + sclass.lower()
sclass = sclass.replace(b' ',b'-')
sclass = b'.cl-' + sclass.lower()
else :
sclass = ''
sclass = b''
if debug: print('sclass', sclass)
# check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end)
(pos, aftclass) = self.findinDoc(b'style._after_class',start,end)
if aftclass != None:
aftclass = aftclass.replace(' ','-')
aftclass = '.cl-' + aftclass.lower()
aftclass = aftclass.replace(b' ',b'-')
aftclass = b'.cl-' + aftclass.lower()
else :
aftclass = ''
aftclass = b''
if debug: print('aftclass', aftclass)
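The class-name cleanup now passes bytes arguments to replace(), since bytes methods reject str arguments under Python 3. A compressed sketch of the transformation (sample class name illustrative):

# replace() and lower() both operate directly on bytes.
sclass = b'Body Text'
sclass = b'.cl-' + sclass.replace(b' ', b'-').lower()
print(sclass)  # b'.cl-body-text'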
@ -152,34 +154,37 @@ class DocParser(object):
while True :
(pos1, attr) = self.findinDoc('style.rule.attr', start, end)
(pos2, val) = self.findinDoc('style.rule.value', start, end)
(pos1, attr) = self.findinDoc(b'style.rule.attr', start, end)
(pos2, val) = self.findinDoc(b'style.rule.value', start, end)
if debug: print('attr', attr)
if debug: print('val', val)
if attr == None : break
if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
if (attr == b'display') or (attr == b'pos') or (attr == b'align'):
# handle text based attributes
attr = attr + '-' + val
attr = attr + b'-' + val
if attr in self.attr_str_map :
cssargs[attr] = (self.attr_str_map[attr], '')
cssargs[attr] = (self.attr_str_map[attr], b'')
else :
# handle value based attributes
if attr in self.attr_val_map :
name = self.attr_val_map[attr]
if attr in ('margin-bottom', 'margin-top', 'space-after') :
if attr in (b'margin-bottom', b'margin-top', b'space-after') :
scale = self.ph
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') :
scale = self.pw
elif attr == 'line-space':
elif attr == b'line-space':
scale = self.fontsize * 2.0
else:
print("Scale not defined!")
scale = 1.0
if val == "":
val = 0
if not ((attr == 'hang') and (int(val) == 0)):
if not ((attr == b'hang') and (int(val) == 0)):
try:
f = float(val)
except:
@ -198,32 +203,32 @@ class DocParser(object):
if debug: print('keeping style')
# make sure line-space does not go below 100% or above 300% since
# it can be wacky in some styles
if 'line-space' in cssargs:
seg = cssargs['line-space'][0]
val = cssargs['line-space'][1]
if b'line-space' in cssargs:
seg = cssargs[b'line-space'][0]
val = cssargs[b'line-space'][1]
if val < 1.0: val = 1.0
if val > 3.0: val = 3.0
del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val)
del cssargs[b'line-space']
cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val)
# handle modifications for css style hanging indents
if 'hang' in cssargs:
hseg = cssargs['hang'][0]
hval = cssargs['hang'][1]
del cssargs['hang']
cssargs['hang'] = (self.attr_val_map['hang'], -hval)
if b'hang' in cssargs:
hseg = cssargs[b'hang'][0]
hval = cssargs[b'hang'][1]
del cssargs[b'hang']
cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval)
mval = 0
mseg = 'margin-left: '
mval = hval
if 'margin-left' in cssargs:
mseg = cssargs['margin-left'][0]
mval = cssargs['margin-left'][1]
if b'margin-left' in cssargs:
mseg = cssargs[b'margin-left'][0]
mval = cssargs[b'margin-left'][1]
if mval < 0: mval = 0
mval = hval + mval
cssargs['margin-left'] = (mseg, mval)
if 'indent' in cssargs:
del cssargs['indent']
cssargs[b'margin-left'] = (mseg, mval)
if b'indent' in cssargs:
del cssargs[b'indent']
cssline = sclass + ' { '
for key in iter(cssargs):

@ -173,7 +173,7 @@ def decryptRecord(data,PID):
def decryptDkeyRecord(data,PID):
record = decryptRecord(data,PID)
fields = unpack('3sB8sB8s3s',record)
if fields[0] != 'PID' or fields[5] != 'pid' :
if fields[0] != b'PID' or fields[5] != b'pid' :
raise DrmException("Didn't find PID magic numbers in record")
elif fields[1] != 8 or fields[3] != 8 :
raise DrmException("Record didn't contain correct length fields")
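The 's' format code in struct.unpack() returns bytes on Python 3, so the PID magic values must be compared as bytes. A minimal sketch using the record layout above (field contents illustrative):

from struct import unpack

# '3sB8sB8s3s' consumes 24 bytes; the 's' fields come back as bytes.
record = b'PID\x08AAAAAAAA\x08BBBBBBBBpid'
fields = unpack('3sB8sB8s3s', record)
print(fields[0], fields[5])  # b'PID' b'pid'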
@ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID):
# Decrypt all dkey records (contain the book PID)
def decryptDkeyRecords(data,PID):
nbKeyRecords = ord(data[0])
nbKeyRecords = data[0]
records = []
data = data[1:]
for i in range (0,nbKeyRecords):
length = ord(data[0])
length = data[0]
try:
key = decryptDkeyRecord(data[1:length+1],PID)
records.append(key)
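Dropping the ord() calls is required, not just cosmetic: indexing bytes on Python 3 already yields an int, and ord() raises a TypeError when handed one. A tiny sketch (record contents illustrative):

# data[0] is an int under Python 3; ord(data[0]) would raise TypeError.
data = b'\x02rest-of-record'
nbKeyRecords = data[0]
print(nbKeyRecords)  # 2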
@ -209,7 +209,7 @@ class TopazBook:
self.bookMetadata = {}
self.bookKey = None
magic = unpack('4s',self.fo.read(4))[0]
if magic != 'TPZ0':
if magic != b'TPZ0':
raise DrmException("Parse Error : Invalid Header, not a Topaz file")
self.parseTopazHeaders()
self.parseMetadata()
@ -244,9 +244,9 @@ class TopazBook:
def parseMetadata(self):
# Parse the metadata record from the book payload and return a list of [key,values]
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0])
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0])
tag = bookReadString(self.fo)
if tag != 'metadata' :
if tag != b'metadata' :
raise DrmException("Parse Error : Record Names Don't Match")
flags = ord(self.fo.read(1))
nbRecords = ord(self.fo.read(1))
@ -260,18 +260,18 @@ class TopazBook:
return self.bookMetadata
def getPIDMetaInfo(self):
keysRecord = self.bookMetadata.get('keys','')
keysRecordRecord = ''
if keysRecord != '':
keylst = keysRecord.split(',')
keysRecord = self.bookMetadata.get(b'keys',b'')
keysRecordRecord = b''
if keysRecord != b'':
keylst = keysRecord.split(b',')
for keyval in keylst:
keysRecordRecord += self.bookMetadata.get(keyval,'')
keysRecordRecord += self.bookMetadata.get(keyval,b'')
return keysRecord, keysRecordRecord
def getBookTitle(self):
title = ''
if 'Title' in self.bookMetadata:
title = self.bookMetadata['Title']
title = b''
if b'Title' in self.bookMetadata:
title = self.bookMetadata[b'Title']
return title.decode('utf-8')
def setBookKey(self, key):
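The metadata dictionary is keyed and valued as bytes end to end; the title is decoded only at the boundary where a str is actually wanted. A minimal sketch (metadata contents illustrative):

# Look up with bytes keys, decode once on the way out.
bookMetadata = {b'Title': b'Example Book'}
title = bookMetadata.get(b'Title', b'')
print(title.decode('utf-8'))  # Example Book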
@ -323,7 +323,7 @@ class TopazBook:
raw = 0
fixedimage=True
try:
keydata = self.getBookPayloadRecord('dkey', 0)
keydata = self.getBookPayloadRecord(b'dkey', 0)
except DrmException as e:
print("no dkey record found, book may not be encrypted")
print("attempting to extrct files without a book key")
@ -354,7 +354,7 @@ class TopazBook:
pass
else:
bookKey = bookKeys[0]
print("Book Key Found! ({0})".format(bookKey.encode('hex')))
print("Book Key Found! ({0})".format(bookKey.hex()))
break
if not bookKey:
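str.encode('hex') relied on a Python 2 codec that no longer exists; bytes.hex() is the Python 3 equivalent. A one-line sketch (key value illustrative):

# bytes.hex() renders a bytes key as a lowercase hex string.
bookKey = b'\x01\x02\xab'
print(bookKey.hex())  # 0102ab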
@ -396,26 +396,26 @@ class TopazBook:
outdir = self.outdir
for headerRecord in self.bookHeaderRecords:
name = headerRecord
if name != 'dkey':
if name != b'dkey':
ext = ".dat"
if name == 'img': ext = ".jpg"
if name == 'color' : ext = ".jpg"
print("Processing Section: {0}\n. . .".format(name), end=' ')
if name == b'img': ext = ".jpg"
if name == b'color' : ext = ".jpg"
print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ')
for index in range (0,len(self.bookHeaderRecords[name])) :
fname = "{0}{1:04d}{2}".format(name,index,ext)
fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext)
destdir = outdir
if name == 'img':
if name == b'img':
destdir = os.path.join(outdir,"img")
if name == 'color':
if name == b'color':
destdir = os.path.join(outdir,"color_img")
if name == 'page':
if name == b'page':
destdir = os.path.join(outdir,"page")
if name == 'glyphs':
if name == b'glyphs':
destdir = os.path.join(outdir,"glyphs")
outputFile = os.path.join(destdir,fname)
print(".", end=' ')
record = self.getBookPayloadRecord(name,index)
if record != '':
if record != b'':
open(outputFile, 'wb').write(record)
print(" ")
