diff --git a/DeDRM_plugin/convert2xml.py b/DeDRM_plugin/convert2xml.py
index 3249db5..abdaeb3 100644
--- a/DeDRM_plugin/convert2xml.py
+++ b/DeDRM_plugin/convert2xml.py
@@ -56,7 +56,7 @@ def readEncodedNumber(file):
c = file.read(1)
if (len(c) == 0):
return None
- data = ord(c)
+ data = c[0]
datax = (datax <<7) + (data & 0x7F)
data = datax
@@ -188,232 +188,232 @@ class PageParser(object):
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = {
- 'x' : (1, 'scalar_number', 0, 0),
- 'y' : (1, 'scalar_number', 0, 0),
- 'h' : (1, 'scalar_number', 0, 0),
- 'w' : (1, 'scalar_number', 0, 0),
- 'firstWord' : (1, 'scalar_number', 0, 0),
- 'lastWord' : (1, 'scalar_number', 0, 0),
- 'rootID' : (1, 'scalar_number', 0, 0),
- 'stemID' : (1, 'scalar_number', 0, 0),
- 'type' : (1, 'scalar_text', 0, 0),
+ b'x' : (1, 'scalar_number', 0, 0),
+ b'y' : (1, 'scalar_number', 0, 0),
+ b'h' : (1, 'scalar_number', 0, 0),
+ b'w' : (1, 'scalar_number', 0, 0),
+ b'firstWord' : (1, 'scalar_number', 0, 0),
+ b'lastWord' : (1, 'scalar_number', 0, 0),
+ b'rootID' : (1, 'scalar_number', 0, 0),
+ b'stemID' : (1, 'scalar_number', 0, 0),
+ b'type' : (1, 'scalar_text', 0, 0),
- 'info' : (0, 'number', 1, 0),
+ b'info' : (0, 'number', 1, 0),
- 'info.word' : (0, 'number', 1, 1),
- 'info.word.ocrText' : (1, 'text', 0, 0),
- 'info.word.firstGlyph' : (1, 'raw', 0, 0),
- 'info.word.lastGlyph' : (1, 'raw', 0, 0),
- 'info.word.bl' : (1, 'raw', 0, 0),
- 'info.word.link_id' : (1, 'number', 0, 0),
+ b'info.word' : (0, 'number', 1, 1),
+ b'info.word.ocrText' : (1, 'text', 0, 0),
+ b'info.word.firstGlyph' : (1, 'raw', 0, 0),
+ b'info.word.lastGlyph' : (1, 'raw', 0, 0),
+ b'info.word.bl' : (1, 'raw', 0, 0),
+ b'info.word.link_id' : (1, 'number', 0, 0),
- 'glyph' : (0, 'number', 1, 1),
- 'glyph.x' : (1, 'number', 0, 0),
- 'glyph.y' : (1, 'number', 0, 0),
- 'glyph.glyphID' : (1, 'number', 0, 0),
+ b'glyph' : (0, 'number', 1, 1),
+ b'glyph.x' : (1, 'number', 0, 0),
+ b'glyph.y' : (1, 'number', 0, 0),
+ b'glyph.glyphID' : (1, 'number', 0, 0),
- 'dehyphen' : (0, 'number', 1, 1),
- 'dehyphen.rootID' : (1, 'number', 0, 0),
- 'dehyphen.stemID' : (1, 'number', 0, 0),
- 'dehyphen.stemPage' : (1, 'number', 0, 0),
- 'dehyphen.sh' : (1, 'number', 0, 0),
+ b'dehyphen' : (0, 'number', 1, 1),
+ b'dehyphen.rootID' : (1, 'number', 0, 0),
+ b'dehyphen.stemID' : (1, 'number', 0, 0),
+ b'dehyphen.stemPage' : (1, 'number', 0, 0),
+ b'dehyphen.sh' : (1, 'number', 0, 0),
- 'links' : (0, 'number', 1, 1),
- 'links.page' : (1, 'number', 0, 0),
- 'links.rel' : (1, 'number', 0, 0),
- 'links.row' : (1, 'number', 0, 0),
- 'links.title' : (1, 'text', 0, 0),
- 'links.href' : (1, 'text', 0, 0),
- 'links.type' : (1, 'text', 0, 0),
- 'links.id' : (1, 'number', 0, 0),
+ b'links' : (0, 'number', 1, 1),
+ b'links.page' : (1, 'number', 0, 0),
+ b'links.rel' : (1, 'number', 0, 0),
+ b'links.row' : (1, 'number', 0, 0),
+ b'links.title' : (1, 'text', 0, 0),
+ b'links.href' : (1, 'text', 0, 0),
+ b'links.type' : (1, 'text', 0, 0),
+ b'links.id' : (1, 'number', 0, 0),
- 'paraCont' : (0, 'number', 1, 1),
- 'paraCont.rootID' : (1, 'number', 0, 0),
- 'paraCont.stemID' : (1, 'number', 0, 0),
- 'paraCont.stemPage' : (1, 'number', 0, 0),
+ b'paraCont' : (0, 'number', 1, 1),
+ b'paraCont.rootID' : (1, 'number', 0, 0),
+ b'paraCont.stemID' : (1, 'number', 0, 0),
+ b'paraCont.stemPage' : (1, 'number', 0, 0),
- 'paraStems' : (0, 'number', 1, 1),
- 'paraStems.stemID' : (1, 'number', 0, 0),
+ b'paraStems' : (0, 'number', 1, 1),
+ b'paraStems.stemID' : (1, 'number', 0, 0),
- 'wordStems' : (0, 'number', 1, 1),
- 'wordStems.stemID' : (1, 'number', 0, 0),
+ b'wordStems' : (0, 'number', 1, 1),
+ b'wordStems.stemID' : (1, 'number', 0, 0),
- 'empty' : (1, 'snippets', 1, 0),
+ b'empty' : (1, 'snippets', 1, 0),
- 'page' : (1, 'snippets', 1, 0),
- 'page.class' : (1, 'scalar_text', 0, 0),
- 'page.pageid' : (1, 'scalar_text', 0, 0),
- 'page.pagelabel' : (1, 'scalar_text', 0, 0),
- 'page.type' : (1, 'scalar_text', 0, 0),
- 'page.h' : (1, 'scalar_number', 0, 0),
- 'page.w' : (1, 'scalar_number', 0, 0),
- 'page.startID' : (1, 'scalar_number', 0, 0),
+ b'page' : (1, 'snippets', 1, 0),
+ b'page.class' : (1, 'scalar_text', 0, 0),
+ b'page.pageid' : (1, 'scalar_text', 0, 0),
+ b'page.pagelabel' : (1, 'scalar_text', 0, 0),
+ b'page.type' : (1, 'scalar_text', 0, 0),
+ b'page.h' : (1, 'scalar_number', 0, 0),
+ b'page.w' : (1, 'scalar_number', 0, 0),
+ b'page.startID' : (1, 'scalar_number', 0, 0),
- 'group' : (1, 'snippets', 1, 0),
- 'group.class' : (1, 'scalar_text', 0, 0),
- 'group.type' : (1, 'scalar_text', 0, 0),
- 'group._tag' : (1, 'scalar_text', 0, 0),
- 'group.orientation': (1, 'scalar_text', 0, 0),
+ b'group' : (1, 'snippets', 1, 0),
+ b'group.class' : (1, 'scalar_text', 0, 0),
+ b'group.type' : (1, 'scalar_text', 0, 0),
+ b'group._tag' : (1, 'scalar_text', 0, 0),
+ b'group.orientation': (1, 'scalar_text', 0, 0),
- 'region' : (1, 'snippets', 1, 0),
- 'region.class' : (1, 'scalar_text', 0, 0),
- 'region.type' : (1, 'scalar_text', 0, 0),
- 'region.x' : (1, 'scalar_number', 0, 0),
- 'region.y' : (1, 'scalar_number', 0, 0),
- 'region.h' : (1, 'scalar_number', 0, 0),
- 'region.w' : (1, 'scalar_number', 0, 0),
- 'region.orientation' : (1, 'scalar_text', 0, 0),
+ b'region' : (1, 'snippets', 1, 0),
+ b'region.class' : (1, 'scalar_text', 0, 0),
+ b'region.type' : (1, 'scalar_text', 0, 0),
+ b'region.x' : (1, 'scalar_number', 0, 0),
+ b'region.y' : (1, 'scalar_number', 0, 0),
+ b'region.h' : (1, 'scalar_number', 0, 0),
+ b'region.w' : (1, 'scalar_number', 0, 0),
+ b'region.orientation' : (1, 'scalar_text', 0, 0),
- 'empty_text_region' : (1, 'snippets', 1, 0),
+ b'empty_text_region' : (1, 'snippets', 1, 0),
- 'img' : (1, 'snippets', 1, 0),
- 'img.x' : (1, 'scalar_number', 0, 0),
- 'img.y' : (1, 'scalar_number', 0, 0),
- 'img.h' : (1, 'scalar_number', 0, 0),
- 'img.w' : (1, 'scalar_number', 0, 0),
- 'img.src' : (1, 'scalar_number', 0, 0),
- 'img.color_src' : (1, 'scalar_number', 0, 0),
- 'img.gridSize' : (1, 'scalar_number', 0, 0),
- 'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- 'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
- 'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- 'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'img.image_type' : (1, 'scalar_number', 0, 0),
+ b'img' : (1, 'snippets', 1, 0),
+ b'img.x' : (1, 'scalar_number', 0, 0),
+ b'img.y' : (1, 'scalar_number', 0, 0),
+ b'img.h' : (1, 'scalar_number', 0, 0),
+ b'img.w' : (1, 'scalar_number', 0, 0),
+ b'img.src' : (1, 'scalar_number', 0, 0),
+ b'img.color_src' : (1, 'scalar_number', 0, 0),
+ b'img.gridSize' : (1, 'scalar_number', 0, 0),
+ b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'img.image_type' : (1, 'scalar_number', 0, 0),
- 'paragraph' : (1, 'snippets', 1, 0),
- 'paragraph.class' : (1, 'scalar_text', 0, 0),
- 'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
- 'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
- 'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
- 'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
- 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
- 'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- 'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'paragraph' : (1, 'snippets', 1, 0),
+ b'paragraph.class' : (1, 'scalar_text', 0, 0),
+ b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
+ b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+ b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'word_semantic' : (1, 'snippets', 1, 1),
- 'word_semantic.type' : (1, 'scalar_text', 0, 0),
- 'word_semantic.class' : (1, 'scalar_text', 0, 0),
- 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
- 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
- 'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- 'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
- 'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- 'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'word_semantic' : (1, 'snippets', 1, 1),
+ b'word_semantic.type' : (1, 'scalar_text', 0, 0),
+ b'word_semantic.class' : (1, 'scalar_text', 0, 0),
+ b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
+ b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
+ b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'word' : (1, 'snippets', 1, 0),
- 'word.type' : (1, 'scalar_text', 0, 0),
- 'word.class' : (1, 'scalar_text', 0, 0),
- 'word.firstGlyph' : (1, 'scalar_number', 0, 0),
- 'word.lastGlyph' : (1, 'scalar_number', 0, 0),
+ b'word' : (1, 'snippets', 1, 0),
+ b'word.type' : (1, 'scalar_text', 0, 0),
+ b'word.class' : (1, 'scalar_text', 0, 0),
+ b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
+ b'word.lastGlyph' : (1, 'scalar_number', 0, 0),
- '_span' : (1, 'snippets', 1, 0),
- '_span.class' : (1, 'scalar_text', 0, 0),
- '_span.firstWord' : (1, 'scalar_number', 0, 0),
- '_span.lastWord' : (1, 'scalar_number', 0, 0),
- '_span.gridSize' : (1, 'scalar_number', 0, 0),
- '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
- '_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- '_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'_span' : (1, 'snippets', 1, 0),
+ b'_span.class' : (1, 'scalar_text', 0, 0),
+ b'_span.firstWord' : (1, 'scalar_number', 0, 0),
+ b'_span.lastWord' : (1, 'scalar_number', 0, 0),
+ b'_span.gridSize' : (1, 'scalar_number', 0, 0),
+ b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'span' : (1, 'snippets', 1, 0),
- 'span.firstWord' : (1, 'scalar_number', 0, 0),
- 'span.lastWord' : (1, 'scalar_number', 0, 0),
- 'span.gridSize' : (1, 'scalar_number', 0, 0),
- 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- 'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
- 'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- 'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'span' : (1, 'snippets', 1, 0),
+ b'span.firstWord' : (1, 'scalar_number', 0, 0),
+ b'span.lastWord' : (1, 'scalar_number', 0, 0),
+ b'span.gridSize' : (1, 'scalar_number', 0, 0),
+ b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'extratokens' : (1, 'snippets', 1, 0),
- 'extratokens.class' : (1, 'scalar_text', 0, 0),
- 'extratokens.type' : (1, 'scalar_text', 0, 0),
- 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
- 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
- 'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
- 'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
- 'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
- 'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
- 'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
+ b'extratokens' : (1, 'snippets', 1, 0),
+ b'extratokens.class' : (1, 'scalar_text', 0, 0),
+ b'extratokens.type' : (1, 'scalar_text', 0, 0),
+ b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
+ b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
+ b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
+ b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
+ b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+ b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
- 'glyph.h' : (1, 'number', 0, 0),
- 'glyph.w' : (1, 'number', 0, 0),
- 'glyph.use' : (1, 'number', 0, 0),
- 'glyph.vtx' : (1, 'number', 0, 1),
- 'glyph.len' : (1, 'number', 0, 1),
- 'glyph.dpi' : (1, 'number', 0, 0),
- 'vtx' : (0, 'number', 1, 1),
- 'vtx.x' : (1, 'number', 0, 0),
- 'vtx.y' : (1, 'number', 0, 0),
- 'len' : (0, 'number', 1, 1),
- 'len.n' : (1, 'number', 0, 0),
+ b'glyph.h' : (1, 'number', 0, 0),
+ b'glyph.w' : (1, 'number', 0, 0),
+ b'glyph.use' : (1, 'number', 0, 0),
+ b'glyph.vtx' : (1, 'number', 0, 1),
+ b'glyph.len' : (1, 'number', 0, 1),
+ b'glyph.dpi' : (1, 'number', 0, 0),
+ b'vtx' : (0, 'number', 1, 1),
+ b'vtx.x' : (1, 'number', 0, 0),
+ b'vtx.y' : (1, 'number', 0, 0),
+ b'len' : (0, 'number', 1, 1),
+ b'len.n' : (1, 'number', 0, 0),
- 'book' : (1, 'snippets', 1, 0),
- 'version' : (1, 'snippets', 1, 0),
- 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'version.Schema_id' : (1, 'scalar_text', 0, 0),
- 'version.Schema_version' : (1, 'scalar_text', 0, 0),
- 'version.Topaz_version' : (1, 'scalar_text', 0, 0),
- 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'version.chapterheaders' : (1, 'scalar_text', 0, 0),
- 'version.creation_date' : (1, 'scalar_text', 0, 0),
- 'version.header_footer' : (1, 'scalar_text', 0, 0),
- 'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
- 'version.letter_insertion' : (1, 'scalar_text', 0, 0),
- 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
- 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
- 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
- 'version.findlists' : (1, 'scalar_text', 0, 0),
- 'version.page_num' : (1, 'scalar_text', 0, 0),
- 'version.page_type' : (1, 'scalar_text', 0, 0),
- 'version.bad_text' : (1, 'scalar_text', 0, 0),
- 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
- 'version.margins' : (1, 'scalar_text', 0, 0),
- 'version.staggered_lines' : (1, 'scalar_text', 0, 0),
- 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
- 'version.toc' : (1, 'scalar_text', 0, 0),
+ b'book' : (1, 'snippets', 1, 0),
+ b'version' : (1, 'snippets', 1, 0),
+ b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
+ b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
+ b'version.Schema_id' : (1, 'scalar_text', 0, 0),
+ b'version.Schema_version' : (1, 'scalar_text', 0, 0),
+ b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
+ b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
+ b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+ b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
+ b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
+ b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
+ b'version.creation_date' : (1, 'scalar_text', 0, 0),
+ b'version.header_footer' : (1, 'scalar_text', 0, 0),
+ b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
+ b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
+ b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
+ b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
+ b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
+ b'version.findlists' : (1, 'scalar_text', 0, 0),
+ b'version.page_num' : (1, 'scalar_text', 0, 0),
+ b'version.page_type' : (1, 'scalar_text', 0, 0),
+ b'version.bad_text' : (1, 'scalar_text', 0, 0),
+ b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
+ b'version.margins' : (1, 'scalar_text', 0, 0),
+ b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
+ b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
+ b'version.toc' : (1, 'scalar_text', 0, 0),
- 'stylesheet' : (1, 'snippets', 1, 0),
- 'style' : (1, 'snippets', 1, 0),
- 'style._tag' : (1, 'scalar_text', 0, 0),
- 'style.type' : (1, 'scalar_text', 0, 0),
- 'style._after_type' : (1, 'scalar_text', 0, 0),
- 'style._parent_type' : (1, 'scalar_text', 0, 0),
- 'style._after_parent_type' : (1, 'scalar_text', 0, 0),
- 'style.class' : (1, 'scalar_text', 0, 0),
- 'style._after_class' : (1, 'scalar_text', 0, 0),
- 'rule' : (1, 'snippets', 1, 0),
- 'rule.attr' : (1, 'scalar_text', 0, 0),
- 'rule.value' : (1, 'scalar_text', 0, 0),
+ b'stylesheet' : (1, 'snippets', 1, 0),
+ b'style' : (1, 'snippets', 1, 0),
+ b'style._tag' : (1, 'scalar_text', 0, 0),
+ b'style.type' : (1, 'scalar_text', 0, 0),
+ b'style._after_type' : (1, 'scalar_text', 0, 0),
+ b'style._parent_type' : (1, 'scalar_text', 0, 0),
+ b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
+ b'style.class' : (1, 'scalar_text', 0, 0),
+ b'style._after_class' : (1, 'scalar_text', 0, 0),
+ b'rule' : (1, 'snippets', 1, 0),
+ b'rule.attr' : (1, 'scalar_text', 0, 0),
+ b'rule.value' : (1, 'scalar_text', 0, 0),
- 'original' : (0, 'number', 1, 1),
- 'original.pnum' : (1, 'number', 0, 0),
- 'original.pid' : (1, 'text', 0, 0),
- 'pages' : (0, 'number', 1, 1),
- 'pages.ref' : (1, 'number', 0, 0),
- 'pages.id' : (1, 'number', 0, 0),
- 'startID' : (0, 'number', 1, 1),
- 'startID.page' : (1, 'number', 0, 0),
- 'startID.id' : (1, 'number', 0, 0),
+ b'original' : (0, 'number', 1, 1),
+ b'original.pnum' : (1, 'number', 0, 0),
+ b'original.pid' : (1, 'text', 0, 0),
+ b'pages' : (0, 'number', 1, 1),
+ b'pages.ref' : (1, 'number', 0, 0),
+ b'pages.id' : (1, 'number', 0, 0),
+ b'startID' : (0, 'number', 1, 1),
+ b'startID.page' : (1, 'number', 0, 0),
+ b'startID.id' : (1, 'number', 0, 0),
- 'median_d' : (1, 'number', 0, 0),
- 'median_h' : (1, 'number', 0, 0),
- 'median_firsty' : (1, 'number', 0, 0),
- 'median_lasty' : (1, 'number', 0, 0),
+ b'median_d' : (1, 'number', 0, 0),
+ b'median_h' : (1, 'number', 0, 0),
+ b'median_firsty' : (1, 'number', 0, 0),
+ b'median_lasty' : (1, 'number', 0, 0),
- 'num_footers_maybe' : (1, 'number', 0, 0),
- 'num_footers_yes' : (1, 'number', 0, 0),
- 'num_headers_maybe' : (1, 'number', 0, 0),
- 'num_headers_yes' : (1, 'number', 0, 0),
+ b'num_footers_maybe' : (1, 'number', 0, 0),
+ b'num_footers_yes' : (1, 'number', 0, 0),
+ b'num_headers_maybe' : (1, 'number', 0, 0),
+ b'num_headers_yes' : (1, 'number', 0, 0),
- 'tracking' : (1, 'number', 0, 0),
- 'src' : (1, 'text', 0, 0),
+ b'tracking' : (1, 'number', 0, 0),
+ b'src' : (1, 'text', 0, 0),
}
@@ -430,7 +430,7 @@ class PageParser(object):
cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i]
for j in range(i+1, cnt) :
- result += '.' + self.tagpath[j]
+ result += b'.' + self.tagpath[j]
return result
@@ -505,7 +505,7 @@ class PageParser(object):
if (subtags == 1):
ntags = readEncodedNumber(self.fo)
- if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
+ if self.debug : print('subtags: ', token , ' has ' , str(ntags))
for j in range(ntags):
val = readEncodedNumber(self.fo)
subtagres.append(self.procToken(self.dict.lookup(val)))
@@ -613,7 +613,7 @@ class PageParser(object):
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
- nname = prefix + '.' + name
+ nname = prefix + b'.' + name
nsubtaglist = []
for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix))
@@ -662,34 +662,34 @@ class PageParser(object):
subtagList = node[1]
argtype = node[2]
argList = node[3]
- fullpathname = name.split('.')
+ fullpathname = name.split(b'.')
nodename = fullpathname.pop()
ilvl = len(fullpathname)
- indent = ' ' * (3 * ilvl)
+ indent = b' ' * (3 * ilvl)
rlst = []
- rlst.append(indent + '<' + nodename + '>')
+ rlst.append(indent + b'<' + nodename + b'>')
if len(argList) > 0:
alst = []
for j in argList:
- if (argtype == 'text') or (argtype == 'scalar_text') :
- alst.append(j + '|')
+ if (argtype == b'text') or (argtype == b'scalar_text') :
+ alst.append(j + b'|')
else :
- alst.append(str(j) + ',')
- argres = "".join(alst)
+ alst.append(str(j).encode('utf-8') + b',')
+ argres = b"".join(alst)
argres = argres[0:-1]
- if argtype == 'snippets' :
- rlst.append('snippets:' + argres)
+ if argtype == b'snippets' :
+ rlst.append(b'snippets:' + argres)
else :
rlst.append(argres)
if len(subtagList) > 0 :
- rlst.append('\n')
+ rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.formatTag(j))
- rlst.append(indent + '' + nodename + '>\n')
+ rlst.append(indent + b'' + nodename + b'>\n')
else:
- rlst.append('' + nodename + '>\n')
- return "".join(rlst)
+ rlst.append(b'' + nodename + b'>\n')
+ return b"".join(rlst)
# flatten tag
@@ -704,20 +704,20 @@ class PageParser(object):
alst = []
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
- alst.append(j + '|')
+ alst.append(j + b'|')
else :
- alst.append(str(j) + '|')
- argres = "".join(alst)
+ alst.append(str(j).encode('utf-8') + b'|')
+ argres = b"".join(alst)
argres = argres[0:-1]
- if argtype == 'snippets' :
- rlst.append('.snippets=' + argres)
+ if argtype == b'snippets' :
+ rlst.append(b'.snippets=' + argres)
else :
- rlst.append('=' + argres)
- rlst.append('\n')
+ rlst.append(b'=' + argres)
+ rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.flattenTag(j))
- return "".join(rlst)
+ return b"".join(rlst)
# reduce create xml output
@@ -729,7 +729,7 @@ class PageParser(object):
rlst.append(self.flattenTag(j))
else:
rlst.append(self.formatTag(j))
- result = "".join(rlst)
+ result = b"".join(rlst)
if self.debug : print(result)
return result
@@ -747,16 +747,16 @@ class PageParser(object):
# peek at the first bytes to see what type of file it is
magic = self.fo.read(9)
- if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
- first_token = 'info'
- elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
+ if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
+ first_token = b'info'
+ elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
skip = self.fo.read(2)
- first_token = 'info'
- elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
- first_token = 'info'
- elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
+ first_token = b'info'
+ elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
+ first_token = b'info'
+ elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
skip = self.fo.read(3)
- first_token = 'info'
+ first_token = b'info'
else :
# other0.dat file
first_token = None
@@ -778,7 +778,7 @@ class PageParser(object):
break
if (v == 0x72):
- self.doLoop72('number')
+ self.doLoop72(b'number')
elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 :
@@ -789,7 +789,7 @@ class PageParser(object):
if (v == 0):
if (self.peek(1) == 0x5f):
skip = self.fo.read(1)
- first_token = 'info'
+ first_token = b'info'
# now do snippet injection
if len(self.snippetList) > 0 :
@@ -809,14 +809,14 @@ class PageParser(object):
def fromData(dict, fname):
flat_xml = True
- debug = False
+    debug = False
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
def getXML(dict, fname):
flat_xml = False
- debug = False
+    debug = False
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
@@ -845,7 +845,7 @@ def main(argv):
sys.stderr=SafeUnbuffered(sys.stderr)
dictFile = ""
pageFile = ""
- debug = False
+    debug = False
flat_xml = False
printOutput = False
if len(argv) == 0:
diff --git a/DeDRM_plugin/flatxml2html.py b/DeDRM_plugin/flatxml2html.py
index 6f839ce..f1ca81d 100644
--- a/DeDRM_plugin/flatxml2html.py
+++ b/DeDRM_plugin/flatxml2html.py
@@ -7,6 +7,7 @@ import csv
import os
import math
import getopt
+import functools
from struct import pack
from struct import unpack
@@ -15,14 +16,14 @@ class DocParser(object):
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0
- self.docList = flatxml.split('\n')
+ self.docList = flatxml.split(b'\n')
self.docSize = len(self.docList)
self.classList = {}
self.bookDir = bookDir
self.gdict = gdict
tmpList = classlst.split('\n')
for pclass in tmpList:
- if pclass != '':
+ if pclass != b'':
# remove the leading period from the css name
cname = pclass[1:]
self.classList[cname] = True
@@ -57,9 +58,9 @@ class DocParser(object):
imgfile = os.path.join(imgDir,imgname)
# get glyph information
- gxList = self.getData('info.glyph.x',0,-1)
- gyList = self.getData('info.glyph.y',0,-1)
- gidList = self.getData('info.glyph.glyphID',0,-1)
+ gxList = self.getData(b'info.glyph.x',0,-1)
+ gyList = self.getData(b'info.glyph.y',0,-1)
+ gidList = self.getData(b'info.glyph.glyphID',0,-1)
gids = []
maxws = []
@@ -122,11 +123,11 @@ class DocParser(object):
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
item = self.docList[pos]
- if item.find('=') >= 0:
- (name, argres) = item.split('=',1)
+ if item.find(b'=') >= 0:
+ (name, argres) = item.split(b'=',1)
else :
name = item
- argres = ''
+ argres = b''
return name, argres
@@ -140,11 +141,13 @@ class DocParser(object):
foundat = -1
for j in range(pos, end):
item = self.docList[j]
- if item.find('=') >= 0:
- (name, argres) = item.split('=',1)
+ if item.find(b'=') >= 0:
+ (name, argres) = item.split(b'=',1)
else :
name = item
argres = ''
+ if (isinstance(tagpath,str)):
+ tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) :
result = argres
foundat = j
@@ -170,7 +173,7 @@ class DocParser(object):
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
- argList = argt.split('|')
+ argList = argt.split(b'|')
argres = [ int(strval) for strval in argList]
return argres
@@ -191,21 +194,21 @@ class DocParser(object):
# also some class names have spaces in them so need to convert to dashes
if nclass != None :
- nclass = nclass.replace(' ','-')
- classres = ''
+ nclass = nclass.replace(b' ',b'-')
+ classres = b''
nclass = nclass.lower()
- nclass = 'cl-' + nclass
- baseclass = ''
+ nclass = b'cl-' + nclass
+ baseclass = b''
# graphic is the base class for captions
- if nclass.find('cl-cap-') >=0 :
- classres = 'graphic' + ' '
+ if nclass.find(b'cl-cap-') >=0 :
+ classres = b'graphic' + b' '
else :
# strip to find baseclass
- p = nclass.find('_')
+ p = nclass.find(b'_')
if p > 0 :
baseclass = nclass[0:p]
if baseclass in self.classList:
- classres += baseclass + ' '
+ classres += baseclass + b' '
classres += nclass
nclass = classres
return nclass
@@ -225,11 +228,11 @@ class DocParser(object):
return -1
result = []
- (pos, pagetype) = self.findinDoc('page.type',0,-1)
+ (pos, pagetype) = self.findinDoc(b'page.type',0,-1)
- groupList = self.posinDoc('page.group')
- groupregionList = self.posinDoc('page.group.region')
- pageregionList = self.posinDoc('page.region')
+ groupList = self.posinDoc(b'page.group')
+ groupregionList = self.posinDoc(b'page.group.region')
+ pageregionList = self.posinDoc(b'page.region')
# integrate into one list
for j in groupList:
result.append(('grpbeg',j))
@@ -237,7 +240,7 @@ class DocParser(object):
result.append(('gregion',j))
for j in pageregionList:
result.append(('pregion',j))
- result.sort(compare)
+ result.sort(key=functools.cmp_to_key(compare))
# insert group end and page end indicators
inGroup = False
@@ -267,33 +270,33 @@ class DocParser(object):
result = []
# paragraph
- (pos, pclass) = self.findinDoc('paragraph.class',start,end)
+ (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)
pclass = self.getClass(pclass)
# if paragraph uses extratokens (extra glyphs) then make it fixed
- (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+ (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
- (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
- (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+ (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
+ (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
- makeImage = (regtype == 'vertical') or (regtype == 'table')
+ makeImage = (regtype == b'vertical') or (regtype == b'table')
makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
- makeImage = makeImage or (regtype == 'fixed')
+ makeImage = makeImage or (regtype == b'fixed')
if (pclass != None):
- makeImage = makeImage or (pclass.find('.inverted') >= 0)
+ makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
if self.fixedimage :
- makeImage = makeImage or (pclass.find('cl-f-') >= 0)
+ makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)
# before creating an image make sure glyph info exists
- gidList = self.getData('info.glyph.glyphID',0,-1)
+ gidList = self.getData(b'info.glyph.glyphID',0,-1)
makeImage = makeImage & (len(gidList) > 0)
@@ -307,8 +310,8 @@ class DocParser(object):
# translate first and last word into first and last glyphs
# and generate inline image and include it
glyphList = []
- firstglyphList = self.getData('word.firstGlyph',0,-1)
- gidList = self.getData('info.glyph.glyphID',0,-1)
+ firstglyphList = self.getData(b'word.firstGlyph',0,-1)
+ gidList = self.getData(b'info.glyph.glyphID',0,-1)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
@@ -326,8 +329,8 @@ class DocParser(object):
for glyphnum in range(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
# include any extratokens if they exist
- (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
- (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
+ (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
+ (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
if (sfg != None) and (slg != None):
for glyphnum in range(int(sfg), int(slg)):
glyphList.append(glyphnum)
@@ -368,39 +371,39 @@ class DocParser(object):
(name, argres) = self.lineinDoc(line)
- if name.endswith('span.firstWord') :
+ if name.endswith(b'span.firstWord') :
sp_first = int(argres)
- elif name.endswith('span.lastWord') :
+ elif name.endswith(b'span.lastWord') :
sp_last = int(argres)
- elif name.endswith('word.firstGlyph') :
+ elif name.endswith(b'word.firstGlyph') :
gl_first = int(argres)
- elif name.endswith('word.lastGlyph') :
+ elif name.endswith(b'word.lastGlyph') :
gl_last = int(argres)
- elif name.endswith('word_semantic.firstWord'):
+ elif name.endswith(b'word_semantic.firstWord'):
ws_first = int(argres)
- elif name.endswith('word_semantic.lastWord'):
+ elif name.endswith(b'word_semantic.lastWord'):
ws_last = int(argres)
- elif name.endswith('word.class'):
+ elif name.endswith(b'word.class'):
# we only handle spaceafter word class
try:
- (cname, space) = argres.split('-',1)
- if space == '' : space = '0'
- if (cname == 'spaceafter') and (int(space) > 0) :
+ (cname, space) = argres.split(b'-',1)
+ if space == b'' : space = b'0'
+ if (cname == b'spaceafter') and (int(space) > 0) :
word_class = 'sa'
except:
pass
- elif name.endswith('word.img.src'):
+ elif name.endswith(b'word.img.src'):
result.append(('img' + word_class, int(argres)))
word_class = ''
- elif name.endswith('region.img.src'):
+ elif name.endswith(b'region.img.src'):
result.append(('img' + word_class, int(argres)))
if (sp_first != -1) and (sp_last != -1):
@@ -437,7 +440,7 @@ class DocParser(object):
classres = ''
if pclass :
- classres = ' class="' + pclass + '"'
+ classres = ' class="' + pclass.decode('utf-8') + '"'
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
@@ -470,8 +473,8 @@ class DocParser(object):
if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1]
- if (title == "") or (parares.rfind(title) < 0):
- title=parares[lstart:]
+ if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
+ title=parares[lstart:].encode('utf-8')
if linktype == 'external' :
linkhref = self.link_href[link-1]
linkhtml = '' % linkhref
@@ -482,33 +485,34 @@ class DocParser(object):
else :
# just link to the current page
linkhtml = ''
- linkhtml += title + ''
- pos = parares.rfind(title)
+ linkhtml += title.decode('utf-8')
+ linkhtml += ''
+ pos = parares.rfind(title.decode('utf-8'))
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
lstart = len(parares)
- if word == '_link_' : word = ''
+ if word == b'_link_' : word = b''
elif (link < 0) :
- if word == '_link_' : word = ''
+ if word == b'_link_' : word = b''
- if word == '_lb_':
+ if word == b'_lb_':
if ((num-1) in self.dehyphen_rootid ) or handle_links:
- word = ''
+ word = b''
sep = ''
elif br_lb :
- word = '
\n'
+ word = b'
\n'
sep = ''
else :
- word = '\n'
+ word = b'\n'
sep = ''
if num in self.dehyphen_rootid :
word = word[0:-1]
sep = ''
- parares += word + sep
+ parares += word.decode('utf-8') + sep
elif wtype == 'img' :
sep = ''
@@ -522,7 +526,9 @@ class DocParser(object):
elif wtype == 'svg' :
sep = ''
- parares += '' % num
+ parares += '' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
@@ -545,7 +551,7 @@ class DocParser(object):
(wtype, num) = pdesc[j]
if wtype == 'ocr' :
- word = self.ocrtext[num]
+ word = self.ocrtext[num].decode('utf-8')
sep = ' '
if handle_links:
@@ -553,7 +559,7 @@ class DocParser(object):
if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1]
- title = title.rstrip('. ')
+ title = title.rstrip(b'. ')
alt_title = parares[lstart:]
alt_title = alt_title.strip()
# now strip off the actual printed page number
@@ -607,38 +613,38 @@ class DocParser(object):
hlst = []
# get the ocr text
- (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
- if argres : self.ocrtext = argres.split('|')
+ (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
+ if argres : self.ocrtext = argres.split(b'|')
# get information to dehyphenate the text
- self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
+ self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
- (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+ (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
first_para_continued = (self.parastems_stemid != None)
# determine if last paragraph is continued onto the next page
- (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+ (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
last_para_continued = (self.paracont_stemid != None)
# collect link ids
- self.link_id = self.getData('info.word.link_id',0,-1)
+ self.link_id = self.getData(b'info.word.link_id',0,-1)
# collect link destination page numbers
- self.link_page = self.getData('info.links.page',0,-1)
+ self.link_page = self.getData(b'info.links.page',0,-1)
# collect link types (container versus external)
- (pos, argres) = self.findinDoc('info.links.type',0,-1)
- if argres : self.link_type = argres.split('|')
+ (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
+ if argres : self.link_type = argres.split(b'|')
# collect link destinations
- (pos, argres) = self.findinDoc('info.links.href',0,-1)
- if argres : self.link_href = argres.split('|')
+ (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
+ if argres : self.link_href = argres.split(b'|')
# collect link titles
- (pos, argres) = self.findinDoc('info.links.title',0,-1)
+ (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
if argres :
- self.link_title = argres.split('|')
+ self.link_title = argres.split(b'|')
else:
self.link_title.append('')
@@ -662,51 +668,51 @@ class DocParser(object):
# set anchor for link target on this page
if not anchorSet and not first_para_continued:
hlst.append('