From 8e7d2657a41054213f3a761efb314b838f915e03 Mon Sep 17 00:00:00 2001 From: Apprentice Alf Date: Tue, 2 Mar 2010 12:46:56 +0000 Subject: [PATCH] tools v1.5 --- Topaz_Tools/lib/cmbtc_dump.py | 2 +- Topaz_Tools/lib/cmbtc_dump_nonK4PC.py | 2 +- Topaz_Tools/lib/convert2xml.py | 21 +++++++--- Topaz_Tools/lib/decode_meta.py | 2 +- Topaz_Tools/lib/flatxml2html.py | 56 +++++++++++++++++++++------ Topaz_Tools/lib/genhtml.py | 2 +- Topaz_Tools/lib/gensvg.py | 2 +- Topaz_Tools/lib/genxml.py | 2 +- Topaz_Tools/lib/getpagedim.py | 2 +- Topaz_Tools/lib/stylexml2css.py | 12 ++++-- Topaz_Tools/lib/topaz-changes.txt | 12 +++++- eReader_Tools/lib/erdr2pml.py | 26 ++++++------- 12 files changed, 98 insertions(+), 43 deletions(-) diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py index 83301dd..d7cef99 100644 --- a/Topaz_Tools/lib/cmbtc_dump.py +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# For use in Topaz Scripts version 2.3 +# For use in Topaz Scripts version 2.6 """ diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py index 1508741..0d62404 100644 --- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py +++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py @@ -1,5 +1,5 @@ #!/usr/bin/python -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 18ae3f0..e3f0fe2 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.4 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): @@ -315,6 +315,12 @@ class PageParser(object): 'version.findlists' : (1, 'scalar_text', 0, 0), 'version.page_num' : (1, 'scalar_text', 0, 0), 'version.page_type' : (1, 'scalar_text', 0, 0), + 'version.bad_text' : (1, 'scalar_text', 0, 0), + 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), + 'version.margins' : (1, 'scalar_text', 0, 0), + 'version.staggered_lines' : (1, 'scalar_text', 0, 0), + 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), + 'version.toc' : (1, 'scalar_text', 0, 0), 'stylesheet' : (1, 'snippets', 1, 0), 'style' : (1, 'snippets', 1, 0), @@ -662,16 +668,19 @@ class PageParser(object): def process(self): # peek at the first bytes to see what type of file it is - magic = self.fo.read(11) - if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'): + magic = self.fo.read(9) + if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): first_token = 'info' - elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'): - skip = self.fo.read(1) + elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): + skip = self.fo.read(2) + first_token = 'info' + elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): + skip = self.fo.read(3) first_token = 'info' else : # other0.dat file first_token = None - self.fo.seek(-11,1) + self.fo.seek(-9,1) # main loop to read and build the document tree diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py index 038f133..a63c578 100644 --- a/Topaz_Tools/lib/decode_meta.py +++ b/Topaz_Tools/lib/decode_meta.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import csv import sys diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index 0fb106d..1c4419f 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import sys import csv @@ -32,6 +32,8 @@ class DocParser(object): self.link_id = [] self.link_title = [] self.link_page = [] + self.link_href = [] + self.link_type = [] self.dehyphen_rootid = [] self.paracont_stemid = [] self.parastems_stemid = [] @@ -197,6 +199,7 @@ class DocParser(object): # get the class def getClass(self, pclass): nclass = pclass + # class names are an issue given topaz may start them with numerals (not allowed), # use a mix of cases (which cause some browsers problems), and actually # attach numbers after "_reclustered*" to the end to deal classeses that inherit @@ -206,7 +209,10 @@ class DocParser(object): # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # that exists in the stylesheet first, and then adding this specific class # after + + # also some class names have spaces in them so need to convert to dashes if nclass != None : + nclass = nclass.replace(' ','-') classres = '' nclass = nclass.lower() nclass = 'cl-' + nclass @@ -334,7 +340,7 @@ class DocParser(object): result.append(('svg', num)) return pclass, result - # this type of paragrph may be made up of multiple spans, inline + # this type of paragraph may be made up of multiple spans, inline # word monograms (images), and words with semantic meaning, # plus glyphs used to form starting letter of first word @@ -391,6 +397,9 @@ class DocParser(object): result.append(('img' + word_class, int(argres))) word_class = '' + elif name.endswith('region.img.src'): + result.append(('img' + word_class, int(argres))) + if (sp_first != -1) and (sp_last != -1): for wordnum in xrange(sp_first, sp_last): result.append(('ocr', wordnum)) @@ -437,6 +446,8 @@ class DocParser(object): if (type == 'end'): parares += ' ' + lstart = len(parares) + cnt = len(pdesc) for j in xrange( 0, cnt) : @@ -449,18 +460,24 @@ class DocParser(object): if handle_links: link = self.link_id[num] - if (link > 0): + if (link > 0): + linktype = self.link_type[link-1] title = self.link_title[link-1] - if (title == "") or (parares.rfind(title) < 0): - title='_link_' - ptarget = self.link_page[link-1] - 1 - linkhtml = '' % ptarget + if (title == "") or (parares.rfind(title) < 0): + title=parares[lstart:] + if linktype == 'external' : + linkhref = self.link_href[link-1] + linkhtml = '' % linkhref + else : + ptarget = self.link_page[link-1] - 1 + linkhtml = '' % ptarget linkhtml += title + '' pos = parares.rfind(title) if pos >= 0: parares = parares[0:pos] + linkhtml + parares[pos+len(title):] else : parares += linkhtml + lstart = len(parares) if word == '_link_' : word = '' elif (link < 0) : if word == '_link_' : word = '' @@ -532,6 +549,14 @@ class DocParser(object): # collect link destination page numbers self.link_page = self.getData('info.links.page',0,-1) + # collect link types (container versus external) + (pos, argres) = self.findinDoc('info.links.type',0,-1) + if argres : self.link_type = argres.split('|') + + # collect link destinations + (pos, argres) = self.findinDoc('info.links.href',0,-1) + if argres : self.link_href = argres.split('|') + # collect link titles (pos, argres) = self.findinDoc('info.links.title',0,-1) if argres : @@ -641,16 +666,18 @@ class DocParser(object): htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): + elif (regtype == 'synth_fcvr.center'): (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '
' % int(simgsrc) else : - print 'Warning: region type', regtype + print ' Making region type', regtype, (pos, temp) = self.findinDoc('paragraph',start,end) - if pos != -1: - print ' is a "text" region' + (pos2, temp) = self.findinDoc('span',start,end) + if pos != -1 or pos2 != -1: + print ' a "text" region' + orig_regtype = regtype regtype = 'fixed' ptype = 'full' # check to see if this is a continution from the previous page @@ -658,6 +685,11 @@ class DocParser(object): ptype = 'end' first_para_continued = False (pclass, pdesc) = self.getParaDescription(start,end, regtype) + if not pclass: + if orig_regtype.endswith('.right') : pclass = 'cl-right' + elif orig_regtype.endswith('.center') : pclass = 'cl-center' + elif orig_regtype.endswith('.left') : pclass = 'cl-left' + elif orig_regtype.endswith('.justify') : pclass = 'cl-justify' if pclass and (ptype == 'full') and (len(pclass) >= 6): tag = 'p' if pclass[3:6] == 'h1-' : tag = 'h4' @@ -669,7 +701,7 @@ class DocParser(object): else : htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) else : - print ' is a "graphic" region' + print ' a "graphic" region' (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '
' % int(simgsrc) diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py index 82b2c72..b3cf940 100644 --- a/Topaz_Tools/lib/genhtml.py +++ b/Topaz_Tools/lib/genhtml.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py index 040fe9b..70f82b4 100644 --- a/Topaz_Tools/lib/gensvg.py +++ b/Topaz_Tools/lib/gensvg.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py index a30c630..be542f0 100644 --- a/Topaz_Tools/lib/genxml.py +++ b/Topaz_Tools/lib/genxml.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py index af2a6f6..455a38e 100644 --- a/Topaz_Tools/lib/getpagedim.py +++ b/Topaz_Tools/lib/getpagedim.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import csv import sys diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py index 0f84d69..73f798f 100644 --- a/Topaz_Tools/lib/stylexml2css.py +++ b/Topaz_Tools/lib/stylexml2css.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import csv import sys @@ -85,7 +85,10 @@ class DocParser(object): def process(self): classlst = '' - csspage = '' + csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n' + csspage += '.cl-right { text-align: right; }\n' + csspage += '.cl-left { text-align: left; }\n' + csspage += '.cl-justify { text-align: justify; }\n' # generate a list of each